diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-05 08:20:40 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-05 08:20:40 -0400 |
commit | 7aacb78f566a97c90b8f460bf33e075a022060c1 (patch) | |
tree | 09ccb0001e6e6a9716379d287e4193265472228e | |
parent | 52d5cc23a30b76f43522b2d37229d4449f099d73 (diff) |
Add Wiley recipe and get info from DOI where possible
-rwxr-xr-x | article-epub.py | 3 |
-rw-r--r-- | article_epub/publisher.py | 62 |
-rw-r--r-- | article_epub/publishers/__init__.py | 1 |
-rw-r--r-- | article_epub/publishers/sciencedirect.py | 70 |
-rw-r--r-- | article_epub/publishers/springer.py | 63 |
-rw-r--r-- | article_epub/publishers/wiley.py | 47 |
6 files changed, 174 insertions, 72 deletions
diff --git a/article-epub.py b/article-epub.py index 7b301e9..2a5d339 100755 --- a/article-epub.py +++ b/article-epub.py @@ -11,7 +11,8 @@ def main(): url = sys.argv[1] doi = None - domain = url.split("//")[-1].split("/")[0].split('?')[0] + domain = ".".join(url.split("//")[-1].split("/")[0] \ + .split('?')[0].split('.')[-2:]) art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi) diff --git a/article_epub/publisher.py b/article_epub/publisher.py index f8abfc8..0ea9259 100644 --- a/article_epub/publisher.py +++ b/article_epub/publisher.py @@ -6,6 +6,8 @@ import sys import pypandoc from time import sleep import subprocess +import requests +import json _publishers = list() _publisher_domains = dict() @@ -16,8 +18,12 @@ class Publisher(object): self.url = url self.doi = doi + def get_final_url(self): + pass + def soupify(self): """Get HTML from article's page""" + self.get_final_url() os.environ['MOZ_HEADLESS'] = '1' binary = FirefoxBinary('/usr/bin/firefox') try: @@ -33,12 +39,44 @@ class Publisher(object): if self.doi != None: print('Waiting for redirects..') sleep(5) #To allow redirects - + + sleep(5) self.url = driver.current_url self.soup = BeautifulSoup(driver.page_source,'html.parser') driver.quit() + def doi2json(self): + """ + Get a dictionary of metadata for a given DOI. 
+ """ + url = "http://dx.doi.org/" + self.doi + headers = {"accept": "application/json"} + r = requests.get(url, headers = headers) + self.meta = r.json() + + def get_metadata(self): + self.doi2json() + + self.title = self.meta['title'] + + self.author_surnames = [] + self.author_givennames = [] + for i in self.meta['author']: + self.author_surnames.append(i['family']) + self.author_givennames.append(i['given']) + + self.journal = self.meta['container-title'] + + if 'published-print' in self.meta.keys(): + self.volume = str(self.meta['volume']) + self.pages = str(self.meta['page']) + self.year = str(self.meta['published-print']['date-parts'][0][0]) + else: + self.volume = '' + self.pages = '' + self.year = str(self.meta['published-online']['date-parts'][0][0]) + def get_citation(self): all_authors = '' @@ -52,19 +90,23 @@ class Publisher(object): else: cap = '. ' - self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \ - +self.journal+' '+self.volume+': '+self.pages+'.' \ - +' doi: '+self.doi + if self.volume != '': + self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \ + +self.journal+' '+self.volume+': '+self.pages+'.' \ + +' doi: '+self.doi + else: + self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \ + +self.journal+'. 
'+' doi: '+self.doi def extract_data(self): - self.get_title() - self.get_authors() + #self.get_title() + #self.get_authors() + self.get_doi() + self.get_metadata() self.get_abstract() self.get_keywords() - self.get_metadata() self.get_body() self.get_references() - self.get_citation() def epubify(self): """Convert data into epub format""" @@ -75,7 +117,9 @@ class Publisher(object): all_authors += self.author_surnames[i] if(i != (len(self.author_surnames) - 1)): all_authors += ', ' - + + self.get_citation() + args = [] args.append('-M') args.append('title="'+self.title+'"') diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py index 939ee39..730fab3 100644 --- a/article_epub/publishers/__init__.py +++ b/article_epub/publishers/__init__.py @@ -1,2 +1,3 @@ from article_epub.publishers.sciencedirect import ScienceDirect from article_epub.publishers.springer import Springer +from article_epub.publishers.wiley import Wiley diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py index 26ca971..34fecfd 100644 --- a/article_epub/publishers/sciencedirect.py +++ b/article_epub/publishers/sciencedirect.py @@ -3,26 +3,30 @@ from article_epub.publisher import Publisher, register_publisher class ScienceDirect(Publisher): """Class for Science Direct (Elsevier) articles""" - domains = ["sciencedirect.com","www.sciencedirect.com", - "linkinghub.elsevier.com"] + domains = ["sciencedirect.com","elsevier.com"] - def get_title(self): - """Get article title""" - self.title = self.soup.find('span',class_='title-text').text + #def get_title(self): + # """Get article title""" + # self.title = self.soup.find('span',class_='title-text').text - def get_authors(self): - """Get author given and surnammes""" - author_raw = self.soup.find('div',class_='author-group') \ - .find_all('span',class_='text surname') - self.author_surnames = [] - for i in author_raw: - self.author_surnames.append(i.text) - - author_raw = 
self.soup.find('div',class_='author-group') \ - .find_all('span',class_='text given-name') - self.author_givennames = [] - for i in author_raw: - self.author_givennames.append(i.text) + #def get_authors(self): + # """Get author given and surnammes""" + # author_raw = self.soup.find('div',class_='author-group') \ + # .find_all('span',class_='text surname') + # self.author_surnames = [] + # for i in author_raw: + # self.author_surnames.append(i.text) + # + # author_raw = self.soup.find('div',class_='author-group') \ + # .find_all('span',class_='text given-name') + # self.author_givennames = [] + # for i in author_raw: + # self.author_givennames.append(i.text) + + def get_doi(self): + if self.doi == None: + doi_raw = self.soup.find('a',class_='doi').get('href').split('/') + self.doi = str(doi_raw[3]+'/'+doi_raw[4]) def get_abstract(self): """Get article abstract""" @@ -36,21 +40,21 @@ class ScienceDirect(Publisher): for i in keys_raw: self.keywords.append(i.text) - def get_metadata(self): - """Get assortment of other metadata""" - if self.doi == None: - doi_raw = self.soup.find('a',class_='doi').get('href').split('/') - self.doi = doi_raw[3]+'/'+doi_raw[4] - - self.journal = self.soup.find('div',class_='publication-volume') \ - .find('span',class_='size-xl').text - - pubdate_raw = self.soup.find('div',class_='publication-volume') \ - .find('span',class_='size-m').text.split(',') - - self.year = pubdate_raw[-2].split(' ')[-1] - self.volume = pubdate_raw[0].split(' ')[1] - self.pages = pubdate_raw[-1].split(' ')[2] + #def get_metadata(self): + # """Get assortment of other metadata""" + # if self.doi == None: + # doi_raw = self.soup.find('a',class_='doi').get('href').split('/') + # self.doi = doi_raw[3]+'/'+doi_raw[4] + # + # self.journal = self.soup.find('div',class_='publication-volume') \ + # .find('span',class_='size-xl').text + # + # pubdate_raw = self.soup.find('div',class_='publication-volume') \ + # .find('span',class_='size-m').text.split(',') + # + # self.year 
= pubdate_raw[-2].split(' ')[-1] + # self.volume = pubdate_raw[0].split(' ')[1] + # self.pages = pubdate_raw[-1].split(' ')[2] def get_body(self): """Get body of article""" diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py index bdd0354..77930d4 100644 --- a/article_epub/publishers/springer.py +++ b/article_epub/publishers/springer.py @@ -3,21 +3,26 @@ from article_epub.publisher import Publisher, register_publisher class Springer(Publisher): """Class for Springer articles""" - domains = ["link.springer.com","springer.com","www.springer.com"] + domains = ["springer.com"] - def get_title(self): - """Get article title""" - self.title = self.soup.find('h1',class_='ArticleTitle').text - - def get_authors(self): - """Get author given and surnammes""" - author_raw = self.soup.find_all('span',class_='authors__name') - self.author_surnames = [] - self.author_givennames = [] - for i in author_raw: - name = i.text.split('\xa0') - self.author_surnames.append(name[-1]) - self.author_givennames.append(' '.join(name[:-1])) + #def get_title(self): + # """Get article title""" + # self.title = self.soup.find('h1',class_='ArticleTitle').text + + #def get_authors(self): + # """Get author given and surnammes""" + # author_raw = self.soup.find_all('span',class_='authors__name') + # self.author_surnames = [] + # self.author_givennames = [] + # for i in author_raw: + # name = i.text.split('\xa0') + # self.author_surnames.append(name[-1]) + # self.author_givennames.append(' '.join(name[:-1])) + + def get_doi(self): + if self.doi == None: + doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/') + self.doi = str(doi_raw[-2]+'/'+doi_raw[-1]) def get_abstract(self): """Get article abstract""" @@ -30,21 +35,21 @@ class Springer(Publisher): for i in keywords_raw: self.keywords.append(i.text.replace('\xa0','')) - def get_metadata(self): - """Get assortment of other metadata""" - if self.doi == None: - doi_raw = 
self.soup.find('span',{"id":"doi-url"}).text.split('/') - self.doi = doi_raw[-2]+'/'+doi_raw[-1] - - self.journal = self.soup.find('span',class_="JournalTitle").text - - self.year = self.soup.find('time')['datetime'].split('-')[0] - - self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \ - .text[:-2].split(' ')[-1] - - self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \ - .text.split(' ')[-1] + #def get_metadata(self): + # """Get assortment of other metadata""" + # if self.doi == None: + # doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/') + # self.doi = doi_raw[-2]+'/'+doi_raw[-1] + # + # self.journal = self.soup.find('span',class_="JournalTitle").text + # + # self.year = self.soup.find('time')['datetime'].split('-')[0] + # + # self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \ + # .text[:-2].split(' ')[-1] + # + # self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \ + # .text.split(' ')[-1] def get_body(self): """Get body of article""" diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py new file mode 100644 index 0000000..0133c1d --- /dev/null +++ b/article_epub/publishers/wiley.py @@ -0,0 +1,47 @@ +from article_epub.publisher import Publisher, register_publisher + +class Wiley(Publisher): + """Class for Springer articles""" + + domains = ["wiley.com"] + + def get_final_url(self): + if '/abs/' in self.url: + self.url = self.url.replace('/abs/','/full/') + + def get_doi(self): + if self.doi == None: + doi_raw = self.soup.find('a',class_='epub-doi').text.split('/') + self.doi = str(doi_raw[3]+'/'+doi_raw[4]) + + def get_abstract(self): + """Get article abstract""" + self.abstract = self.soup.find('section', + class_='article-section__abstract') + + def get_keywords(self): + """Get article keywords""" + keywords_raw = self.soup.find('section',class_='keywords') \ + .find_all('a',class_='badge-type') + self.keywords = [] + for i in keywords_raw: + 
self.keywords.append(i.text.replace('\n','').replace('\u200a','')) + + def get_body(self): + """Get body of article""" + body_raw = self.soup.find_all('div',class_='article-section__content') + body_raw = body_raw[1:] + self.body = '' + for i in body_raw: + self.body += str(i) + + def get_references(self): + """Get references list""" + references_raw = str(self.soup.find('section', + {'id':'references-section'})) + references_raw = references_raw.replace('"display: none;"','') + references_raw = references_raw.replace('Literature Cited','') + references_raw = references_raw.replace('data-bib-id','id') + self.references = '<h2>Literature Cited</h2>\n'+references_raw + +register_publisher(Wiley) |