diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-05 13:14:43 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-05 13:14:43 -0400 |
commit | 34913dd9ff573cd506e45dda5c13008f451cec11 (patch) | |
tree | 08de3d70a4a530337ac30b34c0f4a4455ac8c668 | |
parent | 7aacb78f566a97c90b8f460bf33e075a022060c1 (diff) |
Add BioOne to supported publishers
Mode | File | Lines changed |
---|---|---|
-rwxr-xr-x | article-epub.py | 9 |
-rw-r--r-- | article_epub/publisher.py | 6 |
-rw-r--r-- | article_epub/publishers/__init__.py | 1 |
-rw-r--r-- | article_epub/publishers/bioone.py | 69 |
-rw-r--r-- | article_epub/publishers/sciencedirect.py | 34 |
-rw-r--r-- | article_epub/publishers/springer.py | 30 |
6 files changed, 80 insertions, 69 deletions
diff --git a/article-epub.py b/article-epub.py index 2a5d339..3b1ef25 100755 --- a/article-epub.py +++ b/article-epub.py @@ -4,7 +4,8 @@ import sys import requests def main(): - if sys.argv[1] == '-d': + if sys.argv[1] == '-d': + print("Getting URL from DOI...") url = requests.get('https://doi.org/'+sys.argv[2]).url doi = sys.argv[2] else: @@ -14,9 +15,11 @@ def main(): domain = ".".join(url.split("//")[-1].split("/")[0] \ .split('?')[0].split('.')[-2:]) - art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi) + try: + art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi) + except: + sys.exit('Publisher not supported.') - print('Downloading content...') art.soupify() art.extract_data() art.epubify() diff --git a/article_epub/publisher.py b/article_epub/publisher.py index 0ea9259..7af9a47 100644 --- a/article_epub/publisher.py +++ b/article_epub/publisher.py @@ -25,12 +25,15 @@ class Publisher(object): """Get HTML from article's page""" self.get_final_url() os.environ['MOZ_HEADLESS'] = '1' + print('Starting headless browser...') binary = FirefoxBinary('/usr/bin/firefox') try: driver = webdriver.Firefox(firefox_binary=binary, log_path='/tmp/gecko_log') except: sys.exit('Failed to load Firefox; is it installed?') + + print('Loading page...') try: driver.get(self.url) except: @@ -99,8 +102,7 @@ class Publisher(object): +self.journal+'. 
'+' doi: '+self.doi def extract_data(self): - #self.get_title() - #self.get_authors() + print('Extracting data from HTML...') self.get_doi() self.get_metadata() self.get_abstract() diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py index 730fab3..f17fab8 100644 --- a/article_epub/publishers/__init__.py +++ b/article_epub/publishers/__init__.py @@ -1,3 +1,4 @@ from article_epub.publishers.sciencedirect import ScienceDirect from article_epub.publishers.springer import Springer from article_epub.publishers.wiley import Wiley +from article_epub.publishers.bioone import BioOne diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py new file mode 100644 index 0000000..73c4379 --- /dev/null +++ b/article_epub/publishers/bioone.py @@ -0,0 +1,69 @@ +from article_epub.publisher import Publisher, register_publisher +import requests +from bs4 import BeautifulSoup + +class BioOne(Publisher): + """Class for BioOne articles""" + + domains = ["bioone.org"] + + def get_final_url(self): + if '/abs/' in self.url: + self.url = self.url.replace('/doi/abs/','/doi/') + + def get_doi(self): + if self.doi == None: + doi_raw = self.soup.find('p',class_='articleRef') \ + .find('a').text.split('/') + self.doi = str(doi_raw[3]+'/'+doi_raw[4]) + + def get_abstract(self): + """Get article abstract""" + abstract_raw = str(self.soup.find('div',class_='abstractSection')) + self.abstract = abstract_raw.replace('<h3','<h2') \ + .replace('</h3>','</h2>').replace('Abstract. 
','ABSTRACT') + + def get_keywords(self): + """Get article keywords""" + pass + + def get_body(self): + """Get body of article""" + body_full = self.soup.find('div',class_='hlFld-Fulltext') + links_old = body_full.find_all('a',class_='ref') + for i in links_old: + try: + tag = '#'+i['onclick'].split("'")[1] + i['href'] = str(tag) + i['onclick'] = '' + except: + pass + + print('Downloading higher-quality images...') + imgs_old = body_full.find_all('div',class_='articleImage') + for i in imgs_old: + try: + link = i.find('a',class_='popupLink') + imgpage = BeautifulSoup(requests.get('https://bioone.org' \ + +str(link['href'])).content,'html.parser') + imglink = 'http://bioone.org'+str(imgpage.find('img')['src']) + link.find('img')['src'] = imglink + link['href'] = '' + except: + pass + + body_raw = body_full.find_all('div',class_='NLM_sec_level_1') + self.body = '' + for i in body_raw: + self.body += str(i) + + self.body = self.body.replace('<h6>','<h2>').replace('</h6>','</h2>') + self.body = self.body.replace('enlarge figure','') + + def get_references(self): + """Get references list""" + references_raw = str(self.soup.find('div',class_='articleReferences')) + self.references = references_raw.replace('<h3>','<h2>') \ + .replace('</h3>','</h2>') + +register_publisher(BioOne) diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py index 34fecfd..6ca4d8f 100644 --- a/article_epub/publishers/sciencedirect.py +++ b/article_epub/publishers/sciencedirect.py @@ -5,24 +5,6 @@ class ScienceDirect(Publisher): domains = ["sciencedirect.com","elsevier.com"] - #def get_title(self): - # """Get article title""" - # self.title = self.soup.find('span',class_='title-text').text - - #def get_authors(self): - # """Get author given and surnammes""" - # author_raw = self.soup.find('div',class_='author-group') \ - # .find_all('span',class_='text surname') - # self.author_surnames = [] - # for i in author_raw: - # self.author_surnames.append(i.text) 
- # - # author_raw = self.soup.find('div',class_='author-group') \ - # .find_all('span',class_='text given-name') - # self.author_givennames = [] - # for i in author_raw: - # self.author_givennames.append(i.text) - def get_doi(self): if self.doi == None: doi_raw = self.soup.find('a',class_='doi').get('href').split('/') @@ -40,22 +22,6 @@ class ScienceDirect(Publisher): for i in keys_raw: self.keywords.append(i.text) - #def get_metadata(self): - # """Get assortment of other metadata""" - # if self.doi == None: - # doi_raw = self.soup.find('a',class_='doi').get('href').split('/') - # self.doi = doi_raw[3]+'/'+doi_raw[4] - # - # self.journal = self.soup.find('div',class_='publication-volume') \ - # .find('span',class_='size-xl').text - # - # pubdate_raw = self.soup.find('div',class_='publication-volume') \ - # .find('span',class_='size-m').text.split(',') - # - # self.year = pubdate_raw[-2].split(' ')[-1] - # self.volume = pubdate_raw[0].split(' ')[1] - # self.pages = pubdate_raw[-1].split(' ')[2] - def get_body(self): """Get body of article""" body_raw = str(self.soup.find('div',class_='Body')) diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py index 77930d4..d53eafc 100644 --- a/article_epub/publishers/springer.py +++ b/article_epub/publishers/springer.py @@ -4,20 +4,6 @@ class Springer(Publisher): """Class for Springer articles""" domains = ["springer.com"] - - #def get_title(self): - # """Get article title""" - # self.title = self.soup.find('h1',class_='ArticleTitle').text - - #def get_authors(self): - # """Get author given and surnammes""" - # author_raw = self.soup.find_all('span',class_='authors__name') - # self.author_surnames = [] - # self.author_givennames = [] - # for i in author_raw: - # name = i.text.split('\xa0') - # self.author_surnames.append(name[-1]) - # self.author_givennames.append(' '.join(name[:-1])) def get_doi(self): if self.doi == None: @@ -35,22 +21,6 @@ class Springer(Publisher): for i in keywords_raw: 
self.keywords.append(i.text.replace('\xa0','')) - #def get_metadata(self): - # """Get assortment of other metadata""" - # if self.doi == None: - # doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/') - # self.doi = doi_raw[-2]+'/'+doi_raw[-1] - # - # self.journal = self.soup.find('span',class_="JournalTitle").text - # - # self.year = self.soup.find('time')['datetime'].split('-')[0] - # - # self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \ - # .text[:-2].split(' ')[-1] - # - # self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \ - # .text.split(' ')[-1] - def get_body(self): """Get body of article""" self.body = self.soup.find('div',{"id":"body"}) |