From f2eb10a42b8772c082875296f2d426ecd85e7770 Mon Sep 17 00:00:00 2001 From: Ken Kellner Date: Fri, 20 Apr 2018 12:01:46 -0400 Subject: Support for Annual Reviews; tables don't work --- article_epub/publishers/__init__.py | 1 + article_epub/publishers/annualreviews.py | 107 +++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 article_epub/publishers/annualreviews.py diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py index 2fa5e09..d153124 100644 --- a/article_epub/publishers/__init__.py +++ b/article_epub/publishers/__init__.py @@ -8,3 +8,4 @@ from article_epub.publishers.nih import NIH from article_epub.publishers.nrc import NRC from article_epub.publishers.royalsociety import RoyalSociety from article_epub.publishers.tandf import TandF +from article_epub.publishers.annualreviews import AnnualReviews diff --git a/article_epub/publishers/annualreviews.py b/article_epub/publishers/annualreviews.py new file mode 100644 index 0000000..d5ebf01 --- /dev/null +++ b/article_epub/publishers/annualreviews.py @@ -0,0 +1,107 @@ +from article_epub.publisher import Publisher, register_publisher +import sys +import copy +import requests + +class AnnualReviews(Publisher): + """Class for Annual Reviews articles""" + + name = "Annual Reviews" + domains = ["annualreviews.org"] + + def get_final_url(self): + pass + + def check_fulltext(self): + test = self.soup.find_all('div',class_='hlFld-Fulltext') + if len(test) < 1: + sys.exit('Error: Can\'t access fulltext of article') + else: + return(True) + + def get_doi(self): + if self.doi == None: + self.doi = str(self.soup.find('meta',{'scheme':'doi'})['content']) + + def get_abstract(self): + """Get article abstract""" + abstract_raw = self.soup.find('div',class_='hlFld-Abstract') + try: + abstract_raw.find('iframe').decompose() + except: + pass + try: + abstract_raw.find('span',class_='title').decompose() + except: + pass + + self.abstract = str(abstract_raw) + + def get_keywords(self): + """Get article keywords""" + self.keywords = [] + try: + keywords_raw = self.soup.find('div',class_='hlFld-KeywordText') \ + .find_all('a') + for i in keywords_raw: + self.keywords.append(i.text) + except: + pass + + def get_body(self): + """Get body of article""" + body_raw = copy.copy(self.soup.find('div',class_='hlFld-Fulltext')) + + try: + body_raw.find('div',class_='lit-cited').decompose() + body_raw.find('div',{'id':'citations'}).decompose() + except: + pass + + for i in body_raw.find_all('a',class_='scrollRef'): + i['href'] = '#'+i['refid']+'ref' + + for i in body_raw.find_all('a',class_='scrollFig'): + try: + i['href'] = '#'+i['data-figindex'] + except: + pass + + for i in body_raw.find_all('figure'): + oldlink = 'https://www.annualreviews.org'+ \ + i.find('a').find('img')['src'] + newlink = oldlink.replace('small','medium') + response = requests.head(newlink).headers['content-type'] + if 'image' in response: + i.find('a').find('img')['src'] = newlink + else: + newlinkjpeg = newlink.replace('.gif','.jpeg') + i.find('a').find('img')['src'] = newlinkjpeg + + for i in body_raw.find_all('span',class_='NLM_inline-graphic'): + link = 'https://www.annualreviews.org'+\ + i.find('img')['src'] + i.find('img')['src'] = link + + for i in body_raw.find_all('div',class_='equation'): + link = 'https://www.annualreviews.org'+i.find('img')['src'] + i.find('img')['src'] = link + + self.body = str(body_raw) + + def get_references(self): + """Get references list""" + references = self.soup.find('div',class_='lit-cited') + + for i in references.find_all('ul',class_='off-links'): + i.decompose() + for i in references.find_all('div',class_='article-locations'): + i.decompose() + for i in references.find_all('a',class_='ar-modal-link citation'): + i.decompose() + for i in references.find_all('div',class_='citation-content'): + i.decompose() + + self.references = str(references) + +register_publisher(AnnualReviews) -- cgit v1.2.3