about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-04-20 12:01:46 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-04-20 12:01:46 -0400
commit f2eb10a42b8772c082875296f2d426ecd85e7770 (patch)
tree a4188159427c788e7258cd183570cc5541869d56
parent 1c17504de24c88924925a0b903a6878974968218 (diff)
Support for Annual Reviews; tables don't work
-rw-r--r-- article_epub/publishers/__init__.py 1
-rw-r--r-- article_epub/publishers/annualreviews.py 107
2 files changed, 108 insertions, 0 deletions
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 2fa5e09..d153124 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -8,3 +8,4 @@ from article_epub.publishers.nih import NIH
from article_epub.publishers.nrc import NRC
from article_epub.publishers.royalsociety import RoyalSociety
from article_epub.publishers.tandf import TandF
+from article_epub.publishers.annualreviews import AnnualReviews
diff --git a/article_epub/publishers/annualreviews.py b/article_epub/publishers/annualreviews.py
new file mode 100644
index 0000000..d5ebf01
--- /dev/null
+++ b/article_epub/publishers/annualreviews.py
@@ -0,0 +1,107 @@
+from article_epub.publisher import Publisher, register_publisher
+import sys
+import copy
+import requests
+
+class AnnualReviews(Publisher):
+    """Scraper for articles hosted on annualreviews.org.
+
+    Every method reads the parsed article page from ``self.soup``
+    (presumably a BeautifulSoup tree, given the ``find``/``find_all``/
+    ``decompose`` calls) and stores its result on the instance.
+    """
+
+    name = "Annual Reviews"
+    domains = ["annualreviews.org"]
+
+    # Site root prepended to the relative image paths found in the page.
+    _base_url = 'https://www.annualreviews.org'
+    # Seconds to wait for the image probe in get_body before giving up.
+    _head_timeout = 30
+
+    def get_final_url(self):
+        """Do nothing; no URL handling is implemented for this publisher."""
+        pass
+
+    def check_fulltext(self):
+        """Return True when the page contains the full-text div.
+
+        Calls ``sys.exit`` with an error message when it does not.
+        """
+        if self.soup.find('div', class_='hlFld-Fulltext') is None:
+            sys.exit('Error: Can\'t access fulltext of article')
+        return True
+
+    def get_doi(self):
+        """Set ``self.doi`` from the DOI meta tag unless already set."""
+        if self.doi is None:
+            meta = self.soup.find('meta', {'scheme': 'doi'})
+            self.doi = str(meta['content'])
+
+    def get_abstract(self):
+        """Store the abstract HTML in ``self.abstract``.
+
+        The first iframe and the first ``span.title`` inside the abstract
+        div are removed (in place, on ``self.soup``). An empty string is
+        stored when the page has no abstract div.
+        """
+        abstract_raw = self.soup.find('div', class_='hlFld-Abstract')
+        if abstract_raw is None:
+            self.abstract = ''
+            return
+
+        iframe = abstract_raw.find('iframe')
+        if iframe is not None:
+            iframe.decompose()
+        title = abstract_raw.find('span', class_='title')
+        if title is not None:
+            title.decompose()
+
+        self.abstract = str(abstract_raw)
+
+    def get_keywords(self):
+        """Store the keyword link texts in ``self.keywords`` (a list).
+
+        The list is empty when the page has no keyword div.
+        """
+        container = self.soup.find('div', class_='hlFld-KeywordText')
+        if container is None:
+            self.keywords = []
+        else:
+            self.keywords = [link.text for link in container.find_all('a')]
+
+    def _absolute_url(self, src):
+        """Return ``src`` with the site root prepended unless it is
+        already an absolute http(s) URL."""
+        if src.startswith(('http://', 'https://')):
+            return src
+        return self._base_url + src
+
+    def get_body(self):
+        """Build the article body HTML and store it in ``self.body``.
+
+        Operates on ``copy.copy`` of the full-text div: removes the
+        embedded reference sections, points reference and figure links at
+        in-document anchors, and rewrites image sources to absolute URLs.
+        Assumes the full-text div exists (see ``check_fulltext``).
+        """
+        body_raw = copy.copy(self.soup.find('div', class_='hlFld-Fulltext'))
+
+        # Looked up one after the other so that a missing first section
+        # does not prevent the second one from being removed.
+        for attrs in ({'class': 'lit-cited'}, {'id': 'citations'}):
+            section = body_raw.find('div', attrs)
+            if section is not None:
+                section.decompose()
+
+        for link in body_raw.find_all('a', class_='scrollRef'):
+            refid = link.get('refid')
+            if refid is not None:
+                link['href'] = '#' + refid + 'ref'
+
+        for link in body_raw.find_all('a', class_='scrollFig'):
+            figindex = link.get('data-figindex')
+            if figindex is not None:
+                link['href'] = '#' + figindex
+
+        for figure in body_raw.find_all('figure'):
+            link = figure.find('a')
+            img = link.find('img') if link is not None else None
+            if img is None or not img.get('src'):
+                # Nothing to rewrite for a figure without a linked image.
+                continue
+            # Prefer the medium-sized rendition over the small thumbnail.
+            newlink = self._absolute_url(img['src'])
+            newlink = newlink.replace('small', 'medium')
+            # requests waits indefinitely unless a timeout is supplied.
+            probe = requests.head(newlink, timeout=self._head_timeout)
+            if 'image' in probe.headers.get('content-type', ''):
+                img['src'] = newlink
+            else:
+                # The probe did not report an image: use the .jpeg name.
+                img['src'] = newlink.replace('.gif', '.jpeg')
+
+        # Inline graphics first, then equations. _absolute_url leaves an
+        # already-absolute source alone, so an image reachable from both
+        # kinds of container is not prefixed twice.
+        holders = (body_raw.find_all('span', class_='NLM_inline-graphic')
+                   + body_raw.find_all('div', class_='equation'))
+        for holder in holders:
+            img = holder.find('img')
+            if img is not None and img.get('src'):
+                img['src'] = self._absolute_url(img['src'])
+
+        self.body = str(body_raw)
+
+    def get_references(self):
+        """Store the reference list HTML in ``self.references``.
+
+        Link lists, article locations, citation modal links and citation
+        content blocks are removed first (in place, on ``self.soup``). An
+        empty string is stored when the page has no ``lit-cited`` div.
+        """
+        references = self.soup.find('div', class_='lit-cited')
+        if references is None:
+            self.references = ''
+            return
+
+        unwanted = (
+            ('ul', 'off-links'),
+            ('div', 'article-locations'),
+            ('a', 'ar-modal-link citation'),
+            ('div', 'citation-content'),
+        )
+        for tag_name, css_class in unwanted:
+            for node in references.find_all(tag_name, class_=css_class):
+                node.decompose()
+
+        self.references = str(references)
+
+# Hand the class to register_publisher (imported from article_epub.publisher).
+register_publisher(AnnualReviews)