"""Nature Publishing (nature.com) support for article_epub."""
import copy
import sys
from urllib.parse import urljoin

from article_epub.publisher import Publisher, register_publisher

# Site root used to turn the page's relative table/figure links into
# absolute ones.
_BASE_URL = 'https://www.nature.com'

# ``aria-labelledby`` values of the <section> elements dropped from the body.
_UNWANTED_SECTIONS = (
    'abstract',
    'references',
    'author-information',
    'rightslink',
    'article-comments',
)


def _discard(tag):
    """Remove ``tag`` from its tree; do nothing if the lookup returned None."""
    if tag is not None:
        tag.decompose()


class Nature(Publisher):
    """Class for Nature Publishing articles"""

    name = "Nature Publishing"
    domains = ["nature.com"]

    def check_fulltext(self):
        """Return True, or exit when the page shows a subscribe link.

        A link tagged ``data-track-action="subscribe"`` is treated as the
        sign that the full text is not accessible.

        Raises:
            SystemExit: if such a link is present.
        """
        if self.soup.find('a', {'data-track-action': 'subscribe'}) is not None:
            sys.exit('Error: Can\'t access fulltext of article')
        return True

    def get_doi(self):
        """Set ``self.doi`` from the page's DOI meta tag unless already set."""
        if self.doi is None:
            self.doi = str(self.soup.find('meta', {'name': 'DOI'})['content'])

    def get_abstract(self):
        """Get article abstract"""
        abstract_raw = self.soup.find('div', {'id': 'abstract-section'})
        if abstract_raw is not None:
            # Drop the first <span> inside the abstract section, if any.
            _discard(abstract_raw.find('span'))

        # NOTE(review): when the section is absent this stores the string
        # 'None', exactly as before; confirm whether callers would prefer ''.
        self.abstract = str(abstract_raw)

    def get_keywords(self):
        """Get article keywords"""
        # find_all returns an empty result when nothing matches, so an
        # article without subject tags simply yields an empty list.
        self.keywords = [
            tag.text
            for tag in self.soup.find_all('a', class_='subject-tag-link')
        ]

    def get_body(self):
        """Get body of article

        Works on a copy of the ``article-body`` div, strips the sections
        that are handled elsewhere or not wanted, rewrites links, and stores
        the resulting HTML in ``self.body``.

        Raises:
            AttributeError: if the page has no ``article-body`` div.
        """
        body_raw = copy.copy(self.soup.find('div', class_='article-body'))

        # Each section is removed independently, so one missing section no
        # longer prevents the others from being stripped.
        for label in _UNWANTED_SECTIONS:
            _discard(body_raw.find('section', {'aria-labelledby': label}))

        for title_label in body_raw.find_all(
                'span', class_='js-section-title-label'):
            title_label.decompose()

        # Table and figure links point at the site: make them absolute.
        # urljoin also copes with hrefs that are already absolute or lack a
        # leading slash, which plain concatenation would mangle.
        for action in ('view table', 'view figure'):
            for anchor in body_raw.find_all(
                    'a', {'data-track-action': action}):
                href = anchor.get('href')
                if href is not None:
                    anchor['href'] = urljoin(_BASE_URL, href)

        # Reference links keep only their fragment, so they become
        # in-document anchors.
        for anchor in body_raw.find_all(
                'a', {'data-track-action': 'reference anchor'}):
            pieces = anchor.get('href', '').split('#')
            if len(pieces) > 1:
                anchor['href'] = '#' + pieces[1]

        self.body = str(body_raw)

    def get_references(self):
        """Get references list

        Stores the HTML of the ``references-section`` div in
        ``self.references``, or an empty string when the page has no such
        section.
        """
        ref_all = self.soup.find('div', {'id': 'references-section'})
        if ref_all is None:
            self.references = ''
            return

        _discard(ref_all.find('span', class_='js-section-title-label'))

        ref_list = ref_all.find('ol')
        entries = []
        if ref_list is not None:
            entries = ref_list.find_all('li', recursive=False)

        for entry in entries:
            # Removed independently: an entry without a <span> still loses
            # its js-ref-links list.
            _discard(entry.find('span'))
            _discard(entry.find('ul', class_='js-ref-links'))

        self.references = str(ref_all)


register_publisher(Nature)
print('No matching link available.') + sys.exit('Getting URL from title failed') + print('Provided title:') print(title) print('Found following article:') print(possible_title) - choice = input("Is this correct (y/n)? ") + choice = input("\033[0;37m"+"Is this correct (y/n)? "+"\033[00m") if choice == 'y': return(possible_link) else: -- cgit v1.2.3