diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-19 16:23:43 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-19 16:23:43 -0400 |
commit | 1c17504de24c88924925a0b903a6878974968218 (patch) | |
tree | 747afab2dcba392828073b38f516a723c63ec389 | |
parent | f2029998c90f7ebf7a108011fd508eb1c2c00d24 (diff) |
Some bugfixes and QOL improvements
-rw-r--r-- | article_epub/publisher.py | 5 | ||||
-rw-r--r-- | article_epub/publishers/bioone.py | 2 | ||||
-rw-r--r-- | article_epub/publishers/nrc.py | 13 | ||||
-rw-r--r-- | article_epub/publishers/oxford.py | 27 | ||||
-rw-r--r-- | article_epub/publishers/tandf.py | 20 | ||||
-rw-r--r-- | article_epub/publishers/wiley.py | 12 |
6 files changed, 60 insertions, 19 deletions
diff --git a/article_epub/publisher.py b/article_epub/publisher.py index 9756abb..1cc0562 100644 --- a/article_epub/publisher.py +++ b/article_epub/publisher.py @@ -183,10 +183,13 @@ def match_publisher(url,doi): """Match a URL to a publisher class""" domain = ".".join(url.split("//")[-1].split("/")[0] \ .split('?')[0].split('.')[-2:]) + if domain == 'doi.org': + sys.exit('DOI not found; is it correct?') + try: art = get_publishers()[domain](url=url,doi=doi) print('Matched URL to publisher: '+art.name) return(art) except: - sys.exit('Publisher not supported.') + sys.exit('Publisher ['+domain+'] not supported.') diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py index 41485be..df9edeb 100644 --- a/article_epub/publishers/bioone.py +++ b/article_epub/publishers/bioone.py @@ -10,7 +10,7 @@ class BioOne(Publisher): domains = ["bioone.org"] def check_fulltext(self): - if self.soup.find('div',class_='hlFld-Fulltext') == None: + if self.soup.find('div',class_='NLM_sec_level_1') == None: sys.exit('Error: Can\'t access fulltext of article') else: return(True) diff --git a/article_epub/publishers/nrc.py b/article_epub/publishers/nrc.py index 645cbc1..aff1dd4 100644 --- a/article_epub/publishers/nrc.py +++ b/article_epub/publishers/nrc.py @@ -27,10 +27,13 @@ class NRC(Publisher): def get_keywords(self): """Get article keywords""" - keywords_raw = self.soup.find('font',{'size':'-1'}).find_all('a') self.keywords = [] - for i in keywords_raw: - self.keywords.append(i.text) + try: + keywords_raw = self.soup.find('font',{'size':'-1'}).find_all('a') + for i in keywords_raw: + self.keywords.append(i.text) + except: + pass def get_body(self): """Get body of article""" @@ -46,8 +49,8 @@ class NRC(Publisher): i.find('p').decompose() if len(figs) > 0: - temp_raw = 'http://nrcresearchpress.com'+newlink - template = temp_raw.split('f')[0:-2][0] + newlink = figs[0].find('img')['src'] + template = newlink.split('f1')[0] for i in body_raw.find_all('div',class_='short-legend'): i.decompose() diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py index 0b86191..aa0a431 100644 --- a/article_epub/publishers/oxford.py +++ b/article_epub/publishers/oxford.py @@ -23,6 +23,10 @@ class Oxford(Publisher): def get_abstract(self): """Get article abstract""" abstract_raw = self.soup.find('section',class_='abstract') + if abstract_raw == None: + self.abstract = '' + return + self.abstract = '<h2>Abstract</h2>\n'+str(abstract_raw) def get_keywords(self): @@ -44,15 +48,24 @@ class Oxford(Publisher): body_raw.find('h2',class_='abstract-title').decompose() except: pass - body_raw.find('section',class_='abstract').decompose() + try: + body_raw.find('section',class_='abstract').decompose() + except: + pass try: body_raw.find('div',class_='article-metadata-panel').decompose() except: pass - body_raw.find('div',class_='ref-list').decompose() + try: + body_raw.find('div',class_='ref-list').decompose() + body_raw.find('h2',class_='backreferences-title').decompose() + except: + pass body_raw.find('span',{'id':'UserHasAccess'}).decompose() - body_raw.find('div',class_='copyright').decompose() - body_raw.find('h2',class_='backreferences-title').decompose() + try: + body_raw.find('div',class_='copyright').decompose() + except: + pass for i in body_raw.find_all('div',class_='fig-modal'): i.decompose() @@ -78,8 +91,12 @@ class Oxford(Publisher): def get_references(self): """Get references list""" - references_title = self.soup.find('h2',class_='backreferences-title') references_raw = self.soup.find('div',class_='ref-list') + if references_raw == None: + self.references = '' + return + + references_title = self.soup.find('h2',class_='backreferences-title') refs_format = '' for i in references_raw.find_all('div',recursive=False): for j in i.find_all('a'): diff --git a/article_epub/publishers/tandf.py b/article_epub/publishers/tandf.py index 7d33518..963aaf2 100644 --- a/article_epub/publishers/tandf.py +++ b/article_epub/publishers/tandf.py @@ -80,9 +80,23 @@ class TandF(Publisher): link = 'https://www.tandfonline.com'+csv['href'] csv['href'] = link i.find('a',{'id':'displaySizeTable'}).decompose() - - for i in self.soup.find_all('span',class_='NLM_disp-formula-image'): - i.decompose() + + for i in self.soup.find_all('span', class_='disp-formula'): + + link = 'https://www.tandfonline.com'+ \ + i.find('noscript').find('img')['src'] + + for j in i.find_all('noscript'): + j.decompose() + + i.find('img')['src'] = link + + for i in self.soup.find_all('span', class_='NLM_inline-graphic'): + for j in i.find_all('noscript'): + j.decompose() + + link = 'https://www.tandfonline.com'+i.find('img')['src'] + i.find('img')['src'] = link self.body = '' for i in body_raw: diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py index 6d40491..9f08c59 100644 --- a/article_epub/publishers/wiley.py +++ b/article_epub/publishers/wiley.py @@ -12,11 +12,15 @@ class Wiley(Publisher): self.url = self.url.replace('/abs/','/full/') def check_fulltext(self): - if self.soup.find('section',class_='article-section__full') \ - .find('div',class_='article-section__content').text == '\n\xa0\n': - sys.exit('Error: Can\'t access fulltext of article') + full = self.soup.find('section',class_='article-section__full') + if full != None: + if full.find('div',class_='article-section__content') \ + .text == '\n\xa0\n': + sys.exit('Error: Can\'t access fulltext of article') + else: + return(True) else: - return(True) + sys.exit('Error: Can\'t access fulltext of article') def get_doi(self): if self.doi == None: |