Some bug fixes and quality-of-life (QOL) improvements

author: Ken Kellner <ken@kenkellner.com> 2018-04-19 16:23:43 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-19 16:23:43 -0400
commit: 1c17504de24c88924925a0b903a6878974968218 (patch)
tree: 747afab2dcba392828073b38f516a723c63ec389
parent: f2029998c90f7ebf7a108011fd508eb1c2c00d24 (diff)
6 files changed, 60 insertions, 19 deletions
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 9756abb..1cc0562 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -183,10 +183,13 @@ def match_publisher(url,doi):
     """Match a URL to a publisher class"""
     domain = ".".join(url.split("//")[-1].split("/")[0] \
             .split('?')[0].split('.')[-2:])
+    if domain == 'doi.org':
+        sys.exit('DOI not found; is it correct?')
+
     try:
         art = get_publishers()[domain](url=url,doi=doi)
         print('Matched URL to publisher: '+art.name)
         return(art)
     except:
-        sys.exit('Publisher not supported.')
+        sys.exit('Publisher ['+domain+'] not supported.')
 
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
index 41485be..df9edeb 100644
--- a/article_epub/publishers/bioone.py
+++ b/article_epub/publishers/bioone.py
@@ -10,7 +10,7 @@ class BioOne(Publisher):
     domains = ["bioone.org"]
     
     def check_fulltext(self):
-        if self.soup.find('div',class_='hlFld-Fulltext') == None:
+        if self.soup.find('div',class_='NLM_sec_level_1') == None:
             sys.exit('Error: Can\'t access fulltext of article')
         else:
             return(True)
diff --git a/article_epub/publishers/nrc.py b/article_epub/publishers/nrc.py
index 645cbc1..aff1dd4 100644
--- a/article_epub/publishers/nrc.py
+++ b/article_epub/publishers/nrc.py
@@ -27,10 +27,13 @@ class NRC(Publisher):
 
     def get_keywords(self):
         """Get article keywords"""
-        keywords_raw = self.soup.find('font',{'size':'-1'}).find_all('a')
         self.keywords = []
-        for i in keywords_raw:
-            self.keywords.append(i.text)
+        try:
+            keywords_raw = self.soup.find('font',{'size':'-1'}).find_all('a')
+            for i in keywords_raw:
+                self.keywords.append(i.text)
+        except:
+            pass
 
     def get_body(self):
         """Get body of article"""
@@ -46,8 +49,8 @@ class NRC(Publisher):
             i.find('p').decompose()
         
         if len(figs) > 0:
-            temp_raw = 'http://nrcresearchpress.com'+newlink
-            template = temp_raw.split('f')[0:-2][0]
+            newlink = figs[0].find('img')['src']
+            template = newlink.split('f1')[0]
             for i in body_raw.find_all('div',class_='short-legend'):
                 i.decompose()
             
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index 0b86191..aa0a431 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -23,6 +23,10 @@ class Oxford(Publisher):
     def get_abstract(self):
         """Get article abstract"""
         abstract_raw = self.soup.find('section',class_='abstract')
+        if abstract_raw == None:
+            self.abstract = ''
+            return
+
         self.abstract = '<h2>Abstract</h2>\n'+str(abstract_raw)
 
     def get_keywords(self):
@@ -44,15 +48,24 @@ class Oxford(Publisher):
             body_raw.find('h2',class_='abstract-title').decompose()
         except:
             pass
-        body_raw.find('section',class_='abstract').decompose()
+        try:
+            body_raw.find('section',class_='abstract').decompose()
+        except:
+            pass
         try:
             body_raw.find('div',class_='article-metadata-panel').decompose()
         except:
             pass
-        body_raw.find('div',class_='ref-list').decompose()
+        try:
+            body_raw.find('div',class_='ref-list').decompose()
+            body_raw.find('h2',class_='backreferences-title').decompose()
+        except:
+            pass
         body_raw.find('span',{'id':'UserHasAccess'}).decompose()
-        body_raw.find('div',class_='copyright').decompose()
-        body_raw.find('h2',class_='backreferences-title').decompose()
+        try:
+            body_raw.find('div',class_='copyright').decompose()
+        except:
+            pass
 
         for i in body_raw.find_all('div',class_='fig-modal'):
             i.decompose()
@@ -78,8 +91,12 @@ class Oxford(Publisher):
     
     def get_references(self):
         """Get references list"""
-        references_title = self.soup.find('h2',class_='backreferences-title')
         references_raw = self.soup.find('div',class_='ref-list')
+        if references_raw == None:
+            self.references = ''
+            return
+        
+        references_title = self.soup.find('h2',class_='backreferences-title')
         refs_format = ''
         for i in references_raw.find_all('div',recursive=False):
             for j in i.find_all('a'):
diff --git a/article_epub/publishers/tandf.py b/article_epub/publishers/tandf.py
index 7d33518..963aaf2 100644
--- a/article_epub/publishers/tandf.py
+++ b/article_epub/publishers/tandf.py
@@ -80,9 +80,23 @@ class TandF(Publisher):
             link = 'https://www.tandfonline.com'+csv['href']
             csv['href'] = link
             i.find('a',{'id':'displaySizeTable'}).decompose()
-        
-        for i in self.soup.find_all('span',class_='NLM_disp-formula-image'):
-            i.decompose()
+       
+        for i in self.soup.find_all('span', class_='disp-formula'):
+            
+            link = 'https://www.tandfonline.com'+ \
+                    i.find('noscript').find('img')['src']
+
+            for j in i.find_all('noscript'):
+                j.decompose()
+
+            i.find('img')['src'] = link
+
+        for i in self.soup.find_all('span', class_='NLM_inline-graphic'):
+            for j in i.find_all('noscript'):
+                j.decompose()
+            
+            link = 'https://www.tandfonline.com'+i.find('img')['src']
+            i.find('img')['src'] = link
 
         self.body = ''
         for i in body_raw:
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
index 6d40491..9f08c59 100644
--- a/article_epub/publishers/wiley.py
+++ b/article_epub/publishers/wiley.py
@@ -12,11 +12,15 @@ class Wiley(Publisher):
             self.url = self.url.replace('/abs/','/full/')
    
     def check_fulltext(self):
-        if self.soup.find('section',class_='article-section__full') \
-            .find('div',class_='article-section__content').text == '\n\xa0\n':
-            sys.exit('Error: Can\'t access fulltext of article')
+        full = self.soup.find('section',class_='article-section__full')
+        if full != None:
+            if full.find('div',class_='article-section__content') \
+                .text == '\n\xa0\n':
+                sys.exit('Error: Can\'t access fulltext of article')
+            else:
+                return(True)
         else:
-            return(True)
+            sys.exit('Error: Can\'t access fulltext of article')
     
     def get_doi(self):
         if self.doi == None:
author	Ken Kellner <ken@kenkellner.com>	2018-04-19 16:23:43 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-19 16:23:43 -0400
commit	1c17504de24c88924925a0b903a6878974968218 (patch)
tree	747afab2dcba392828073b38f516a723c63ec389
parent	f2029998c90f7ebf7a108011fd508eb1c2c00d24 (diff)