Fix some bugs in the Oxford, Taylor & Francis, and Wiley publisher classes. Give publishers more readable names.

author: Ken Kellner <ken@kenkellner.com> 2018-04-18 15:01:50 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-18 15:01:50 -0400
commit: e67799a4d254fd1d9c5082cf664e8ccc40bf341a (patch)
tree: eceb8e47988149649ef88665f2c83e44360459fa
parent: d2569d0a1f262e74a3ffd8add3ecb874040e57a9 (diff)
13 files changed, 37 insertions, 8 deletions
diff --git a/README.md b/README.md
index d34c9b9..00d944e 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,7 @@ Currently, the following publishers are supported:
 * PLoS ONE
 * National Institutes of Health (NIH)
 * NRC Research Press
+* Taylor & Francis
 
 Dependencies
 ------------
diff --git a/article-epub.py b/article-epub.py
index 6753f6e..496e829 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -22,7 +22,7 @@ def main():
         pubs = article_epub.list_publishers()
         print('Available publishers:')
         for i in pubs:
-            print('• '+i.__name__)
+            print('• '+i)
         sys.exit()
     
     if args.u != None:
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index e8f0170..9756abb 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -11,6 +11,7 @@ import json
 
 _publishers = list()
 _publisher_domains = dict()
+_publisher_names = list()
 
 class Publisher(object):
     """General class for scientific article publishers"""
@@ -168,6 +169,7 @@ class Publisher(object):
 
 def register_publisher(publisher):
     _publishers.append(publisher)
+    _publisher_names.append(publisher.name)
     for d in publisher.domains:
         _publisher_domains[d] = publisher
 
@@ -175,7 +177,7 @@ def get_publishers():
     return _publisher_domains
 
 def list_publishers():
-    return _publishers
+    return _publisher_names
 
 def match_publisher(url,doi):
     """Match a URL to a publisher class"""
@@ -183,7 +185,7 @@ def match_publisher(url,doi):
             .split('?')[0].split('.')[-2:])
     try:
         art = get_publishers()[domain](url=url,doi=doi)
-        print('Matched URL to publisher: '+art.__class__.__name__)
+        print('Matched URL to publisher: '+art.name)
         return(art)
     except:
         sys.exit('Publisher not supported.')
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
index dcc0bab..41485be 100644
--- a/article_epub/publishers/bioone.py
+++ b/article_epub/publishers/bioone.py
@@ -5,7 +5,8 @@ import sys
 
 class BioOne(Publisher):
     """Class for BioOne articles"""
-
+    
+    name = "BioOne"
     domains = ["bioone.org"]
     
     def check_fulltext(self):
diff --git a/article_epub/publishers/nih.py b/article_epub/publishers/nih.py
index 60294aa..7e0df3f 100644
--- a/article_epub/publishers/nih.py
+++ b/article_epub/publishers/nih.py
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
 class NIH(Publisher):
     """Class for NIH NCBI articles"""
 
+    name = "NIH-NCBI"
     domains = ["nih.gov"]
 
     def soupify(self):
diff --git a/article_epub/publishers/nrc.py b/article_epub/publishers/nrc.py
index 57c5f75..645cbc1 100644
--- a/article_epub/publishers/nrc.py
+++ b/article_epub/publishers/nrc.py
@@ -5,6 +5,7 @@ import sys
 class NRC(Publisher):
     """Class for NRC Research Press articles"""
 
+    name = "NRC Research Press"
     domains = ["nrcresearchpress.com"]
     
     def check_fulltext(self):
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index ee72f11..948ec6f 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -5,6 +5,7 @@ import sys
 class Oxford(Publisher):
     """Class for Oxford articles"""
 
+    name = "Oxford Academic"
     domains = ["oup.com"]
 
     def check_fulltext(self):
@@ -39,7 +40,10 @@ class Oxford(Publisher):
         """Get body of article"""
         body_raw = copy.copy(self.soup.find(
             'div',{'data-widgetname':'ArticleFulltext'}))
-        body_raw.find('h2',class_='abstract-title').decompose()
+        try:
+            body_raw.find('h2',class_='abstract-title').decompose()
+        except:
+            pass
         body_raw.find('section',class_='abstract').decompose()
         body_raw.find('div',class_='article-metadata-panel').decompose()
         body_raw.find('div',class_='ref-list').decompose()
diff --git a/article_epub/publishers/plosone.py b/article_epub/publishers/plosone.py
index 8638fad..01204d6 100644
--- a/article_epub/publishers/plosone.py
+++ b/article_epub/publishers/plosone.py
@@ -3,6 +3,7 @@ from article_epub.publisher import Publisher, register_publisher
 class PLoSONE(Publisher):
     """Class for PLoS ONE articles"""
 
+    name = "PLoS ONE"
     domains = ["plos.org"]
 
     def get_doi(self):
diff --git a/article_epub/publishers/royalsociety.py b/article_epub/publishers/royalsociety.py
index 0c3b2d4..9da5846 100644
--- a/article_epub/publishers/royalsociety.py
+++ b/article_epub/publishers/royalsociety.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 class RoyalSociety(Publisher):
     """Class for Royal Society Publishing articles"""
 
+    name = "Royal Society Publishing"
     domains = ["royalsocietypublishing.org"]
 
     def check_fulltext(self):
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 3ffcad1..186e68f 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -4,6 +4,7 @@ import sys
 class ScienceDirect(Publisher):
     """Class for Science Direct (Elsevier) articles"""
 
+    name = "Elsevier"
     domains = ["sciencedirect.com","elsevier.com"]
 
     def check_fulltext(self):
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
index 9013d1d..4e3bcb8 100644
--- a/article_epub/publishers/springer.py
+++ b/article_epub/publishers/springer.py
@@ -4,6 +4,7 @@ import sys
 class Springer(Publisher):
     """Class for Springer articles"""
 
+    name = "Springer"
     domains = ["springer.com"]
 
     def check_fulltext(self):
diff --git a/article_epub/publishers/tandf.py b/article_epub/publishers/tandf.py
index 498e116..7d33518 100644
--- a/article_epub/publishers/tandf.py
+++ b/article_epub/publishers/tandf.py
@@ -4,6 +4,7 @@ import sys
 class TandF(Publisher):
     """Class for Taylor & Francis articles"""
 
+    name = "Taylor & Francis"
     domains = ["tandfonline.com"]
 
     def get_final_url(self):
@@ -24,7 +25,15 @@ class TandF(Publisher):
     def get_abstract(self):
         """Get article abstract"""
         abstract_raw = self.soup.find('div',class_='hlFld-Abstract')
-        abstract_raw.find('p',class_='summary-title').decompose()
+        try:
+            abstract_raw.find('p',class_='summary-title').decompose()
+        except:
+            pass
+        try:
+            abstract_raw.find('div',{'id':'mathJaxToggle'}).decompose()
+        except:
+            pass
+        
         self.abstract = str(abstract_raw)
 
     def get_keywords(self):
@@ -71,6 +80,9 @@ class TandF(Publisher):
             link = 'https://www.tandfonline.com'+csv['href']
             csv['href'] = link
             i.find('a',{'id':'displaySizeTable'}).decompose()
+        
+        for i in self.soup.find_all('span',class_='NLM_disp-formula-image'):
+            i.decompose()
 
         self.body = ''
         for i in body_raw:
@@ -81,6 +93,9 @@ class TandF(Publisher):
         references_raw = self.soup.find('ul',{'id':'references-Section'})
         for i in references_raw.find_all('div',class_='xlinks-container'):
             i.decompose()
+
+        for i in references_raw.find_all('img'):
+            i.decompose()
         
         references_title = '<h2>References</h2>\n'
         self.references = references_title+str(references_raw)
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
index d5fbdc7..de15213 100644
--- a/article_epub/publishers/wiley.py
+++ b/article_epub/publishers/wiley.py
@@ -4,6 +4,7 @@ import sys
 class Wiley(Publisher):
     """Class for Springer articles"""
 
+    name = "Wiley"
     domains = ["wiley.com"]
 
     def get_final_url(self):
@@ -11,8 +12,7 @@ class Wiley(Publisher):
             self.url = self.url.replace('/abs/','/full/')
    
     def check_fulltext(self):
-        test = self.soup.find_all('div',class_='article-section__content')
-        if len(test) < 4:
+        if self.soup.find('section',class_='article-section__full') == None:
             sys.exit('Error: Can\'t access fulltext of article')
         else:
             return(True)
author	Ken Kellner <ken@kenkellner.com>	2018-04-18 15:01:50 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-18 15:01:50 -0400
commit	e67799a4d254fd1d9c5082cf664e8ccc40bf341a (patch)
tree	eceb8e47988149649ef88665f2c83e44360459fa
parent	d2569d0a1f262e74a3ffd8add3ecb874040e57a9 (diff)