about summary refs log tree commit diff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-04-18 15:01:50 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-04-18 15:01:50 -0400
commit e67799a4d254fd1d9c5082cf664e8ccc40bf341a (patch)
tree eceb8e47988149649ef88665f2c83e44360459fa
parent d2569d0a1f262e74a3ffd8add3ecb874040e57a9 (diff)
Fix some bugs. Give publishers more readable names.
-rw-r--r-- README.md 1
-rwxr-xr-x article-epub.py 2
-rw-r--r-- article_epub/publisher.py 6
-rw-r--r-- article_epub/publishers/bioone.py 3
-rw-r--r-- article_epub/publishers/nih.py 1
-rw-r--r-- article_epub/publishers/nrc.py 1
-rw-r--r-- article_epub/publishers/oxford.py 6
-rw-r--r-- article_epub/publishers/plosone.py 1
-rw-r--r-- article_epub/publishers/royalsociety.py 1
-rw-r--r-- article_epub/publishers/sciencedirect.py 1
-rw-r--r-- article_epub/publishers/springer.py 1
-rw-r--r-- article_epub/publishers/tandf.py 17
-rw-r--r-- article_epub/publishers/wiley.py 4
13 files changed, 37 insertions, 8 deletions
diff --git a/README.md b/README.md
index d34c9b9..00d944e 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,7 @@ Currently, the following publishers are supported:
* PLoS ONE
* National Institutes of Health (NIH)
* NRC Research Press
+* Taylor & Francis
Dependencies
------------
diff --git a/article-epub.py b/article-epub.py
index 6753f6e..496e829 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -22,7 +22,7 @@ def main():
pubs = article_epub.list_publishers()
print('Available publishers:')
for i in pubs:
- print('• '+i.__name__)
+ print('• '+i)
sys.exit()
if args.u != None:
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index e8f0170..9756abb 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -11,6 +11,7 @@ import json
_publishers = list()
_publisher_domains = dict()
+_publisher_names = list()
class Publisher(object):
"""General class for scientific article publishers"""
@@ -168,6 +169,7 @@ class Publisher(object):
def register_publisher(publisher):
_publishers.append(publisher)
+ _publisher_names.append(publisher.name)
for d in publisher.domains:
_publisher_domains[d] = publisher
@@ -175,7 +177,7 @@ def get_publishers():
return _publisher_domains
def list_publishers():
- return _publishers
+ return _publisher_names
def match_publisher(url,doi):
"""Match a URL to a publisher class"""
@@ -183,7 +185,7 @@ def match_publisher(url,doi):
.split('?')[0].split('.')[-2:])
try:
art = get_publishers()[domain](url=url,doi=doi)
- print('Matched URL to publisher: '+art.__class__.__name__)
+ print('Matched URL to publisher: '+art.name)
return(art)
except:
sys.exit('Publisher not supported.')
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
index dcc0bab..41485be 100644
--- a/article_epub/publishers/bioone.py
+++ b/article_epub/publishers/bioone.py
@@ -5,7 +5,8 @@ import sys
class BioOne(Publisher):
"""Class for BioOne articles"""
-
+
+ name = "BioOne"
domains = ["bioone.org"]
def check_fulltext(self):
diff --git a/article_epub/publishers/nih.py b/article_epub/publishers/nih.py
index 60294aa..7e0df3f 100644
--- a/article_epub/publishers/nih.py
+++ b/article_epub/publishers/nih.py
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
class NIH(Publisher):
"""Class for NIH NCBI articles"""
+ name = "NIH-NCBI"
domains = ["nih.gov"]
def soupify(self):
diff --git a/article_epub/publishers/nrc.py b/article_epub/publishers/nrc.py
index 57c5f75..645cbc1 100644
--- a/article_epub/publishers/nrc.py
+++ b/article_epub/publishers/nrc.py
@@ -5,6 +5,7 @@ import sys
class NRC(Publisher):
"""Class for NRC Research Press articles"""
+ name = "NRC Research Press"
domains = ["nrcresearchpress.com"]
def check_fulltext(self):
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index ee72f11..948ec6f 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -5,6 +5,7 @@ import sys
class Oxford(Publisher):
"""Class for Oxford articles"""
+ name = "Oxford Academic"
domains = ["oup.com"]
def check_fulltext(self):
@@ -39,7 +40,10 @@ class Oxford(Publisher):
"""Get body of article"""
body_raw = copy.copy(self.soup.find(
'div',{'data-widgetname':'ArticleFulltext'}))
- body_raw.find('h2',class_='abstract-title').decompose()
+ try:
+ body_raw.find('h2',class_='abstract-title').decompose()
+ except:
+ pass
body_raw.find('section',class_='abstract').decompose()
body_raw.find('div',class_='article-metadata-panel').decompose()
body_raw.find('div',class_='ref-list').decompose()
diff --git a/article_epub/publishers/plosone.py b/article_epub/publishers/plosone.py
index 8638fad..01204d6 100644
--- a/article_epub/publishers/plosone.py
+++ b/article_epub/publishers/plosone.py
@@ -3,6 +3,7 @@ from article_epub.publisher import Publisher, register_publisher
class PLoSONE(Publisher):
"""Class for PLoS ONE articles"""
+ name = "PLoS ONE"
domains = ["plos.org"]
def get_doi(self):
diff --git a/article_epub/publishers/royalsociety.py b/article_epub/publishers/royalsociety.py
index 0c3b2d4..9da5846 100644
--- a/article_epub/publishers/royalsociety.py
+++ b/article_epub/publishers/royalsociety.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
class RoyalSociety(Publisher):
"""Class for Royal Society Publishing articles"""
+ name = "Royal Society Publishing"
domains = ["royalsocietypublishing.org"]
def check_fulltext(self):
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 3ffcad1..186e68f 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -4,6 +4,7 @@ import sys
class ScienceDirect(Publisher):
"""Class for Science Direct (Elsevier) articles"""
+ name = "Elsevier"
domains = ["sciencedirect.com","elsevier.com"]
def check_fulltext(self):
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
index 9013d1d..4e3bcb8 100644
--- a/article_epub/publishers/springer.py
+++ b/article_epub/publishers/springer.py
@@ -4,6 +4,7 @@ import sys
class Springer(Publisher):
"""Class for Springer articles"""
+ name = "Springer"
domains = ["springer.com"]
def check_fulltext(self):
diff --git a/article_epub/publishers/tandf.py b/article_epub/publishers/tandf.py
index 498e116..7d33518 100644
--- a/article_epub/publishers/tandf.py
+++ b/article_epub/publishers/tandf.py
@@ -4,6 +4,7 @@ import sys
class TandF(Publisher):
"""Class for Taylor & Francis articles"""
+ name = "Taylor & Francis"
domains = ["tandfonline.com"]
def get_final_url(self):
@@ -24,7 +25,15 @@ class TandF(Publisher):
def get_abstract(self):
"""Get article abstract"""
abstract_raw = self.soup.find('div',class_='hlFld-Abstract')
- abstract_raw.find('p',class_='summary-title').decompose()
+ try:
+ abstract_raw.find('p',class_='summary-title').decompose()
+ except:
+ pass
+ try:
+ abstract_raw.find('div',{'id':'mathJaxToggle'}).decompose()
+ except:
+ pass
+
self.abstract = str(abstract_raw)
def get_keywords(self):
@@ -71,6 +80,9 @@ class TandF(Publisher):
link = 'https://www.tandfonline.com'+csv['href']
csv['href'] = link
i.find('a',{'id':'displaySizeTable'}).decompose()
+
+ for i in self.soup.find_all('span',class_='NLM_disp-formula-image'):
+ i.decompose()
self.body = ''
for i in body_raw:
@@ -81,6 +93,9 @@ class TandF(Publisher):
references_raw = self.soup.find('ul',{'id':'references-Section'})
for i in references_raw.find_all('div',class_='xlinks-container'):
i.decompose()
+
+ for i in references_raw.find_all('img'):
+ i.decompose()
references_title = '<h2>References</h2>\n'
self.references = references_title+str(references_raw)
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
index d5fbdc7..de15213 100644
--- a/article_epub/publishers/wiley.py
+++ b/article_epub/publishers/wiley.py
@@ -4,6 +4,7 @@ import sys
class Wiley(Publisher):
"""Class for Springer articles"""
+ name = "Wiley"
domains = ["wiley.com"]
def get_final_url(self):
@@ -11,8 +12,7 @@ class Wiley(Publisher):
self.url = self.url.replace('/abs/','/full/')
def check_fulltext(self):
- test = self.soup.find_all('div',class_='article-section__content')
- if len(test) < 4:
+ if self.soup.find('section',class_='article-section__full') == None:
sys.exit('Error: Can\'t access fulltext of article')
else:
return(True)