diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-18 15:01:50 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-18 15:01:50 -0400 |
commit | e67799a4d254fd1d9c5082cf664e8ccc40bf341a (patch) | |
tree | eceb8e47988149649ef88665f2c83e44360459fa | |
parent | d2569d0a1f262e74a3ffd8add3ecb874040e57a9 (diff) |
Fix some bugs. Give publishers more readable names.
-rw-r--r-- | README.md | 1 | ||||
-rwxr-xr-x | article-epub.py | 2 | ||||
-rw-r--r-- | article_epub/publisher.py | 6 | ||||
-rw-r--r-- | article_epub/publishers/bioone.py | 3 | ||||
-rw-r--r-- | article_epub/publishers/nih.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/nrc.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/oxford.py | 6 | ||||
-rw-r--r-- | article_epub/publishers/plosone.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/royalsociety.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/sciencedirect.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/springer.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/tandf.py | 17 | ||||
-rw-r--r-- | article_epub/publishers/wiley.py | 4 |
13 files changed, 37 insertions, 8 deletions
@@ -23,6 +23,7 @@ Currently, the following publishers are supported: * PLoS ONE * National Institutes of Health (NIH) * NRC Research Press +* Taylor & Francis Dependencies ------------ diff --git a/article-epub.py b/article-epub.py index 6753f6e..496e829 100755 --- a/article-epub.py +++ b/article-epub.py @@ -22,7 +22,7 @@ def main(): pubs = article_epub.list_publishers() print('Available publishers:') for i in pubs: - print('• '+i.__name__) + print('• '+i) sys.exit() if args.u != None: diff --git a/article_epub/publisher.py b/article_epub/publisher.py index e8f0170..9756abb 100644 --- a/article_epub/publisher.py +++ b/article_epub/publisher.py @@ -11,6 +11,7 @@ import json _publishers = list() _publisher_domains = dict() +_publisher_names = list() class Publisher(object): """General class for scientific article publishers""" @@ -168,6 +169,7 @@ class Publisher(object): def register_publisher(publisher): _publishers.append(publisher) + _publisher_names.append(publisher.name) for d in publisher.domains: _publisher_domains[d] = publisher @@ -175,7 +177,7 @@ def get_publishers(): return _publisher_domains def list_publishers(): - return _publishers + return _publisher_names def match_publisher(url,doi): """Match a URL to a publisher class""" @@ -183,7 +185,7 @@ def match_publisher(url,doi): .split('?')[0].split('.')[-2:]) try: art = get_publishers()[domain](url=url,doi=doi) - print('Matched URL to publisher: '+art.__class__.__name__) + print('Matched URL to publisher: '+art.name) return(art) except: sys.exit('Publisher not supported.') diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py index dcc0bab..41485be 100644 --- a/article_epub/publishers/bioone.py +++ b/article_epub/publishers/bioone.py @@ -5,7 +5,8 @@ import sys class BioOne(Publisher): """Class for BioOne articles""" - + + name = "BioOne" domains = ["bioone.org"] def check_fulltext(self): diff --git a/article_epub/publishers/nih.py b/article_epub/publishers/nih.py index 60294aa..7e0df3f 100644 --- a/article_epub/publishers/nih.py +++ b/article_epub/publishers/nih.py @@ -6,6 +6,7 @@ from bs4 import BeautifulSoup class NIH(Publisher): """Class for NIH NCBI articles""" + name = "NIH-NCBI" domains = ["nih.gov"] def soupify(self): diff --git a/article_epub/publishers/nrc.py b/article_epub/publishers/nrc.py index 57c5f75..645cbc1 100644 --- a/article_epub/publishers/nrc.py +++ b/article_epub/publishers/nrc.py @@ -5,6 +5,7 @@ import sys class NRC(Publisher): """Class for NRC Research Press articles""" + name = "NRC Research Press" domains = ["nrcresearchpress.com"] def check_fulltext(self): diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py index ee72f11..948ec6f 100644 --- a/article_epub/publishers/oxford.py +++ b/article_epub/publishers/oxford.py @@ -5,6 +5,7 @@ import sys class Oxford(Publisher): """Class for Oxford articles""" + name = "Oxford Academic" domains = ["oup.com"] def check_fulltext(self): @@ -39,7 +40,10 @@ class Oxford(Publisher): """Get body of article""" body_raw = copy.copy(self.soup.find( 'div',{'data-widgetname':'ArticleFulltext'})) - body_raw.find('h2',class_='abstract-title').decompose() + try: + body_raw.find('h2',class_='abstract-title').decompose() + except: + pass body_raw.find('section',class_='abstract').decompose() body_raw.find('div',class_='article-metadata-panel').decompose() body_raw.find('div',class_='ref-list').decompose() diff --git a/article_epub/publishers/plosone.py b/article_epub/publishers/plosone.py index 8638fad..01204d6 100644 --- a/article_epub/publishers/plosone.py +++ b/article_epub/publishers/plosone.py @@ -3,6 +3,7 @@ from article_epub.publisher import Publisher, register_publisher class PLoSONE(Publisher): """Class for PLoS ONE articles""" + name = "PLoS ONE" domains = ["plos.org"] def get_doi(self): diff --git a/article_epub/publishers/royalsociety.py b/article_epub/publishers/royalsociety.py index 0c3b2d4..9da5846 100644 --- a/article_epub/publishers/royalsociety.py +++ b/article_epub/publishers/royalsociety.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup class RoyalSociety(Publisher): """Class for Royal Society Publishing articles""" + name = "Royal Society Publishing" domains = ["royalsocietypublishing.org"] def check_fulltext(self): diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py index 3ffcad1..186e68f 100644 --- a/article_epub/publishers/sciencedirect.py +++ b/article_epub/publishers/sciencedirect.py @@ -4,6 +4,7 @@ import sys class ScienceDirect(Publisher): """Class for Science Direct (Elsevier) articles""" + name = "Elsevier" domains = ["sciencedirect.com","elsevier.com"] def check_fulltext(self): diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py index 9013d1d..4e3bcb8 100644 --- a/article_epub/publishers/springer.py +++ b/article_epub/publishers/springer.py @@ -4,6 +4,7 @@ import sys class Springer(Publisher): """Class for Springer articles""" + name = "Springer" domains = ["springer.com"] def check_fulltext(self): diff --git a/article_epub/publishers/tandf.py b/article_epub/publishers/tandf.py index 498e116..7d33518 100644 --- a/article_epub/publishers/tandf.py +++ b/article_epub/publishers/tandf.py @@ -4,6 +4,7 @@ import sys class TandF(Publisher): """Class for Taylor & Francis articles""" + name = "Taylor & Francis" domains = ["tandfonline.com"] def get_final_url(self): @@ -24,7 +25,15 @@ class TandF(Publisher): def get_abstract(self): """Get article abstract""" abstract_raw = self.soup.find('div',class_='hlFld-Abstract') - abstract_raw.find('p',class_='summary-title').decompose() + try: + abstract_raw.find('p',class_='summary-title').decompose() + except: + pass + try: + abstract_raw.find('div',{'id':'mathJaxToggle'}).decompose() + except: + pass + self.abstract = str(abstract_raw) def get_keywords(self): @@ -71,6 +80,9 @@ class TandF(Publisher): link = 'https://www.tandfonline.com'+csv['href'] csv['href'] = link i.find('a',{'id':'displaySizeTable'}).decompose() + + for i in self.soup.find_all('span',class_='NLM_disp-formula-image'): + i.decompose() self.body = '' for i in body_raw: @@ -81,6 +93,9 @@ class TandF(Publisher): references_raw = self.soup.find('ul',{'id':'references-Section'}) for i in references_raw.find_all('div',class_='xlinks-container'): i.decompose() + + for i in references_raw.find_all('img'): + i.decompose() references_title = '<h2>References</h2>\n' self.references = references_title+str(references_raw) diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py index d5fbdc7..de15213 100644 --- a/article_epub/publishers/wiley.py +++ b/article_epub/publishers/wiley.py @@ -4,6 +4,7 @@ import sys class Wiley(Publisher): """Class for Springer articles""" + name = "Wiley" domains = ["wiley.com"] def get_final_url(self): @@ -11,8 +12,7 @@ class Wiley(Publisher): self.url = self.url.replace('/abs/','/full/') def check_fulltext(self): - test = self.soup.find_all('div',class_='article-section__content') - if len(test) < 4: + if self.soup.find('section',class_='article-section__full') == None: sys.exit('Error: Can\'t access fulltext of article') else: return(True) |