about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-04-19 16:23:43 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-04-19 16:23:43 -0400
commit 1c17504de24c88924925a0b903a6878974968218 (patch)
tree 747afab2dcba392828073b38f516a723c63ec389
parent f2029998c90f7ebf7a108011fd508eb1c2c00d24 (diff)
Some bugfixes and QOL improvements
-rw-r--r-- article_epub/publisher.py 5
-rw-r--r-- article_epub/publishers/bioone.py 2
-rw-r--r-- article_epub/publishers/nrc.py 13
-rw-r--r-- article_epub/publishers/oxford.py 27
-rw-r--r-- article_epub/publishers/tandf.py 20
-rw-r--r-- article_epub/publishers/wiley.py 12
6 files changed, 60 insertions, 19 deletions
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 9756abb..1cc0562 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -183,10 +183,13 @@ def match_publisher(url,doi):
"""Match a URL to a publisher class"""
domain = ".".join(url.split("//")[-1].split("/")[0] \
.split('?')[0].split('.')[-2:])
+ if domain == 'doi.org':
+ sys.exit('DOI not found; is it correct?')
+
try:
art = get_publishers()[domain](url=url,doi=doi)
print('Matched URL to publisher: '+art.name)
return(art)
except:
- sys.exit('Publisher not supported.')
+ sys.exit('Publisher ['+domain+'] not supported.')
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
index 41485be..df9edeb 100644
--- a/article_epub/publishers/bioone.py
+++ b/article_epub/publishers/bioone.py
@@ -10,7 +10,7 @@ class BioOne(Publisher):
domains = ["bioone.org"]
def check_fulltext(self):
- if self.soup.find('div',class_='hlFld-Fulltext') == None:
+ if self.soup.find('div',class_='NLM_sec_level_1') == None:
sys.exit('Error: Can\'t access fulltext of article')
else:
return(True)
diff --git a/article_epub/publishers/nrc.py b/article_epub/publishers/nrc.py
index 645cbc1..aff1dd4 100644
--- a/article_epub/publishers/nrc.py
+++ b/article_epub/publishers/nrc.py
@@ -27,10 +27,13 @@ class NRC(Publisher):
def get_keywords(self):
"""Get article keywords"""
- keywords_raw = self.soup.find('font',{'size':'-1'}).find_all('a')
self.keywords = []
- for i in keywords_raw:
- self.keywords.append(i.text)
+ try:
+ keywords_raw = self.soup.find('font',{'size':'-1'}).find_all('a')
+ for i in keywords_raw:
+ self.keywords.append(i.text)
+ except:
+ pass
def get_body(self):
"""Get body of article"""
@@ -46,8 +49,8 @@ class NRC(Publisher):
i.find('p').decompose()
if len(figs) > 0:
- temp_raw = 'http://nrcresearchpress.com'+newlink
- template = temp_raw.split('f')[0:-2][0]
+ newlink = figs[0].find('img')['src']
+ template = newlink.split('f1')[0]
for i in body_raw.find_all('div',class_='short-legend'):
i.decompose()
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index 0b86191..aa0a431 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -23,6 +23,10 @@ class Oxford(Publisher):
def get_abstract(self):
"""Get article abstract"""
abstract_raw = self.soup.find('section',class_='abstract')
+ if abstract_raw == None:
+ self.abstract = ''
+ return
+
self.abstract = '<h2>Abstract</h2>\n'+str(abstract_raw)
def get_keywords(self):
@@ -44,15 +48,24 @@ class Oxford(Publisher):
body_raw.find('h2',class_='abstract-title').decompose()
except:
pass
- body_raw.find('section',class_='abstract').decompose()
+ try:
+ body_raw.find('section',class_='abstract').decompose()
+ except:
+ pass
try:
body_raw.find('div',class_='article-metadata-panel').decompose()
except:
pass
- body_raw.find('div',class_='ref-list').decompose()
+ try:
+ body_raw.find('div',class_='ref-list').decompose()
+ body_raw.find('h2',class_='backreferences-title').decompose()
+ except:
+ pass
body_raw.find('span',{'id':'UserHasAccess'}).decompose()
- body_raw.find('div',class_='copyright').decompose()
- body_raw.find('h2',class_='backreferences-title').decompose()
+ try:
+ body_raw.find('div',class_='copyright').decompose()
+ except:
+ pass
for i in body_raw.find_all('div',class_='fig-modal'):
i.decompose()
@@ -78,8 +91,12 @@ class Oxford(Publisher):
def get_references(self):
"""Get references list"""
- references_title = self.soup.find('h2',class_='backreferences-title')
references_raw = self.soup.find('div',class_='ref-list')
+ if references_raw == None:
+ self.references = ''
+ return
+
+ references_title = self.soup.find('h2',class_='backreferences-title')
refs_format = ''
for i in references_raw.find_all('div',recursive=False):
for j in i.find_all('a'):
diff --git a/article_epub/publishers/tandf.py b/article_epub/publishers/tandf.py
index 7d33518..963aaf2 100644
--- a/article_epub/publishers/tandf.py
+++ b/article_epub/publishers/tandf.py
@@ -80,9 +80,23 @@ class TandF(Publisher):
link = 'https://www.tandfonline.com'+csv['href']
csv['href'] = link
i.find('a',{'id':'displaySizeTable'}).decompose()
-
- for i in self.soup.find_all('span',class_='NLM_disp-formula-image'):
- i.decompose()
+
+ for i in self.soup.find_all('span', class_='disp-formula'):
+
+ link = 'https://www.tandfonline.com'+ \
+ i.find('noscript').find('img')['src']
+
+ for j in i.find_all('noscript'):
+ j.decompose()
+
+ i.find('img')['src'] = link
+
+ for i in self.soup.find_all('span', class_='NLM_inline-graphic'):
+ for j in i.find_all('noscript'):
+ j.decompose()
+
+ link = 'https://www.tandfonline.com'+i.find('img')['src']
+ i.find('img')['src'] = link
self.body = ''
for i in body_raw:
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
index 6d40491..9f08c59 100644
--- a/article_epub/publishers/wiley.py
+++ b/article_epub/publishers/wiley.py
@@ -12,11 +12,15 @@ class Wiley(Publisher):
self.url = self.url.replace('/abs/','/full/')
def check_fulltext(self):
- if self.soup.find('section',class_='article-section__full') \
- .find('div',class_='article-section__content').text == '\n\xa0\n':
- sys.exit('Error: Can\'t access fulltext of article')
+ full = self.soup.find('section',class_='article-section__full')
+ if full != None:
+ if full.find('div',class_='article-section__content') \
+ .text == '\n\xa0\n':
+ sys.exit('Error: Can\'t access fulltext of article')
+ else:
+ return(True)
else:
- return(True)
+ sys.exit('Error: Can\'t access fulltext of article')
def get_doi(self):
if self.doi == None: