author     Ken Kellner <ken@kenkellner.com>  2018-04-05 13:14:43 -0400
committer  Ken Kellner <ken@kenkellner.com>  2018-04-05 13:14:43 -0400
commit     34913dd9ff573cd506e45dda5c13008f451cec11 (patch)
tree       08de3d70a4a530337ac30b34c0f4a4455ac8c668
parent     7aacb78f566a97c90b8f460bf33e075a022060c1 (diff)
Add BioOne to supported publishers
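
This adds a scraper class for BioOne (article_epub/publishers/bioone.py) and
registers it alongside the existing ScienceDirect, Springer, and Wiley
publishers. It also adds progress messages, exits cleanly with
'Publisher not supported.' when a URL's domain matches no registered
publisher, and removes commented-out code from the ScienceDirect and
Springer modules.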
-rwxr-xr-x  article-epub.py                            9
-rw-r--r--  article_epub/publisher.py                  6
-rw-r--r--  article_epub/publishers/__init__.py        1
-rw-r--r--  article_epub/publishers/bioone.py         69
-rw-r--r--  article_epub/publishers/sciencedirect.py  34
-rw-r--r--  article_epub/publishers/springer.py       30
6 files changed, 80 insertions(+), 69 deletions(-)
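
For orientation, the new class slots into the domain-keyed dispatch that
main() uses below. A minimal usage sketch under the package layout in this
patch; the article URL here is a hypothetical placeholder, not a real paper:

    import article_epub.publisher
    import article_epub.publishers  # importing the package registers the bundled publisher classes

    # Hypothetical BioOne article URL; main() derives 'bioone.org' from it.
    url = 'https://bioone.org/doi/full/10.1234/example'
    art = article_epub.publisher.get_publishers()['bioone.org'](url=url, doi=None)
    art.soupify()        # fetch the page with headless Firefox
    art.extract_data()   # DOI, metadata, abstract, keywords, body, references
    art.epubify()        # write out the EPUB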
diff --git a/article-epub.py b/article-epub.py
index 2a5d339..3b1ef25 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -4,7 +4,8 @@ import sys
 import requests
 
 def main():
-    if sys.argv[1] == '-d':
+    if sys.argv[1] == '-d':
+        print("Getting URL from DOI...")
         url = requests.get('https://doi.org/'+sys.argv[2]).url
         doi = sys.argv[2]
     else:
@@ -14,9 +15,11 @@ def main():
     domain = ".".join(url.split("//")[-1].split("/")[0] \
             .split('?')[0].split('.')[-2:])
 
-    art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi)
+    try:
+        art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi)
+    except:
+        sys.exit('Publisher not supported.')
 
-    print('Downloading content...')
     art.soupify()
     art.extract_data()
     art.epubify()
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 0ea9259..7af9a47 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -25,12 +25,15 @@ class Publisher(object):
         """Get HTML from article's page"""
         self.get_final_url()
         os.environ['MOZ_HEADLESS'] = '1'
+        print('Starting headless browser...')
         binary = FirefoxBinary('/usr/bin/firefox')
         try:
             driver = webdriver.Firefox(firefox_binary=binary,
                     log_path='/tmp/gecko_log')
         except:
             sys.exit('Failed to load Firefox; is it installed?')
+
+        print('Loading page...')
         try:
             driver.get(self.url)
         except:
@@ -99,8 +102,7 @@ class Publisher(object):
             +self.journal+'. '+' doi: '+self.doi
 
     def extract_data(self):
-        #self.get_title()
-        #self.get_authors()
+        print('Extracting data from HTML...')
         self.get_doi()
         self.get_metadata()
         self.get_abstract()
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 730fab3..f17fab8 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -1,3 +1,4 @@
 from article_epub.publishers.sciencedirect import ScienceDirect
 from article_epub.publishers.springer import Springer
 from article_epub.publishers.wiley import Wiley
+from article_epub.publishers.bioone import BioOne
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
new file mode 100644
index 0000000..73c4379
--- /dev/null
+++ b/article_epub/publishers/bioone.py
@@ -0,0 +1,69 @@
+from article_epub.publisher import Publisher, register_publisher
+import requests
+from bs4 import BeautifulSoup
+
+class BioOne(Publisher):
+    """Class for BioOne articles"""
+
+    domains = ["bioone.org"]
+
+    def get_final_url(self):
+        if '/abs/' in self.url:
+            self.url = self.url.replace('/doi/abs/','/doi/')
+
+    def get_doi(self):
+        if self.doi == None:
+            doi_raw = self.soup.find('p',class_='articleRef') \
+                    .find('a').text.split('/')
+            self.doi = str(doi_raw[3]+'/'+doi_raw[4])
+
+    def get_abstract(self):
+        """Get article abstract"""
+        abstract_raw = str(self.soup.find('div',class_='abstractSection'))
+        self.abstract = abstract_raw.replace('<h3','<h2') \
+                .replace('</h3>','</h2>').replace('Abstract. ','ABSTRACT')
+
+    def get_keywords(self):
+        """Get article keywords"""
+        pass
+
+    def get_body(self):
+        """Get body of article"""
+        body_full = self.soup.find('div',class_='hlFld-Fulltext')
+        links_old = body_full.find_all('a',class_='ref')
+        for i in links_old:
+            try:
+                tag = '#'+i['onclick'].split("'")[1]
+                i['href'] = str(tag)
+                i['onclick'] = ''
+            except:
+                pass
+
+        print('Downloading higher-quality images...')
+        imgs_old = body_full.find_all('div',class_='articleImage')
+        for i in imgs_old:
+            try:
+                link = i.find('a',class_='popupLink')
+                imgpage = BeautifulSoup(requests.get('https://bioone.org' \
+                        +str(link['href'])).content,'html.parser')
+                imglink = 'http://bioone.org'+str(imgpage.find('img')['src'])
+                link.find('img')['src'] = imglink
+                link['href'] = ''
+            except:
+                pass
+
+        body_raw = body_full.find_all('div',class_='NLM_sec_level_1')
+        self.body = ''
+        for i in body_raw:
+            self.body += str(i)
+
+        self.body = self.body.replace('<h6>','<h2>').replace('</h6>','</h2>')
+        self.body = self.body.replace('enlarge figure','')
+
+    def get_references(self):
+        """Get references list"""
+        references_raw = str(self.soup.find('div',class_='articleReferences'))
+        self.references = references_raw.replace('<h3>','<h2>') \
+                .replace('</h3>','</h2>')
+
+register_publisher(BioOne)
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 34fecfd..6ca4d8f 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -5,24 +5,6 @@ class ScienceDirect(Publisher):
 
     domains = ["sciencedirect.com","elsevier.com"]
 
-    #def get_title(self):
-    #    """Get article title"""
-    #    self.title = self.soup.find('span',class_='title-text').text
-
-    #def get_authors(self):
-    #    """Get author given and surnammes"""
-    #    author_raw = self.soup.find('div',class_='author-group') \
-    #            .find_all('span',class_='text surname')
-    #    self.author_surnames = []
-    #    for i in author_raw:
-    #        self.author_surnames.append(i.text)
-    #
-    #    author_raw = self.soup.find('div',class_='author-group') \
-    #            .find_all('span',class_='text given-name')
-    #    self.author_givennames = []
-    #    for i in author_raw:
-    #        self.author_givennames.append(i.text)
-
     def get_doi(self):
         if self.doi == None:
             doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
@@ -40,22 +22,6 @@ class ScienceDirect(Publisher):
         for i in keys_raw:
             self.keywords.append(i.text)
 
-    #def get_metadata(self):
-    #    """Get assortment of other metadata"""
-    #    if self.doi == None:
-    #        doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
-    #        self.doi = doi_raw[3]+'/'+doi_raw[4]
-    #
-    #    self.journal = self.soup.find('div',class_='publication-volume') \
-    #            .find('span',class_='size-xl').text
-    #
-    #    pubdate_raw = self.soup.find('div',class_='publication-volume') \
-    #            .find('span',class_='size-m').text.split(',')
-    #
-    #    self.year = pubdate_raw[-2].split(' ')[-1]
-    #    self.volume = pubdate_raw[0].split(' ')[1]
-    #    self.pages = pubdate_raw[-1].split(' ')[2]
-
     def get_body(self):
         """Get body of article"""
         body_raw = str(self.soup.find('div',class_='Body'))
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
index 77930d4..d53eafc 100644
--- a/article_epub/publishers/springer.py
+++ b/article_epub/publishers/springer.py
@@ -4,20 +4,6 @@ class Springer(Publisher):
     """Class for Springer articles"""
 
     domains = ["springer.com"]
-
-    #def get_title(self):
-    #    """Get article title"""
-    #    self.title = self.soup.find('h1',class_='ArticleTitle').text
-
-    #def get_authors(self):
-    #    """Get author given and surnammes"""
-    #    author_raw = self.soup.find_all('span',class_='authors__name')
-    #    self.author_surnames = []
-    #    self.author_givennames = []
-    #    for i in author_raw:
-    #        name = i.text.split('\xa0')
-    #        self.author_surnames.append(name[-1])
-    #        self.author_givennames.append(' '.join(name[:-1]))
 
     def get_doi(self):
         if self.doi == None:
@@ -35,22 +21,6 @@ class Springer(Publisher):
         for i in keywords_raw:
             self.keywords.append(i.text.replace('\xa0',''))
 
-    #def get_metadata(self):
-    #    """Get assortment of other metadata"""
-    #    if self.doi == None:
-    #        doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
-    #        self.doi = doi_raw[-2]+'/'+doi_raw[-1]
-    #
-    #    self.journal = self.soup.find('span',class_="JournalTitle").text
-    #
-    #    self.year = self.soup.find('time')['datetime'].split('-')[0]
-    #
-    #    self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
-    #            .text[:-2].split(' ')[-1]
-    #
-    #    self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
-    #            .text.split(' ')[-1]
-
     def get_body(self):
         """Get body of article"""
         self.body = self.soup.find('div',{"id":"body"})
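
The BioOne module above also doubles as a template for further publishers. A
rough, hypothetical sketch of what the next scraper could look like under the
same conventions; the class name, domain, and CSS selectors are illustrative
only, not a real publisher:

    from article_epub.publisher import Publisher, register_publisher

    class ExamplePub(Publisher):
        """Hypothetical publisher class mirroring the structure of BioOne"""

        # Hypothetical domain; main() matches this against the article URL.
        domains = ["example.org"]

        def get_final_url(self):
            pass  # rewrite self.url here if abstract and full-text URLs differ

        def get_doi(self):
            if self.doi is None:
                self.doi = '10.1234/example'  # placeholder; parse self.soup instead

        def get_abstract(self):
            """Get article abstract"""
            self.abstract = str(self.soup.find('div', class_='abstract'))

        def get_keywords(self):
            """Get article keywords"""
            self.keywords = []

        def get_body(self):
            """Get body of article"""
            self.body = str(self.soup.find('div', class_='body'))

        def get_references(self):
            """Get references list"""
            self.references = str(self.soup.find('div', class_='references'))

    register_publisher(ExamplePub)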