Add BioOne to supported publishers

author: Ken Kellner <ken@kenkellner.com> 2018-04-05 13:14:43 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-05 13:14:43 -0400
commit: 34913dd9ff573cd506e45dda5c13008f451cec11 (patch)
tree: 08de3d70a4a530337ac30b34c0f4a4455ac8c668
parent: 7aacb78f566a97c90b8f460bf33e075a022060c1 (diff)
6 files changed, 80 insertions, 69 deletions
diff --git a/article-epub.py b/article-epub.py
index 2a5d339..3b1ef25 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -4,7 +4,8 @@ import sys
 import requests
 
 def main():
-    if sys.argv[1] == '-d': 
+    if sys.argv[1] == '-d':
+        print("Getting URL from DOI...")
         url = requests.get('https://doi.org/'+sys.argv[2]).url
         doi = sys.argv[2]
     else:
@@ -14,9 +15,11 @@ def main():
     domain = ".".join(url.split("//")[-1].split("/")[0] \
             .split('?')[0].split('.')[-2:])
 
-    art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi)
+    try:
+        art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi)
+    except:
+        sys.exit('Publisher not supported.')
 
-    print('Downloading content...')
     art.soupify()
     art.extract_data()
     art.epubify()
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 0ea9259..7af9a47 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -25,12 +25,15 @@ class Publisher(object):
         """Get HTML from article's page"""
         self.get_final_url()
         os.environ['MOZ_HEADLESS'] = '1'
+        print('Starting headless browser...')
         binary = FirefoxBinary('/usr/bin/firefox')
         try:
             driver = webdriver.Firefox(firefox_binary=binary, 
                     log_path='/tmp/gecko_log')
         except:
             sys.exit('Failed to load Firefox; is it installed?')
+        
+        print('Loading page...')
         try:
             driver.get(self.url)
         except:
@@ -99,8 +102,7 @@ class Publisher(object):
                     +self.journal+'. '+' doi: '+self.doi
     
     def extract_data(self):
-        #self.get_title()
-        #self.get_authors()
+        print('Extracting data from HTML...')
         self.get_doi()
         self.get_metadata()
         self.get_abstract()
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 730fab3..f17fab8 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -1,3 +1,4 @@
 from article_epub.publishers.sciencedirect import ScienceDirect
 from article_epub.publishers.springer import Springer
 from article_epub.publishers.wiley import Wiley
+from article_epub.publishers.bioone import BioOne
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
new file mode 100644
index 0000000..73c4379
--- /dev/null
+++ b/article_epub/publishers/bioone.py
@@ -0,0 +1,69 @@
+from article_epub.publisher import Publisher, register_publisher
+import requests
+from bs4 import BeautifulSoup
+
+class BioOne(Publisher):
+    """Class for BioOne articles"""
+
+    domains = ["bioone.org"]
+
+    def get_final_url(self):
+        """Rewrite an abstract URL ('/doi/abs/') to the plain '/doi/' form"""
+        if '/abs/' in self.url:
+            self.url = self.url.replace('/doi/abs/','/doi/')
+
+    def get_doi(self):
+        """Get the DOI from the page, unless one was already supplied"""
+        if self.doi is None:
+            # assumes the link text is a DOI URL (scheme://host/prefix/suffix)
+            doi_raw = self.soup.find('p',class_='articleRef') \
+                .find('a').text.split('/')
+            self.doi = doi_raw[3]+'/'+doi_raw[4]
+
+    def get_abstract(self):
+        """Get article abstract"""
+        abstract_raw = str(self.soup.find('div',class_='abstractSection'))
+        self.abstract = abstract_raw.replace('<h3','<h2') \
+                .replace('</h3>','</h2>').replace('Abstract. ','ABSTRACT')
+
+    def get_keywords(self):
+        """Get article keywords"""
+        pass  # no keyword extraction is implemented for BioOne
+
+    def get_body(self):
+        """Get body of article, fixing in-page links and figure images"""
+        body_full = self.soup.find('div',class_='hlFld-Fulltext')
+        for i in body_full.find_all('a',class_='ref'):
+            try:
+                # Turn the quoted target inside onclick into a plain '#' href
+                i['href'] = '#'+i['onclick'].split("'")[1]
+                i['onclick'] = ''
+            except (KeyError, IndexError):
+                pass  # no usable onclick target; leave the link untouched
+
+        print('Downloading higher-quality images...')
+        for i in body_full.find_all('div',class_='articleImage'):
+            # Fetch the figure's popup page and use the image found there
+            try:
+                link = i.find('a',class_='popupLink')
+                imgpage = BeautifulSoup(requests.get('https://bioone.org' \
+                   +str(link['href'])).content,'html.parser')
+                imglink = 'http://bioone.org'+str(imgpage.find('img')['src'])
+                link.find('img')['src'] = imglink
+                link['href'] = ''
+            except Exception:
+                pass  # best effort: keep the original image on any failure
+
+        body_raw = body_full.find_all('div',class_='NLM_sec_level_1')
+        self.body = ''.join(str(i) for i in body_raw)
+
+        self.body = self.body.replace('<h6>','<h2>').replace('</h6>','</h2>')
+        self.body = self.body.replace('enlarge figure','')
+
+    def get_references(self):
+        """Get references list"""
+        references_raw = str(self.soup.find('div',class_='articleReferences'))
+        self.references = references_raw.replace('<h3>','<h2>') \
+                .replace('</h3>','</h2>')
+
+register_publisher(BioOne)
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 34fecfd..6ca4d8f 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -5,24 +5,6 @@ class ScienceDirect(Publisher):
 
     domains = ["sciencedirect.com","elsevier.com"]
     
-    #def get_title(self):
-    #    """Get article title"""
-    #    self.title = self.soup.find('span',class_='title-text').text
-
-    #def get_authors(self):
-    #    """Get author given and surnammes"""
-    #    author_raw = self.soup.find('div',class_='author-group') \
-    #        .find_all('span',class_='text surname')
-    #    self.author_surnames = []
-    #    for i in author_raw:
-    #        self.author_surnames.append(i.text)
-    #
-    #    author_raw = self.soup.find('div',class_='author-group') \
-    #        .find_all('span',class_='text given-name')
-    #    self.author_givennames = []
-    #    for i in author_raw:
-    #        self.author_givennames.append(i.text)
-    
     def get_doi(self):
         if self.doi == None:
             doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
@@ -40,22 +22,6 @@ class ScienceDirect(Publisher):
         for i in keys_raw:
             self.keywords.append(i.text)
 
-    #def get_metadata(self):
-    #    """Get assortment of other metadata"""
-    #    if self.doi == None:
-    #        doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
-    #        self.doi = doi_raw[3]+'/'+doi_raw[4]
-    #
-    #    self.journal = self.soup.find('div',class_='publication-volume') \
-    #        .find('span',class_='size-xl').text
-    #
-    #    pubdate_raw = self.soup.find('div',class_='publication-volume') \
-    #        .find('span',class_='size-m').text.split(',')
-    #
-    #    self.year = pubdate_raw[-2].split(' ')[-1]
-    #    self.volume = pubdate_raw[0].split(' ')[1]
-    #    self.pages = pubdate_raw[-1].split(' ')[2]
-
     def get_body(self):
         """Get body of article"""
         body_raw = str(self.soup.find('div',class_='Body'))
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
index 77930d4..d53eafc 100644
--- a/article_epub/publishers/springer.py
+++ b/article_epub/publishers/springer.py
@@ -4,20 +4,6 @@ class Springer(Publisher):
     """Class for Springer articles"""
 
     domains = ["springer.com"]
-    
-    #def get_title(self):
-    #    """Get article title"""
-    #    self.title = self.soup.find('h1',class_='ArticleTitle').text
-
-    #def get_authors(self):
-    #    """Get author given and surnammes"""
-    #    author_raw = self.soup.find_all('span',class_='authors__name')
-    #    self.author_surnames = []
-    #    self.author_givennames = []
-    #    for i in author_raw:
-    #        name = i.text.split('\xa0')
-    #        self.author_surnames.append(name[-1])
-    #        self.author_givennames.append(' '.join(name[:-1]))
 
     def get_doi(self):
         if self.doi == None:
@@ -35,22 +21,6 @@ class Springer(Publisher):
         for i in keywords_raw:
             self.keywords.append(i.text.replace('\xa0',''))
 
-    #def get_metadata(self):
-    #   """Get assortment of other metadata"""
-    #    if self.doi == None:
-    #        doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
-    #        self.doi = doi_raw[-2]+'/'+doi_raw[-1]
-    #
-    #    self.journal = self.soup.find('span',class_="JournalTitle").text
-    #
-    #    self.year = self.soup.find('time')['datetime'].split('-')[0]
-    #
-    #    self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
-    #        .text[:-2].split(' ')[-1]
-    #
-    #    self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
-    #        .text.split(' ')[-1]
-
     def get_body(self):
         """Get body of article"""
         self.body = self.soup.find('div',{"id":"body"})
author	Ken Kellner <ken@kenkellner.com>	2018-04-05 13:14:43 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-05 13:14:43 -0400
commit	34913dd9ff573cd506e45dda5c13008f451cec11 (patch)
tree	08de3d70a4a530337ac30b34c0f4a4455ac8c668
parent	7aacb78f566a97c90b8f460bf33e075a022060c1 (diff)