Clean up Oxford recipe and improve program output formatting

author: Ken Kellner <ken@kenkellner.com> 2018-04-06 10:34:50 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-06 10:34:50 -0400
commit: a5ae056da7a2b739a7412c854f76f33958c13e4a (patch)
tree: dd2daee3f4b951dee427994671cd32ef47dcd105
parent: 59de534956c5dcacdd641afd0c5399c6279445fe (diff)
3 files changed, 45 insertions, 10 deletions
diff --git a/article-epub.py b/article-epub.py
index 3b1ef25..078c793 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -5,9 +5,10 @@ import requests
 
 def main():
     if sys.argv[1] == '-d':
-        print("Getting URL from DOI...")
+        print("Getting URL from DOI........",end='',flush=True)
         url = requests.get('https://doi.org/'+sys.argv[2]).url
         doi = sys.argv[2]
+        print('done')
     else:
         url = sys.argv[1]
         doi = None
@@ -23,6 +24,8 @@ def main():
     art.soupify()
     art.extract_data()
     art.epubify()
+    print('\nCitation: '+art.citation)
+    print('Filename: '+art.output)
 
 
 main()
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 7465f7f..eeb0c67 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -25,25 +25,26 @@ class Publisher(object):
         """Get HTML from article's page"""
         self.get_final_url()
         os.environ['MOZ_HEADLESS'] = '1'
-        print('Starting headless browser...')
+        print('Starting headless browser...',end='',flush=True)
         binary = FirefoxBinary('/usr/bin/firefox')
         try:
             driver = webdriver.Firefox(firefox_binary=binary, 
                     log_path='/tmp/gecko_log')
+            print('done')
         except:
             sys.exit('Failed to load Firefox; is it installed?')
         
-        print('Loading page...')
+        print('Loading page................',end="",flush=True)
         try:
             driver.get(self.url)
         except:
             sys.exit('Failed to load URL')
         
         if self.doi != None:
-            print('Waiting for redirects..')
             sleep(5) #To allow redirects
-        
+
         sleep(5)
+        print('done')   
         self.url = driver.current_url
         
         self.soup = BeautifulSoup(driver.page_source,'html.parser')
@@ -106,13 +107,14 @@ class Publisher(object):
                     +self.journal+'. '+' doi: '+self.doi
     
     def extract_data(self):
-        print('Extracting data from HTML...')
+        print('Extracting data from HTML...',end='',flush=True)
         self.get_doi()
         self.get_metadata()
         self.get_abstract()
         self.get_keywords()
         self.get_body()
         self.get_references()
+        print('done')
 
     def epubify(self):
         """Convert data into epub format"""
@@ -142,12 +144,12 @@ class Publisher(object):
         combined += str(self.body)
         combined += str(self.references)
         
-        print('Generating epub...')
+        print('Generating epub.............',end='',flush=True)
         epubout = pypandoc.convert_text(combined,format='html',to='epub',
                 extra_args=args,
                 outputfile=output_raw)
-
         subprocess.check_output(['ebook-convert',output_raw,self.output])
+        print('done')
 
 def register_publisher(publisher):
     _publishers.append(publisher)
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index f8a95a1..1bca8ae 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -15,7 +15,7 @@ class Oxford(Publisher):
     def get_abstract(self):
         """Get article abstract"""
         abstract_raw = self.soup.find('section',class_='abstract')
-        self.abstract = '<h2>Abstract<h2>\n'+str(abstract_raw)
+        self.abstract = '<h2>Abstract</h2>\n'+str(abstract_raw)
 
     def get_keywords(self):
         """Get article keywords"""
@@ -29,17 +29,47 @@ class Oxford(Publisher):
         body_raw = copy.copy(self.soup.find(
             'div',{'data-widgetname':'ArticleFulltext'}))
         body_raw.find('h2',class_='abstract-title').decompose()
+        body_raw.find('section',class_='abstract').decompose()
         body_raw.find('div',class_='article-metadata-panel').decompose()
         body_raw.find('div',class_='ref-list').decompose()
         body_raw.find('span',{'id':'UserHasAccess'}).decompose()
         body_raw.find('div',class_='copyright').decompose()
         body_raw.find('h2',class_='backreferences-title').decompose()
+
+        for i in body_raw.find_all('div',class_='fig-modal'):
+            i.decompose()
+
+        for i in body_raw.find_all('div',class_='table-modal'):
+            i.decompose()
+        
+        for i in body_raw.find_all('div',class_='fig-orig'):
+            i.decompose()
+
+        for i in body_raw.find_all('a',class_='fig-view-orig'):
+            i.decompose()
+
+        for i in body_raw.find_all('a',class_='xref-bibr'):
+            new = '#'+i['reveal-id']
+            i['href'] = new
+        
+        for i in body_raw.find_all('a',class_='xref-fig'):
+            new = '#'+i['reveal-id']
+            i['href'] = new
+        
         self.body = body_raw
     
     def get_references(self):
         """Get references list"""
         references_title = self.soup.find('h2',class_='backreferences-title')
         references_raw = self.soup.find('div',class_='ref-list')
-        self.references = str(references_title)+str(references_raw)
+        refs_format = ''
+        for i in references_raw.find_all('div',recursive=False):
+            for j in i.find_all('a'):
+                j.decompose()
+            refs_format += '<div id="'+i['content-id']+'">'
+            refs_format += i.text+'\n</div>'
+
+        self.references = str(references_title)+str(refs_format)
+        self.references = self.references.replace('doi:','')
 
 register_publisher(Oxford)
author	Ken Kellner <ken@kenkellner.com>	2018-04-06 10:34:50 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-06 10:34:50 -0400
commit	a5ae056da7a2b739a7412c854f76f33958c13e4a (patch)
tree	dd2daee3f4b951dee427994671cd32ef47dcd105
parent	59de534956c5dcacdd641afd0c5399c6279445fe (diff)