diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-06 10:34:50 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-06 10:34:50 -0400 |
commit | a5ae056da7a2b739a7412c854f76f33958c13e4a (patch) | |
tree | dd2daee3f4b951dee427994671cd32ef47dcd105 | |
parent | 59de534956c5dcacdd641afd0c5399c6279445fe (diff) |
Clean up Oxford recipe and improve program output formatting
-rwxr-xr-x | article-epub.py | 5 |
-rw-r--r-- | article_epub/publisher.py | 16 |
-rw-r--r-- | article_epub/publishers/oxford.py | 34 |
3 files changed, 45 insertions, 10 deletions
diff --git a/article-epub.py b/article-epub.py index 3b1ef25..078c793 100755 --- a/article-epub.py +++ b/article-epub.py @@ -5,9 +5,10 @@ import requests def main(): if sys.argv[1] == '-d': - print("Getting URL from DOI...") + print("Getting URL from DOI........",end='',flush=True) url = requests.get('https://doi.org/'+sys.argv[2]).url doi = sys.argv[2] + print('done') else: url = sys.argv[1] doi = None @@ -23,6 +24,8 @@ def main(): art.soupify() art.extract_data() art.epubify() + print('\nCitation: '+art.citation) + print('Filename: '+art.output) main() diff --git a/article_epub/publisher.py b/article_epub/publisher.py index 7465f7f..eeb0c67 100644 --- a/article_epub/publisher.py +++ b/article_epub/publisher.py @@ -25,25 +25,26 @@ class Publisher(object): """Get HTML from article's page""" self.get_final_url() os.environ['MOZ_HEADLESS'] = '1' - print('Starting headless browser...') + print('Starting headless browser...',end='',flush=True) binary = FirefoxBinary('/usr/bin/firefox') try: driver = webdriver.Firefox(firefox_binary=binary, log_path='/tmp/gecko_log') + print('done') except: sys.exit('Failed to load Firefox; is it installed?') - print('Loading page...') + print('Loading page................',end="",flush=True) try: driver.get(self.url) except: sys.exit('Failed to load URL') if self.doi != None: - print('Waiting for redirects..') sleep(5) #To allow redirects - + sleep(5) + print('done') self.url = driver.current_url self.soup = BeautifulSoup(driver.page_source,'html.parser') @@ -106,13 +107,14 @@ class Publisher(object): +self.journal+'. 
'+' doi: '+self.doi def extract_data(self): - print('Extracting data from HTML...') + print('Extracting data from HTML...',end='',flush=True) self.get_doi() self.get_metadata() self.get_abstract() self.get_keywords() self.get_body() self.get_references() + print('done') def epubify(self): """Convert data into epub format""" @@ -142,12 +144,12 @@ class Publisher(object): combined += str(self.body) combined += str(self.references) - print('Generating epub...') + print('Generating epub.............',end='',flush=True) epubout = pypandoc.convert_text(combined,format='html',to='epub', extra_args=args, outputfile=output_raw) - subprocess.check_output(['ebook-convert',output_raw,self.output]) + print('done') def register_publisher(publisher): _publishers.append(publisher) diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py index f8a95a1..1bca8ae 100644 --- a/article_epub/publishers/oxford.py +++ b/article_epub/publishers/oxford.py @@ -15,7 +15,7 @@ class Oxford(Publisher): def get_abstract(self): """Get article abstract""" abstract_raw = self.soup.find('section',class_='abstract') - self.abstract = '<h2>Abstract<h2>\n'+str(abstract_raw) + self.abstract = '<h2>Abstract</h2>\n'+str(abstract_raw) def get_keywords(self): """Get article keywords""" @@ -29,17 +29,47 @@ class Oxford(Publisher): body_raw = copy.copy(self.soup.find( 'div',{'data-widgetname':'ArticleFulltext'})) body_raw.find('h2',class_='abstract-title').decompose() + body_raw.find('section',class_='abstract').decompose() body_raw.find('div',class_='article-metadata-panel').decompose() body_raw.find('div',class_='ref-list').decompose() body_raw.find('span',{'id':'UserHasAccess'}).decompose() body_raw.find('div',class_='copyright').decompose() body_raw.find('h2',class_='backreferences-title').decompose() + + for i in body_raw.find_all('div',class_='fig-modal'): + i.decompose() + + for i in body_raw.find_all('div',class_='table-modal'): + i.decompose() + + for i in 
body_raw.find_all('div',class_='fig-orig'): + i.decompose() + + for i in body_raw.find_all('a',class_='fig-view-orig'): + i.decompose() + + for i in body_raw.find_all('a',class_='xref-bibr'): + new = '#'+i['reveal-id'] + i['href'] = new + + for i in body_raw.find_all('a',class_='xref-fig'): + new = '#'+i['reveal-id'] + i['href'] = new + self.body = body_raw def get_references(self): """Get references list""" references_title = self.soup.find('h2',class_='backreferences-title') references_raw = self.soup.find('div',class_='ref-list') - self.references = str(references_title)+str(references_raw) + refs_format = '' + for i in references_raw.find_all('div',recursive=False): + for j in i.find_all('a'): + j.decompose() + refs_format += '<div id="'+i['content-id']+'">' + refs_format += i.text+'\n</div>' + + self.references = str(references_title)+str(refs_format) + self.references = self.references.replace('doi:','') register_publisher(Oxford) |