author     Ken Kellner <ken@kenkellner.com>   2018-04-06 10:34:50 -0400
committer  Ken Kellner <ken@kenkellner.com>   2018-04-06 10:34:50 -0400
commit     a5ae056da7a2b739a7412c854f76f33958c13e4a (patch)
tree       dd2daee3f4b951dee427994671cd32ef47dcd105
parent     59de534956c5dcacdd641afd0c5399c6279445fe (diff)
Clean up Oxford recipe and improve program output formatting
-rwxr-xr-x  article-epub.py                     5
-rw-r--r--  article_epub/publisher.py          16
-rw-r--r--  article_epub/publishers/oxford.py  34
3 files changed, 45 insertions(+), 10 deletions(-)
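The output-formatting half of the commit switches the status messages below to a same-line "label.......done" pattern: each label is padded with dots to 28 columns and printed with end='' and flush=True, and 'done' is printed on the same line once the slow step finishes. A minimal standalone sketch of that pattern (the step() helper, its width argument, and the sleep() stand-in are illustrative, not part of the repository):

from time import sleep

def step(label, width=28):
    # Pad the label with dots and suppress the newline so that 'done'
    # lands on the same line once the slow work completes; flush so the
    # label is visible before the work starts.
    print(label.ljust(width, '.'), end='', flush=True)

step('Loading page')
sleep(2)            # stand-in for the slow operation
print('done')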
diff --git a/article-epub.py b/article-epub.py
index 3b1ef25..078c793 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -5,9 +5,10 @@ import requests
 
 def main():
     if sys.argv[1] == '-d':
-        print("Getting URL from DOI...")
+        print("Getting URL from DOI........",end='',flush=True)
         url = requests.get('https://doi.org/'+sys.argv[2]).url
         doi = sys.argv[2]
+        print('done')
     else:
         url = sys.argv[1]
         doi = None
@@ -23,6 +24,8 @@ def main():
     art.soupify()
     art.extract_data()
     art.epubify()
+    print('\nCitation: '+art.citation)
+    print('Filename: '+art.output)
 
 
 main()
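For context, the -d branch above turns a DOI into the publisher's article URL simply by letting requests follow the doi.org redirect chain and reading the final address off the response. A standalone sketch of that resolution step (the DOI string below is only a placeholder):

import requests

def resolve_doi(doi):
    # https://doi.org/<doi> answers with redirects to the publisher's site;
    # requests follows them and .url holds the final landing-page address.
    return requests.get('https://doi.org/' + doi).url

print(resolve_doi('10.1000/xyz123'))  # placeholder DOI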
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 7465f7f..eeb0c67 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -25,25 +25,26 @@ class Publisher(object):
         """Get HTML from article's page"""
         self.get_final_url()
         os.environ['MOZ_HEADLESS'] = '1'
-        print('Starting headless browser...')
+        print('Starting headless browser...',end='',flush=True)
         binary = FirefoxBinary('/usr/bin/firefox')
         try:
             driver = webdriver.Firefox(firefox_binary=binary,
                                        log_path='/tmp/gecko_log')
+            print('done')
         except:
             sys.exit('Failed to load Firefox; is it installed?')
-        print('Loading page...')
+        print('Loading page................',end="",flush=True)
         try:
             driver.get(self.url)
         except:
             sys.exit('Failed to load URL')
         if self.doi != None:
-            print('Waiting for redirects..')
             sleep(5) #To allow redirects
-
+
         sleep(5)
+        print('done')
         self.url = driver.current_url
         self.soup = BeautifulSoup(driver.page_source,'html.parser')
@@ -106,13 +107,14 @@ class Publisher(object):
                 +self.journal+'. '+' doi: '+self.doi
 
     def extract_data(self):
-        print('Extracting data from HTML...')
+        print('Extracting data from HTML...',end='',flush=True)
         self.get_doi()
         self.get_metadata()
         self.get_abstract()
         self.get_keywords()
         self.get_body()
         self.get_references()
+        print('done')
 
     def epubify(self):
         """Convert data into epub format"""
@@ -142,12 +144,12 @@ class Publisher(object):
         combined += str(self.body)
         combined += str(self.references)
 
-        print('Generating epub...')
+        print('Generating epub.............',end='',flush=True)
         epubout = pypandoc.convert_text(combined,format='html',to='epub',
                                         extra_args=args,
                                         outputfile=output_raw)
-
         subprocess.check_output(['ebook-convert',output_raw,self.output])
+        print('done')
 
 def register_publisher(publisher):
     _publishers.append(publisher)
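The epubify() hunk above only changes its progress messages, but it also shows the two-stage conversion the script relies on: pandoc (via pypandoc) renders the assembled HTML into a rough epub, and calibre's ebook-convert then rewrites it into the final file. A hedged sketch of that pipeline in isolation, assuming pandoc and calibre are installed; the file paths and the sample HTML are placeholders, not values from the project:

import subprocess
import pypandoc

def html_to_epub(html, raw_path='raw.epub', final_path='article.epub'):
    # Stage 1: pandoc converts the HTML string into an intermediate epub.
    pypandoc.convert_text(html, to='epub', format='html', outputfile=raw_path)
    # Stage 2: calibre's ebook-convert cleans it up into the final epub.
    subprocess.check_output(['ebook-convert', raw_path, final_path])
    return final_path

html_to_epub('<h1>Example</h1><p>Body text.</p>')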
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index f8a95a1..1bca8ae 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -15,7 +15,7 @@ class Oxford(Publisher):
     def get_abstract(self):
         """Get article abstract"""
         abstract_raw = self.soup.find('section',class_='abstract')
-        self.abstract = '<h2>Abstract<h2>\n'+str(abstract_raw)
+        self.abstract = '<h2>Abstract</h2>\n'+str(abstract_raw)
 
     def get_keywords(self):
         """Get article keywords"""
@@ -29,17 +29,47 @@ class Oxford(Publisher):
         body_raw = copy.copy(self.soup.find(
             'div',{'data-widgetname':'ArticleFulltext'}))
         body_raw.find('h2',class_='abstract-title').decompose()
+        body_raw.find('section',class_='abstract').decompose()
         body_raw.find('div',class_='article-metadata-panel').decompose()
         body_raw.find('div',class_='ref-list').decompose()
         body_raw.find('span',{'id':'UserHasAccess'}).decompose()
         body_raw.find('div',class_='copyright').decompose()
         body_raw.find('h2',class_='backreferences-title').decompose()
+
+        for i in body_raw.find_all('div',class_='fig-modal'):
+            i.decompose()
+
+        for i in body_raw.find_all('div',class_='table-modal'):
+            i.decompose()
+
+        for i in body_raw.find_all('div',class_='fig-orig'):
+            i.decompose()
+
+        for i in body_raw.find_all('a',class_='fig-view-orig'):
+            i.decompose()
+
+        for i in body_raw.find_all('a',class_='xref-bibr'):
+            new = '#'+i['reveal-id']
+            i['href'] = new
+
+        for i in body_raw.find_all('a',class_='xref-fig'):
+            new = '#'+i['reveal-id']
+            i['href'] = new
+
         self.body = body_raw
 
     def get_references(self):
         """Get references list"""
         references_title = self.soup.find('h2',class_='backreferences-title')
         references_raw = self.soup.find('div',class_='ref-list')
-        self.references = str(references_title)+str(references_raw)
+        refs_format = ''
+        for i in references_raw.find_all('div',recursive=False):
+            for j in i.find_all('a'):
+                j.decompose()
+            refs_format += '<div id="'+i['content-id']+'">'
+            refs_format += i.text+'\n</div>'
+
+        self.references = str(references_title)+str(refs_format)
+        self.references = self.references.replace('doi:','')
 
 register_publisher(Oxford)
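The new get_references() and the xref loops in get_body() together make the in-text citations work inside the epub: each inline citation anchor gets href="#<reveal-id>", and each reference entry is re-emitted as a div whose id is its content-id, so the links resolve internally instead of pointing back at the publisher's site. A small self-contained sketch of that rewrite (the class and attribute names xref-bibr, reveal-id, content-id, and ref-list come from the diff; the sample HTML is invented for illustration):

from bs4 import BeautifulSoup

sample = '''
<p>As shown earlier (<a class="xref-bibr" reveal-id="ref-1">Smith 2017</a>).</p>
<div class="ref-list">
  <div content-id="ref-1">Smith, J. 2017. An example title.
    <a href="https://doi.org/10.1000/xyz123">doi:10.1000/xyz123</a></div>
</div>
'''

soup = BeautifulSoup(sample, 'html.parser')

# Point each inline citation at an internal anchor named after its reveal-id.
for a in soup.find_all('a', class_='xref-bibr'):
    a['href'] = '#' + a['reveal-id']

# Re-emit each reference as a div whose id matches that anchor, dropping the
# publisher's own links first so only the plain citation text remains.
refs = ''
for entry in soup.find('div', class_='ref-list').find_all('div', recursive=False):
    for link in entry.find_all('a'):
        link.decompose()
    refs += '<div id="' + entry['content-id'] + '">' + entry.text + '\n</div>'

print(refs.replace('doi:', ''))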