diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-03 21:25:54 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-03 21:25:54 -0400 |
commit | 9366bac2de4c94fb01c7c67e191e55bd39b78aba (patch) | |
tree | ca4a84a1f82487b8858e31f6c90b3fd617943b5a | |
parent | 68fae4e7cae677845cfb74ac6843e866b487b689 (diff) |
Improve epub quality and start to add main function framework
-rw-r--r-- | article_epub/publishers/sciencedirect.py | 2 | ||||
-rw-r--r-- | article_epub/sciarticle.py | 14 | ||||
-rwxr-xr-x[-rw-r--r--] | sci-scraper.py | 37 |
3 files changed, 39 insertions, 14 deletions
```diff
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 5ddc3e3..218cb98 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -45,7 +45,7 @@ class ScienceDirect(SciArticle):
 
     def get_body(self):
         body_raw = str(self.soup.find('div',class_='Body'))
-        self.body = body_raw.replace('#bib','#ref-id-bib')
+        self.body = body_raw.replace('#b','#ref-id-b')
 
     def get_references(self):
         self.references = self.soup.find('section',class_='bibliography')
diff --git a/article_epub/sciarticle.py b/article_epub/sciarticle.py
index 93aaa09..cd828f0 100644
--- a/article_epub/sciarticle.py
+++ b/article_epub/sciarticle.py
@@ -5,6 +5,7 @@ import os
 import sys
 import pypandoc
 from time import sleep
+import subprocess
 
 class SciArticle(object):
 
@@ -31,7 +32,10 @@ class SciArticle(object):
         except:
             sys.exit('Failed to load URL')
 
-        sleep(2) #To allow redirects
+        if self.doi != None:
+            print('Waiting for redirects..')
+            sleep(5) #To allow redirects
+
         self.url = driver.current_url
 
         self.soup = BeautifulSoup(driver.page_source,'html.parser')
@@ -82,16 +86,20 @@ class SciArticle(object):
             args.append('--parse-raw')
 
         self.output = self.author_surnames[0]+self.year+'.epub'
+        output_raw = '/tmp/raw.epub'
 
         combined = ''
         combined += str(self.citation)
         combined += str(self.abstract)
         combined += str(self.body)
         combined += str(self.references)
-
+
+        print('Generating epub...')
         epubout = pypandoc.convert_text(combined,format='html',to='epub',
             extra_args=args,
-            outputfile=self.output)
+            outputfile=output_raw)
+
+        subprocess.check_output(['ebook-convert',output_raw,self.output])
 
 
 
diff --git a/sci-scraper.py b/sci-scraper.py
index 8bfa1c1..6bb5861 100644..100755
--- a/sci-scraper.py
+++ b/sci-scraper.py
@@ -1,22 +1,39 @@
 #!/usr/bin/python3
-
 from article_epub.publishers import ScienceDirect
+import sys
+import requests
+
+def main():
+    if sys.argv[1] == '-d':
+        url = requests.get('https://doi.org/'+sys.argv[2]).url
+        art = ScienceDirect(url=url,doi=sys.argv[2])
+    else:
+        url = sys.argv[1]
+        art = ScienceDirect(url=url)
+    print('Downloading content...')
+    art.soupify()
+    art.extract_data()
+    art.epubify()
 
-test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
+main()
+#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
 
-test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
+#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
 
-test.soupify()
-test.extract_data()
-test.epubify()
+#test.soupify()
+#test.extract_data()
+#test.epubify()
 
 
 
 
 #####
 
-import urllib.request
+#import urllib.request
 
-def final_url(url=None,doi=None):
-    if url !=None:
-        response = requests.get(url)
+#def final_url(url=None,doi=None):
+#    if url !=None:
+#        response = requests.get(url)
+#    elif doi !=None:
+#        response = request.get('https://doi.org/'+doi)
+
```