about summary refs log tree commit diff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-04-03 21:25:54 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-04-03 21:25:54 -0400
commit 9366bac2de4c94fb01c7c67e191e55bd39b78aba (patch)
tree ca4a84a1f82487b8858e31f6c90b3fd617943b5a
parent 68fae4e7cae677845cfb74ac6843e866b487b689 (diff)
Improve epub quality and start to add main function framework
-rw-r--r-- article_epub/publishers/sciencedirect.py 2
-rw-r--r-- article_epub/sciarticle.py 14
-rwxr-xr-x [-rw-r--r--] sci-scraper.py 37
3 files changed, 39 insertions, 14 deletions
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 5ddc3e3..218cb98 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -45,7 +45,7 @@ class ScienceDirect(SciArticle):
def get_body(self):
body_raw = str(self.soup.find('div',class_='Body'))
- self.body = body_raw.replace('#bib','#ref-id-bib')
+ self.body = body_raw.replace('#b','#ref-id-b')
def get_references(self):
self.references = self.soup.find('section',class_='bibliography')
diff --git a/article_epub/sciarticle.py b/article_epub/sciarticle.py
index 93aaa09..cd828f0 100644
--- a/article_epub/sciarticle.py
+++ b/article_epub/sciarticle.py
@@ -5,6 +5,7 @@ import os
import sys
import pypandoc
from time import sleep
+import subprocess
class SciArticle(object):
@@ -31,7 +32,10 @@ class SciArticle(object):
except:
sys.exit('Failed to load URL')
- sleep(2) #To allow redirects
+ if self.doi != None:
+ print('Waiting for redirects..')
+ sleep(5) #To allow redirects
+
self.url = driver.current_url
self.soup = BeautifulSoup(driver.page_source,'html.parser')
@@ -82,16 +86,20 @@ class SciArticle(object):
args.append('--parse-raw')
self.output = self.author_surnames[0]+self.year+'.epub'
+ output_raw = '/tmp/raw.epub'
combined = ''
combined += str(self.citation)
combined += str(self.abstract)
combined += str(self.body)
combined += str(self.references)
-
+
+ print('Generating epub...')
epubout = pypandoc.convert_text(combined,format='html',to='epub',
extra_args=args,
- outputfile=self.output)
+ outputfile=output_raw)
+
+ subprocess.check_output(['ebook-convert',output_raw,self.output])
diff --git a/sci-scraper.py b/sci-scraper.py
index 8bfa1c1..6bb5861 100644..100755
--- a/sci-scraper.py
+++ b/sci-scraper.py
@@ -1,22 +1,39 @@
#!/usr/bin/python3
-
from article_epub.publishers import ScienceDirect
+import sys
+import requests
+
+def main():
+ if sys.argv[1] == '-d':
+ url = requests.get('https://doi.org/'+sys.argv[2]).url
+ art = ScienceDirect(url=url,doi=sys.argv[2])
+ else:
+ url = sys.argv[1]
+ art = ScienceDirect(url=url)
+ print('Downloading content...')
+ art.soupify()
+ art.extract_data()
+ art.epubify()
-test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
+main()
+#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
-test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
+#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
-test.soupify()
-test.extract_data()
-test.epubify()
+#test.soupify()
+#test.extract_data()
+#test.epubify()
#####
-import urllib.request
+#import urllib.request
-def final_url(url=None,doi=None):
- if url !=None:
- response = requests.get(url)
+#def final_url(url=None,doi=None):
+# if url !=None:
+# response = requests.get(url)
+# elif doi !=None:
+# response = request.get('https://doi.org/'+doi)
+