aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-03-29 15:32:44 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-03-29 15:32:44 -0400
commit aad9b78393f0dfc41868a2da2e96fb5fb349893a (patch)
tree 818f44bb9c71ce95d9bc9e1be8d3954eba86a786
Initial commit
-rw-r--r-- SciArticle.py 62
-rw-r--r-- sciencedirect.py 31
2 files changed, 93 insertions, 0 deletions
diff --git a/SciArticle.py b/SciArticle.py
new file mode 100644
index 0000000..440b72e
--- /dev/null
+++ b/SciArticle.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+#https://github.com/mozilla/geckodriver/releases
+from selenium import webdriver
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from bs4 import BeautifulSoup
+import re
+import os
+import sys
+import pypandoc
+from time import sleep
+
+class SciArticle(object):
+    """An article page fetched with headless Firefox and exported to epub.
+
+    ``epubify`` reads ``title``, ``authors``, ``body`` and ``output``; this
+    class never assigns them, so they must be set before it is called.
+    """
+
+    def __init__(self, url, doi=None, out_format='kepub'):
+        """Store the article URL, optional DOI and output format.
+
+        Exits via ``sys.exit`` if ``out_format`` is not 'epub' or 'kepub'.
+        """
+        if out_format not in ('epub', 'kepub'):
+            sys.exit('Supported formats are epub and kepub')
+        self.init_url = url  # address requested by soupify()
+        self.url = url  # replaced by the post-redirect address in soupify()
+        self.output_format = out_format
+        self.doi = doi  # always defined, None when unknown
+
+    def soupify(self):
+        """Load the page in headless Firefox and parse it into ``self.soup``.
+
+        Also updates ``self.url`` to the browser's final URL. Exits via
+        ``sys.exit`` if Firefox cannot be started or the URL fails to load.
+        """
+        os.environ['MOZ_HEADLESS'] = '1'
+        binary = FirefoxBinary('/usr/bin/firefox')
+        try:
+            driver = webdriver.Firefox(firefox_binary=binary,
+                                       log_file='/tmp/gecko_log')
+        except Exception:
+            sys.exit('Failed to load Firefox; is it installed?')
+        try:
+            try:
+                driver.get(self.init_url)
+            except Exception:
+                sys.exit('Failed to load URL')
+            sleep(2)  # To allow redirects
+            self.url = driver.current_url
+            self.soup = BeautifulSoup(driver.page_source, 'html.parser')
+        finally:
+            driver.quit()  # never leave a headless browser running
+
+    def epubify(self):
+        """Convert ``self.body`` (HTML) to an epub file at ``self.output``."""
+        # Each metadata field needs its own -M flag.
+        args = ['-M', 'title="' + self.title + '"',
+                '-M', 'author="' + self.authors + '"',
+                '--parse-raw']  # NOTE(review): dropped in pandoc 2.0 - confirm
+        pypandoc.convert_text(self.body, format='html', to='epub',
+                              extra_args=args, outputfile=self.output)
diff --git a/sciencedirect.py b/sciencedirect.py
new file mode 100644
index 0000000..36d13e0
--- /dev/null
+++ b/sciencedirect.py
@@ -0,0 +1,31 @@
+"""ScienceDirect-specific scraping built on SciArticle."""
+
+from SciArticle import SciArticle
+
+
+class ScienceDirect(SciArticle):
+    """SciArticle whose metadata is scraped from a ScienceDirect page.
+
+    Call ``soupify()`` first: both methods below read ``self.soup``.
+    """
+
+    def get_title(self):
+        """Set ``self.title`` from the page's 'title-text' span.
+
+        Raises AttributeError if the page has no such span.
+        """
+        self.title = self.soup.find('span', class_='title-text').text
+
+    def get_authors(self):
+        """Set ``self.author_list`` and ``self.authors`` from the page.
+
+        Names are the text of each 'content' span inside the 'author-group'
+        div; ``authors`` is the same names joined with ', '.
+        Raises AttributeError if the page has no 'author-group' div.
+        """
+        group = self.soup.find('div', class_='author-group')
+        spans = group.find_all('span', class_='content')
+        self.author_list = [span.text for span in spans]
+        self.authors = ', '.join(self.author_list)
+
+