diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-03-29 15:32:44 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-03-29 15:32:44 -0400 |
commit | aad9b78393f0dfc41868a2da2e96fb5fb349893a (patch) | |
tree | 818f44bb9c71ce95d9bc9e1be8d3954eba86a786 |
Initial commit
-rw-r--r-- | SciArticle.py | 62 | ||||
-rw-r--r-- | sciencedirect.py | 31 |
2 files changed, 93 insertions, 0 deletions
# ---- File: SciArticle.py (new file) ----
#!/usr/bin/python3
#https://github.com/mozilla/geckodriver/releases
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from bs4 import BeautifulSoup
import re
import os
import sys
import pypandoc
from time import sleep


class SciArticle(object):
    """Base class for fetching an article page and converting it to an ebook.

    This class only loads the page (``soupify``) and runs pandoc
    (``epubify``).  Subclasses are expected to fill in ``self.title``,
    ``self.authors`` and ``self.body`` from ``self.soup`` before
    ``epubify`` is called.
    """

    def __init__(self, url, doi=None, out_format='kepub'):
        """Store the article location and validate the output format.

        Args:
            url: Address of the article page; it may redirect.
            doi: Optional DOI of the article.
            out_format: Either ``'epub'`` or ``'kepub'``.

        Exits the interpreter (``sys.exit``) on an unsupported format.
        """
        # ``init_url`` is the address as given; ``url`` is overwritten by
        # soupify() with the address reached after redirects.
        self.init_url = url
        self.url = url
        self.output_format = out_format
        if out_format not in ['epub', 'kepub']:
            sys.exit('Supported formats are epub and kepub')
        # Always define the attribute so later reads cannot raise
        # AttributeError when no DOI was supplied.
        self.doi = doi
        # Output path for epubify(); None means "derive it from the title".
        self.output = None

    def soupify(self):
        """Get HTML from article's page.

        Loads ``self.init_url`` in headless Firefox, waits briefly, then
        stores the final URL in ``self.url`` and the parsed page in
        ``self.soup``.  Exits the interpreter if Firefox cannot be started
        or the page cannot be loaded.
        """
        os.environ['MOZ_HEADLESS'] = '1'
        binary = FirefoxBinary('/usr/bin/firefox')
        try:
            driver = webdriver.Firefox(firefox_binary=binary,
                                       log_file='/tmp/gecko_log')
        except Exception:
            sys.exit('Failed to load Firefox; is it installed?')

        # From here on the browser process exists, so make sure it is shut
        # down on every path, including the sys.exit below.
        try:
            try:
                driver.get(self.init_url)
            except Exception:
                sys.exit('Failed to load URL')

            sleep(2)  # To allow redirects
            self.url = driver.current_url
            self.soup = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            driver.quit()

    def out_filename(self):
        """Return a default output filename built from the title.

        Uses the first five whitespace-separated words of ``self.title``,
        joined with underscores and stripped of characters other than word
        characters and hyphens, followed by ``.epub``.
        """
        first5 = self.title.split()[:5]
        stem = re.sub(r'[^\w\-]', '', '_'.join(first5))
        # Fall back to a fixed name if the title had no usable characters.
        return (stem or 'article') + '.epub'

    def epubify(self):
        """Convert data into epub format.

        Runs pandoc on ``self.body`` (HTML) with ``self.title`` and
        ``self.authors`` as metadata, writing to ``self.output``.  When no
        output path has been set, one is derived with ``out_filename``.
        """
        if getattr(self, 'output', None) is None:
            self.output = self.out_filename()

        # Each metadata value needs its own -M flag; a bare KEY=VAL would be
        # handed to pandoc as a positional (input file) argument.
        # NOTE(review): --parse-raw may not exist in newer pandoc releases;
        # confirm against the installed version.
        args = [
            '-M', 'title="' + self.title + '"',
            '-M', 'author="' + self.authors + '"',
            '--parse-raw',
        ]

        pypandoc.convert_text(self.body, format='html', to='epub',
                              extra_args=args,
                              outputfile=self.output)


# ---- File: sciencedirect.py (new file) ----
from SciArticle import SciArticle


class ScienceDirect(SciArticle):
    """Article scraper for pages using ScienceDirect's markup."""

    def get_title(self):
        """Read the article title from ``self.soup`` into ``self.title``.

        Raises AttributeError if the page has no ``span.title-text``
        element, since ``find`` returns None in that case.
        """
        self.title = self.soup.find('span', class_='title-text').text

    def get_authors(self):
        """Read the author names from ``self.soup``.

        Sets ``self.author_list`` to the list of names and ``self.authors``
        to the same names joined with ``", "`` (for a single author this is
        just that author's name).

        Raises AttributeError if the page has no ``div.author-group``
        element, since ``find`` returns None in that case.
        """
        author_raw = self.soup.find('div', class_='author-group') \
                              .find_all('span', class_='content')
        self.author_list = [i.text for i in author_raw]
        self.authors = ", ".join(self.author_list)