diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-03 17:34:55 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-03 17:34:55 -0400 |
commit | 68fae4e7cae677845cfb74ac6843e866b487b689 (patch) | |
tree | e588c2c88e392de6800b27ff7c6b865124e0fc5f | |
parent | aad9b78393f0dfc41868a2da2e96fb5fb349893a (diff) |
Reorganize into modules
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | SciArticle.py | 62 | ||||
-rw-r--r-- | article_epub/__init__.py | 3 | ||||
-rw-r--r-- | article_epub/publishers/__init__.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/sciencedirect.py | 65 | ||||
-rw-r--r-- | article_epub/sciarticle.py | 100 | ||||
-rw-r--r-- | sci-scraper.py | 22 | ||||
-rw-r--r-- | sciencedirect.py | 31 |
8 files changed, 194 insertions, 93 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..32b1973 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__* +*.epub +*.pdf diff --git a/SciArticle.py b/SciArticle.py deleted file mode 100644 index 440b72e..0000000 --- a/SciArticle.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/python3 -#https://github.com/mozilla/geckodriver/releases -from selenium import webdriver -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from bs4 import BeautifulSoup -import re -import os -import sys -import pypandoc -from time import sleep - -class SciArticle(object): - - def __init__(self, url, doi=None, out_format='kepub'): - self.url = url - self.output_format = out_format - if out_format not in ['epub','kepub']: - sys.exit('Supported formats are epub and kepub') - if doi != None: - self.doi = doi - - def soupify(self): - """Get HTML from article's page""" - os.environ['MOZ_HEADLESS'] = '1' - binary = FirefoxBinary('/usr/bin/firefox') - try: - driver = webdriver.Firefox(firefox_binary=binary, log_file='/tmp/gecko_log') - except: - sys.exit('Failed to load Firefox; is it installed?') - try: - driver.get(self.init_url) - except: - sys.exit('Failed to load URL') - - sleep(2) #To allow redirects - self.url = driver.current_url - - self.soup = BeautifulSoup(driver.page_source,'html.parser') - driver.quit() - #return(self.soup) - - #def out_filename(self): - # first5 = self.title.split()[:5] - - - def epubify(self): - """Convert data into epub format""" - args = [] - args.append('-M') - args.append('title="'+self.title+'"') - args.append('author="'+author+'"') - args.append('--parse-raw') - - epubout = pypandoc.convert_text(self.body,format='html',to='epub', - extra_args=args, - outputfile=self.output) - - - - - - diff --git a/article_epub/__init__.py b/article_epub/__init__.py new file mode 100644 index 0000000..84afc23 --- /dev/null +++ b/article_epub/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/python3 +#https://github.com/mozilla/geckodriver/releases +import article_epub.publishers diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py new file mode 100644 index 0000000..a51d157 --- /dev/null +++ b/article_epub/publishers/__init__.py @@ -0,0 +1 @@ +from article_epub.publishers.sciencedirect import ScienceDirect diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py new file mode 100644 index 0000000..5ddc3e3 --- /dev/null +++ b/article_epub/publishers/sciencedirect.py @@ -0,0 +1,65 @@ +from article_epub.sciarticle import SciArticle + +class ScienceDirect(SciArticle): + + def get_title(self): + self.title = self.soup.find('span',class_='title-text').text + + def get_authors(self): + author_raw = self.soup.find('div',class_='author-group') \ + .find_all('span',class_='text surname') + self.author_surnames = [] + for i in author_raw: + self.author_surnames.append(i.text) + + author_raw = self.soup.find('div',class_='author-group') \ + .find_all('span',class_='text given-name') + self.author_givennames = [] + for i in author_raw: + self.author_givennames.append(i.text) + + def get_abstract(self): + self.abstract = self.soup.find('div',class_='abstract author') + + def get_keywords(self): + keys_raw = self.soup.find('div',class_='Keywords') \ + .find_all('div',class_='keyword') + self.keywords = [] + for i in keys_raw: + self.keywords.append(i.text) + + def get_metadata(self): + if self.doi == None: + doi_raw = self.soup.find('a',class_='doi').get('href').split('/') + self.doi = doi_raw[3]+'/'+doi_raw[4] + + self.journal = self.soup.find('div',class_='publication-volume') \ + .find('span',class_='size-xl').text + + pubdate_raw = self.soup.find('div',class_='publication-volume') \ + .find('span',class_='size-m').text.split(',') + + self.year = pubdate_raw[1].split(' ')[-1] + self.volume = pubdate_raw[0].split(' ')[1] + self.pages = pubdate_raw[2].split(' ')[2] + + def get_body(self): + body_raw = str(self.soup.find('div',class_='Body')) + self.body = body_raw.replace('#bib','#ref-id-bib') + + def get_references(self): + self.references = self.soup.find('section',class_='bibliography') + + + + + + + + + + + + + + diff --git a/article_epub/sciarticle.py b/article_epub/sciarticle.py new file mode 100644 index 0000000..93aaa09 --- /dev/null +++ b/article_epub/sciarticle.py @@ -0,0 +1,100 @@ +from selenium import webdriver +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from bs4 import BeautifulSoup +import os +import sys +import pypandoc +from time import sleep + +class SciArticle(object): + + def __init__(self, url, doi=None, out_format='epub'): + self.url = url + self.doi = doi + self.output_format = out_format + if out_format not in ['epub','kepub']: + sys.exit('Supported formats are epub and kepub') + if doi != None: + self.doi = doi + + def soupify(self): + """Get HTML from article's page""" + os.environ['MOZ_HEADLESS'] = '1' + binary = FirefoxBinary('/usr/bin/firefox') + try: + driver = webdriver.Firefox(firefox_binary=binary, + log_path='/tmp/gecko_log') + except: + sys.exit('Failed to load Firefox; is it installed?') + try: + driver.get(self.url) + except: + sys.exit('Failed to load URL') + + sleep(2) #To allow redirects + self.url = driver.current_url + + self.soup = BeautifulSoup(driver.page_source,'html.parser') + driver.quit() + + def get_citation(self): + + all_authors = '' + for i in range(0,len(self.author_surnames)): + all_authors += self.author_surnames[i] + ', ' + all_authors += self.author_givennames[i] + if(i != (len(self.author_surnames) - 1)): + all_authors += '; ' + if all_authors[-1] == '.': + cap = ' ' + else: + cap = '. ' + + self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \ + +self.journal+' '+self.volume+': '+self.pages+'.' \ + +' doi: '+self.doi + + def extract_data(self): + self.get_title() + self.get_authors() + self.get_abstract() + self.get_keywords() + self.get_metadata() + self.get_body() + self.get_references() + self.get_citation() + + def epubify(self): + """Convert data into epub format""" + + all_authors = '' + for i in range(0,len(self.author_surnames)): + all_authors += self.author_givennames[i] + ' ' + all_authors += self.author_surnames[i] + if(i != (len(self.author_surnames) - 1)): + all_authors += ', ' + + args = [] + args.append('-M') + args.append('title="'+self.title+'"') + args.append('-M') + args.append('author="'+all_authors+'"') + args.append('--parse-raw') + + self.output = self.author_surnames[0]+self.year+'.epub' + + combined = '' + combined += str(self.citation) + combined += str(self.abstract) + combined += str(self.body) + combined += str(self.references) + + epubout = pypandoc.convert_text(combined,format='html',to='epub', + extra_args=args, + outputfile=self.output) + + + + + + diff --git a/sci-scraper.py b/sci-scraper.py new file mode 100644 index 0000000..8bfa1c1 --- /dev/null +++ b/sci-scraper.py @@ -0,0 +1,22 @@ +#!/usr/bin/python3 + +from article_epub.publishers import ScienceDirect + + +test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X') + +test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763') + +test.soupify() +test.extract_data() +test.epubify() + +##### + +import urllib.request + + +def final_url(url=None,doi=None): + if url !=None: + response = requests.get(url) + diff --git a/sciencedirect.py b/sciencedirect.py deleted file mode 100644 index 36d13e0..0000000 --- a/sciencedirect.py +++ /dev/null @@ -1,31 +0,0 @@ -import sci-scraper-new - -class ScienceDirect(SciArticle): - - def get_title(self): - self.title = self.soup.find('span',class_='title-text').text - - - - - - - - -test.title = test.soup.find('span',class_='title-text').text - -author_raw = test.soup.find('div',class_='author-group') \ - .find_all('span',class_='content') -author_list = [] - -if len(author_raw) == 1: - test.authors = author_raw[0].text -else: - for i in author_raw: - author_list.append(i.text) - - test.author_list = author_list - - #test.authors = ", ".join(author_list) - - |