author    Ken Kellner <ken@kenkellner.com>    2018-04-03 17:34:55 -0400
committer Ken Kellner <ken@kenkellner.com>    2018-04-03 17:34:55 -0400
commit    68fae4e7cae677845cfb74ac6843e866b487b689 (patch)
tree      e588c2c88e392de6800b27ff7c6b865124e0fc5f
parent    aad9b78393f0dfc41868a2da2e96fb5fb349893a (diff)
Reorganize into modules
-rw-r--r--  .gitignore                                   3
-rw-r--r--  SciArticle.py                               62
-rw-r--r--  article_epub/__init__.py                     3
-rw-r--r--  article_epub/publishers/__init__.py          1
-rw-r--r--  article_epub/publishers/sciencedirect.py    65
-rw-r--r--  article_epub/sciarticle.py                 100
-rw-r--r--  sci-scraper.py                              22
-rw-r--r--  sciencedirect.py                            31
8 files changed, 194 insertions, 93 deletions
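
The commit replaces the single-file SciArticle.py and sciencedirect.py scripts with an article_epub package: article_epub/sciarticle.py holds the generic SciArticle base class, and article_epub/publishers/ holds site-specific subclasses (currently only ScienceDirect). The intended calling sequence, mirroring the new sci-scraper.py further down (the URL is one of the test URLs from that file; Firefox with geckodriver, beautifulsoup4 and pypandoc are assumed to be installed):

    #!/usr/bin/python3
    from article_epub.publishers import ScienceDirect

    article = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
    article.soupify()       # render the page in headless Firefox and parse it with BeautifulSoup
    article.extract_data()  # title, authors, abstract, keywords, metadata, body, references
    article.epubify()       # write <Surname><Year>.epub via pypandoc
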
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..32b1973
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__*
+*.epub
+*.pdf
diff --git a/SciArticle.py b/SciArticle.py
deleted file mode 100644
index 440b72e..0000000
--- a/SciArticle.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/python3
-#https://github.com/mozilla/geckodriver/releases
-from selenium import webdriver
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from bs4 import BeautifulSoup
-import re
-import os
-import sys
-import pypandoc
-from time import sleep
-
-class SciArticle(object):
-
- def __init__(self, url, doi=None, out_format='kepub'):
- self.url = url
- self.output_format = out_format
- if out_format not in ['epub','kepub']:
- sys.exit('Supported formats are epub and kepub')
- if doi != None:
- self.doi = doi
-
- def soupify(self):
- """Get HTML from article's page"""
- os.environ['MOZ_HEADLESS'] = '1'
- binary = FirefoxBinary('/usr/bin/firefox')
- try:
- driver = webdriver.Firefox(firefox_binary=binary, log_file='/tmp/gecko_log')
- except:
- sys.exit('Failed to load Firefox; is it installed?')
- try:
- driver.get(self.init_url)
- except:
- sys.exit('Failed to load URL')
-
- sleep(2) #To allow redirects
- self.url = driver.current_url
-
- self.soup = BeautifulSoup(driver.page_source,'html.parser')
- driver.quit()
- #return(self.soup)
-
- #def out_filename(self):
- # first5 = self.title.split()[:5]
-
-
- def epubify(self):
- """Convert data into epub format"""
- args = []
- args.append('-M')
- args.append('title="'+self.title+'"')
- args.append('author="'+author+'"')
- args.append('--parse-raw')
-
- epubout = pypandoc.convert_text(self.body,format='html',to='epub',
- extra_args=args,
- outputfile=self.output)
-
-
-
-
-
-
diff --git a/article_epub/__init__.py b/article_epub/__init__.py
new file mode 100644
index 0000000..84afc23
--- /dev/null
+++ b/article_epub/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/python3
+#https://github.com/mozilla/geckodriver/releases
+import article_epub.publishers
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
new file mode 100644
index 0000000..a51d157
--- /dev/null
+++ b/article_epub/publishers/__init__.py
@@ -0,0 +1 @@
+from article_epub.publishers.sciencedirect import ScienceDirect
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
new file mode 100644
index 0000000..5ddc3e3
--- /dev/null
+++ b/article_epub/publishers/sciencedirect.py
@@ -0,0 +1,65 @@
+from article_epub.sciarticle import SciArticle
+
+class ScienceDirect(SciArticle):
+
+ def get_title(self):
+ self.title = self.soup.find('span',class_='title-text').text
+
+ def get_authors(self):
+ author_raw = self.soup.find('div',class_='author-group') \
+ .find_all('span',class_='text surname')
+ self.author_surnames = []
+ for i in author_raw:
+ self.author_surnames.append(i.text)
+
+ author_raw = self.soup.find('div',class_='author-group') \
+ .find_all('span',class_='text given-name')
+ self.author_givennames = []
+ for i in author_raw:
+ self.author_givennames.append(i.text)
+
+ def get_abstract(self):
+ self.abstract = self.soup.find('div',class_='abstract author')
+
+ def get_keywords(self):
+ keys_raw = self.soup.find('div',class_='Keywords') \
+ .find_all('div',class_='keyword')
+ self.keywords = []
+ for i in keys_raw:
+ self.keywords.append(i.text)
+
+ def get_metadata(self):
+ if self.doi == None:
+ doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
+ self.doi = doi_raw[3]+'/'+doi_raw[4]
+
+ self.journal = self.soup.find('div',class_='publication-volume') \
+ .find('span',class_='size-xl').text
+
+ pubdate_raw = self.soup.find('div',class_='publication-volume') \
+ .find('span',class_='size-m').text.split(',')
+
+ self.year = pubdate_raw[1].split(' ')[-1]
+ self.volume = pubdate_raw[0].split(' ')[1]
+ self.pages = pubdate_raw[2].split(' ')[2]
+
+ def get_body(self):
+ body_raw = str(self.soup.find('div',class_='Body'))
+ self.body = body_raw.replace('#bib','#ref-id-bib')
+
+ def get_references(self):
+ self.references = self.soup.find('section',class_='bibliography')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
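
get_metadata() above relies on positional string splitting. A standalone illustration of that logic; the sample strings are assumptions about how ScienceDirect formats the DOI link and the volume/date/pages line:

    # Hypothetical sample values, split exactly as get_metadata() does.
    doi_raw = 'https://doi.org/10.1016/j.example.2018.01.001'.split('/')
    doi = doi_raw[3] + '/' + doi_raw[4]      # -> '10.1016/j.example.2018.01.001'

    pubdate_raw = 'Volume 205, January 2018, Pages 1-10'.split(',')
    year   = pubdate_raw[1].split(' ')[-1]   # -> '2018'
    volume = pubdate_raw[0].split(' ')[1]    # -> '205'
    pages  = pubdate_raw[2].split(' ')[2]    # -> '1-10' (the leading space makes index 2 the page range)

Any change in that page markup would break these indices, which is why the parsing lives in the publisher-specific module.
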
diff --git a/article_epub/sciarticle.py b/article_epub/sciarticle.py
new file mode 100644
index 0000000..93aaa09
--- /dev/null
+++ b/article_epub/sciarticle.py
@@ -0,0 +1,100 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from bs4 import BeautifulSoup
+import os
+import sys
+import pypandoc
+from time import sleep
+
+class SciArticle(object):
+
+ def __init__(self, url, doi=None, out_format='epub'):
+ self.url = url
+ self.doi = doi
+ self.output_format = out_format
+ if out_format not in ['epub','kepub']:
+ sys.exit('Supported formats are epub and kepub')
+ if doi != None:
+ self.doi = doi
+
+ def soupify(self):
+ """Get HTML from article's page"""
+ os.environ['MOZ_HEADLESS'] = '1'
+ binary = FirefoxBinary('/usr/bin/firefox')
+ try:
+ driver = webdriver.Firefox(firefox_binary=binary,
+ log_path='/tmp/gecko_log')
+ except:
+ sys.exit('Failed to load Firefox; is it installed?')
+ try:
+ driver.get(self.url)
+ except:
+ sys.exit('Failed to load URL')
+
+ sleep(2) #To allow redirects
+ self.url = driver.current_url
+
+ self.soup = BeautifulSoup(driver.page_source,'html.parser')
+ driver.quit()
+
+ def get_citation(self):
+
+ all_authors = ''
+ for i in range(0,len(self.author_surnames)):
+ all_authors += self.author_surnames[i] + ', '
+ all_authors += self.author_givennames[i]
+ if(i != (len(self.author_surnames) - 1)):
+ all_authors += '; '
+ if all_authors[-1] == '.':
+ cap = ' '
+ else:
+ cap = '. '
+
+ self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+ +self.journal+' '+self.volume+': '+self.pages+'.' \
+ +' doi: '+self.doi
+
+ def extract_data(self):
+ self.get_title()
+ self.get_authors()
+ self.get_abstract()
+ self.get_keywords()
+ self.get_metadata()
+ self.get_body()
+ self.get_references()
+ self.get_citation()
+
+ def epubify(self):
+ """Convert data into epub format"""
+
+ all_authors = ''
+ for i in range(0,len(self.author_surnames)):
+ all_authors += self.author_givennames[i] + ' '
+ all_authors += self.author_surnames[i]
+ if(i != (len(self.author_surnames) - 1)):
+ all_authors += ', '
+
+ args = []
+ args.append('-M')
+ args.append('title="'+self.title+'"')
+ args.append('-M')
+ args.append('author="'+all_authors+'"')
+ args.append('--parse-raw')
+
+ self.output = self.author_surnames[0]+self.year+'.epub'
+
+ combined = ''
+ combined += str(self.citation)
+ combined += str(self.abstract)
+ combined += str(self.body)
+ combined += str(self.references)
+
+ epubout = pypandoc.convert_text(combined,format='html',to='epub',
+ extra_args=args,
+ outputfile=self.output)
+
+
+
+
+
+
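
SciArticle is now the base class for the publishers package: extract_data() calls a fixed set of get_* hooks, and each publisher module only supplies the site-specific selectors. A minimal sketch of what adding another publisher would look like; the module name, class name and selectors are hypothetical, not part of this commit:

    # article_epub/publishers/example.py (hypothetical)
    from article_epub.sciarticle import SciArticle

    class ExamplePublisher(SciArticle):
        """Illustrative only; every selector below is a placeholder."""

        def get_title(self):
            self.title = self.soup.find('h1', class_='article-title').text

        def get_authors(self):
            authors = self.soup.find_all('span', class_='author-name')
            self.author_surnames = [a.find('span', class_='surname').text for a in authors]
            self.author_givennames = [a.find('span', class_='given-name').text for a in authors]

        # get_abstract, get_keywords, get_metadata, get_body and get_references
        # would follow the same pattern for this site's own markup.

The new class would also need an import in article_epub/publishers/__init__.py, the same way ScienceDirect is exposed there.
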
diff --git a/sci-scraper.py b/sci-scraper.py
new file mode 100644
index 0000000..8bfa1c1
--- /dev/null
+++ b/sci-scraper.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python3
+
+from article_epub.publishers import ScienceDirect
+
+
+test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
+
+test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
+
+test.soupify()
+test.extract_data()
+test.epubify()
+
+#####
+
+import urllib.request
+
+
+def final_url(url=None,doi=None):
+ if url !=None:
+ response = requests.get(url)
+
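
The scratch block after ##### imports urllib.request but then calls requests.get, so it would fail as committed (requests is never imported and the function returns nothing). A sketch of the apparent intent, resolving a URL, or a DOI via doi.org, to the address it finally redirects to; the DOI-resolver step is an assumption:

    import requests  # the stub imports urllib.request but calls requests.get

    def final_url(url=None, doi=None):
        """Sketch: follow redirects and return the final address."""
        if url is None and doi is not None:
            url = 'https://doi.org/' + doi   # assumption: resolve the DOI first
        response = requests.get(url)         # requests follows redirects by default
        return response.url
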
diff --git a/sciencedirect.py b/sciencedirect.py
deleted file mode 100644
index 36d13e0..0000000
--- a/sciencedirect.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import sci-scraper-new
-
-class ScienceDirect(SciArticle):
-
- def get_title(self):
- self.title = self.soup.find('span',class_='title-text').text
-
-
-
-
-
-
-
-
-test.title = test.soup.find('span',class_='title-text').text
-
-author_raw = test.soup.find('div',class_='author-group') \
- .find_all('span',class_='content')
-author_list = []
-
-if len(author_raw) == 1:
- test.authors = author_raw[0].text
-else:
- for i in author_raw:
- author_list.append(i.text)
-
- test.author_list = author_list
-
- #test.authors = ", ".join(author_list)
-
-