Reorganize into modules

author: Ken Kellner <ken@kenkellner.com> 2018-04-03 17:34:55 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-03 17:34:55 -0400
commit: 68fae4e7cae677845cfb74ac6843e866b487b689 (patch)
tree: e588c2c88e392de6800b27ff7c6b865124e0fc5f
parent: aad9b78393f0dfc41868a2da2e96fb5fb349893a (diff)
8 files changed, 194 insertions, 93 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..32b1973
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__*
+*.epub
+*.pdf
diff --git a/SciArticle.py b/SciArticle.py
deleted file mode 100644
index 440b72e..0000000
--- a/SciArticle.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/python3
-#https://github.com/mozilla/geckodriver/releases
-from selenium import webdriver
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from bs4 import BeautifulSoup
-import re
-import os 
-import sys
-import pypandoc
-from time import sleep
-
-class SciArticle(object):
-    
-    def __init__(self, url, doi=None, out_format='kepub'):
-        self.url = url
-        self.output_format = out_format
-        if out_format not in ['epub','kepub']:
-            sys.exit('Supported formats are epub and kepub')
-        if doi != None:
-            self.doi = doi
-
-    def soupify(self):
-        """Get HTML from article's page"""
-        os.environ['MOZ_HEADLESS'] = '1'
-        binary = FirefoxBinary('/usr/bin/firefox')
-        try:
-            driver = webdriver.Firefox(firefox_binary=binary, log_file='/tmp/gecko_log')
-        except:
-            sys.exit('Failed to load Firefox; is it installed?')
-        try:
-            driver.get(self.init_url)
-        except:
-            sys.exit('Failed to load URL')
-        
-        sleep(2) #To allow redirects
-        self.url = driver.current_url
-        
-        self.soup = BeautifulSoup(driver.page_source,'html.parser')
-        driver.quit()
-        #return(self.soup)
-
-    #def out_filename(self):
-    #    first5 = self.title.split()[:5]
-
-
-    def epubify(self):
-        """Convert data into epub format"""
-        args = []
-        args.append('-M')
-        args.append('title="'+self.title+'"')
-        args.append('author="'+author+'"')
-        args.append('--parse-raw')
-
-        epubout = pypandoc.convert_text(self.body,format='html',to='epub',
-                extra_args=args,
-                outputfile=self.output)
-
-
-
-
-
-
diff --git a/article_epub/__init__.py b/article_epub/__init__.py
new file mode 100644
index 0000000..84afc23
--- /dev/null
+++ b/article_epub/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/python3
+#https://github.com/mozilla/geckodriver/releases
+import article_epub.publishers
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
new file mode 100644
index 0000000..a51d157
--- /dev/null
+++ b/article_epub/publishers/__init__.py
@@ -0,0 +1 @@
+from article_epub.publishers.sciencedirect import ScienceDirect
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
new file mode 100644
index 0000000..5ddc3e3
--- /dev/null
+++ b/article_epub/publishers/sciencedirect.py
@@ -0,0 +1,65 @@
+from article_epub.sciarticle import SciArticle
+
+class ScienceDirect(SciArticle):
+    """Extract article fields from a parsed ScienceDirect article page.
+
+    Every ``get_*`` method reads ``self.soup`` (set by ``soupify``) and
+    stores its result on the instance.
+    """
+
+    def get_title(self):
+        """Store the text of the ``title-text`` span as ``self.title``."""
+        self.title = self.soup.find('span',class_='title-text').text
+
+    def get_authors(self):
+        """Store author surnames and given names as two lists."""
+        group = self.soup.find('div',class_='author-group')
+        self.author_surnames = [
+            i.text for i in group.find_all('span',class_='text surname')]
+        self.author_givennames = [
+            i.text for i in group.find_all('span',class_='text given-name')]
+
+    def get_abstract(self):
+        """Store the author abstract element as ``self.abstract``."""
+        self.abstract = self.soup.find('div',class_='abstract author')
+
+    def get_keywords(self):
+        """Store keyword strings; an empty list if the page has none."""
+        keys_div = self.soup.find('div',class_='Keywords')
+        # find() returns None when the page has no keyword section.
+        if keys_div is None:
+            self.keywords = []
+            return
+        self.keywords = [
+            i.text for i in keys_div.find_all('div',class_='keyword')]
+
+    def get_metadata(self):
+        """Store DOI (if not already set), journal, year, volume, pages."""
+        if self.doi is None:
+            doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
+            # Keep everything after the host: a DOI suffix may contain '/'.
+            self.doi = '/'.join(doi_raw[3:])
+
+        pub = self.soup.find('div',class_='publication-volume')
+        self.journal = pub.find('span',class_='size-xl').text
+        # Split on commas: volume is read from the first piece, year from
+        # the second, pages from the third.
+        pubdate_raw = pub.find('span',class_='size-m').text.split(',')
+        self.year = pubdate_raw[1].split(' ')[-1]
+        self.volume = pubdate_raw[0].split(' ')[1]
+        self.pages = pubdate_raw[2].split(' ')[2]
+
+    def get_body(self):
+        """Store the article body HTML as a string in ``self.body``."""
+        body_raw = str(self.soup.find('div',class_='Body'))
+        # Rewrite '#bib' link fragments to '#ref-id-bib'.
+        self.body = body_raw.replace('#bib','#ref-id-bib')
+
+    def get_references(self):
+        """Store the bibliography section element as ``self.references``."""
+        self.references = self.soup.find('section',class_='bibliography')
+
+
+
+
+
diff --git a/article_epub/sciarticle.py b/article_epub/sciarticle.py
new file mode 100644
index 0000000..93aaa09
--- /dev/null
+++ b/article_epub/sciarticle.py
@@ -0,0 +1,100 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from bs4 import BeautifulSoup
+import os 
+import sys
+import pypandoc
+from time import sleep
+
+class SciArticle(object):
+    """Base class for one article: fetch the page, extract data, write epub.
+
+    Subclasses supply the ``get_title``, ``get_authors``, ``get_abstract``,
+    ``get_keywords``, ``get_metadata``, ``get_body`` and ``get_references``
+    methods that ``extract_data`` calls.
+    """
+
+    def __init__(self, url, doi=None, out_format='epub'):
+        """Store the URL, optional DOI and output format.
+
+        Exits the program if ``out_format`` is not 'epub' or 'kepub'.
+        """
+        if out_format not in ['epub','kepub']:
+            sys.exit('Supported formats are epub and kepub')
+        self.url = url
+        self.doi = doi
+        self.output_format = out_format
+
+    def soupify(self):
+        """Get HTML from article's page"""
+        os.environ['MOZ_HEADLESS'] = '1'
+        binary = FirefoxBinary('/usr/bin/firefox')
+        try:
+            driver = webdriver.Firefox(firefox_binary=binary,
+                    log_path='/tmp/gecko_log')
+        except Exception:
+            sys.exit('Failed to load Firefox; is it installed?')
+        # Always quit the browser, even when loading or parsing fails.
+        try:
+            try:
+                driver.get(self.url)
+            except Exception:
+                sys.exit('Failed to load URL')
+
+            sleep(2) #To allow redirects
+            self.url = driver.current_url
+
+            self.soup = BeautifulSoup(driver.page_source,'html.parser')
+        finally:
+            driver.quit()
+
+    def get_citation(self):
+        """Build ``self.citation`` from the extracted metadata.
+
+        Authors are written 'Surname, Given' and joined with '; '.
+        """
+        all_authors = '; '.join(
+            surname + ', ' + self.author_givennames[i]
+            for i, surname in enumerate(self.author_surnames))
+        # Avoid a doubled period when the author string already ends in
+        # one; endswith() is also safe when there are no authors.
+        if all_authors.endswith('.'):
+            cap = ' '
+        else:
+            cap = '. '
+
+        self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+                +self.journal+' '+self.volume+': '+self.pages+'.' \
+                +' doi: '+self.doi
+
+    def extract_data(self):
+        """Run every extraction step, then build the citation."""
+        self.get_title()
+        self.get_authors()
+        self.get_abstract()
+        self.get_keywords()
+        self.get_metadata()
+        self.get_body()
+        self.get_references()
+        self.get_citation()
+
+    def epubify(self):
+        """Convert data into epub format"""
+        all_authors = ', '.join(
+            self.author_givennames[i] + ' ' + surname
+            for i, surname in enumerate(self.author_surnames))
+
+        args = ['-M', 'title="'+self.title+'"',
+                '-M', 'author="'+all_authors+'"',
+                '--parse-raw']
+
+        # Output file is named from the first surname and the year.
+        self.output = self.author_surnames[0]+self.year+'.epub'
+
+        # Concatenate citation, abstract, body, references as one string.
+        combined = (str(self.citation) + str(self.abstract)
+                + str(self.body) + str(self.references))
+
+        pypandoc.convert_text(combined,format='html',to='epub',
+                extra_args=args,
+                outputfile=self.output)
diff --git a/sci-scraper.py b/sci-scraper.py
new file mode 100644
index 0000000..8bfa1c1
--- /dev/null
+++ b/sci-scraper.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python3
+
+from article_epub.publishers import ScienceDirect
+
+
+test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
+# The instance above is discarded: this assignment replaces it.
+test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
+# Fetch the page, extract the fields, then write the epub.
+test.soupify()
+test.extract_data()
+test.epubify()
+
+#####
+
+import urllib.request
+
+def final_url(url=None,doi=None):
+    """Return the URL ``url`` resolves to after redirects; ``doi`` unused."""
+    if url is not None:
+        with urllib.request.urlopen(url) as response:
+            return response.geturl()
diff --git a/sciencedirect.py b/sciencedirect.py
deleted file mode 100644
index 36d13e0..0000000
--- a/sciencedirect.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import sci-scraper-new
-
-class ScienceDirect(SciArticle):
-    
-    def get_title(self):
-        self.title = self.soup.find('span',class_='title-text').text
-
-
-
-
-
-
-
-
-test.title = test.soup.find('span',class_='title-text').text
-
-author_raw = test.soup.find('div',class_='author-group') \
-        .find_all('span',class_='content')
-author_list = []
-
-if len(author_raw) == 1:
-    test.authors = author_raw[0].text
-else:
-    for i in author_raw:
-        author_list.append(i.text)
-    
-    test.author_list = author_list
-    
-    #test.authors = ", ".join(author_list)
-
-
author	Ken Kellner <ken@kenkellner.com>	2018-04-03 17:34:55 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-03 17:34:55 -0400
commit	68fae4e7cae677845cfb74ac6843e866b487b689 (patch)
tree	e588c2c88e392de6800b27ff7c6b865124e0fc5f
parent	aad9b78393f0dfc41868a2da2e96fb5fb349893a (diff)