Reorganize and add Springer recipe

author: Ken Kellner <ken@kenkellner.com> 2018-04-04 16:23:43 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-04 16:23:43 -0400
commit: b37149bedcfcac84f74d8057f1de843c42b85753 (patch)
tree: 589b795b823ffcfe2a1dd8c9f0a66ebd4872e5e1
parent: 9366bac2de4c94fb01c7c67e191e55bd39b78aba (diff)
6 files changed, 113 insertions, 59 deletions
diff --git a/article-epub.py b/article-epub.py
new file mode 100755
index 0000000..431c330
--- /dev/null
+++ b/article-epub.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python3
+import article_epub
+import sys
+import requests
+
+def main():
+    if sys.argv[1] == '-d': 
+        url = requests.get('https://doi.org/'+sys.argv[2]).url
+    else:
+        url = sys.argv[1]
+    
+    domain = url.split("//")[-1].split("/")[0].split('?')[0]
+
+    art = article_epub.publisher.get_publishers()[domain](url=url)
+
+    print('Downloading content...')
+    art.soupify()
+    art.extract_data()
+    art.epubify()
+
+
+main()
+
diff --git a/article_epub/sciarticle.py b/article_epub/publisher.py
index cd828f0..f8e5424 100644
--- a/article_epub/sciarticle.py
+++ b/article_epub/publisher.py
@@ -7,7 +7,10 @@ import pypandoc
 from time import sleep
 import subprocess
 
-class SciArticle(object):
+_publishers = list()
+_publisher_domains = dict()
+
+class Publisher(object):
     
     def __init__(self, url, doi=None, out_format='epub'):
         self.url = url
@@ -85,7 +88,7 @@ class SciArticle(object):
         args.append('author="'+all_authors+'"')
         args.append('--parse-raw')
 
-        self.output = self.author_surnames[0]+self.year+'.epub'
+        self.output = self.author_surnames[0]+'_'+self.year+'.epub'
         output_raw = '/tmp/raw.epub'
 
         combined = ''
@@ -101,6 +104,16 @@ class SciArticle(object):
 
         subprocess.check_output(['ebook-convert',output_raw,self.output])
 
+def register_publisher(publisher):
+    _publishers.append(publisher)
+    for d in publisher.domains:
+        _publisher_domains[d] = publisher
+
+def get_publishers():
+    return _publisher_domains
+
+
+
 
 
 
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index a51d157..939ee39 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -1 +1,2 @@
 from article_epub.publishers.sciencedirect import ScienceDirect
+from article_epub.publishers.springer import Springer
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 218cb98..26ca971 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -1,11 +1,17 @@
-from article_epub.sciarticle import SciArticle
+from article_epub.publisher import Publisher, register_publisher
 
-class ScienceDirect(SciArticle):
+class ScienceDirect(Publisher):
+    """Class for Science Direct (Elsevier) articles"""
+
+    domains = ["sciencedirect.com","www.sciencedirect.com",
+            "linkinghub.elsevier.com"]
     
     def get_title(self):
+        """Get article title"""
         self.title = self.soup.find('span',class_='title-text').text
 
     def get_authors(self):
+        """Get author given names and surnames"""
         author_raw = self.soup.find('div',class_='author-group') \
             .find_all('span',class_='text surname')
         self.author_surnames = []
@@ -19,9 +25,11 @@ class ScienceDirect(SciArticle):
             self.author_givennames.append(i.text)
 
     def get_abstract(self):
+        """Get article abstract"""
         self.abstract = self.soup.find('div',class_='abstract author')
 
     def get_keywords(self):
+        """Get article keywords"""
         keys_raw = self.soup.find('div',class_='Keywords') \
             .find_all('div',class_='keyword')
         self.keywords = []
@@ -29,6 +37,7 @@ class ScienceDirect(SciArticle):
             self.keywords.append(i.text)
 
     def get_metadata(self):
+        """Get assortment of other metadata"""
         if self.doi == None:
             doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
             self.doi = doi_raw[3]+'/'+doi_raw[4]
@@ -39,27 +48,17 @@ class ScienceDirect(SciArticle):
         pubdate_raw = self.soup.find('div',class_='publication-volume') \
             .find('span',class_='size-m').text.split(',')
 
-        self.year = pubdate_raw[1].split(' ')[-1]
+        self.year = pubdate_raw[-2].split(' ')[-1]
         self.volume = pubdate_raw[0].split(' ')[1]
-        self.pages = pubdate_raw[2].split(' ')[2]
+        self.pages = pubdate_raw[-1].split(' ')[2]
 
     def get_body(self):
+        """Get body of article"""
         body_raw = str(self.soup.find('div',class_='Body'))
-        self.body = body_raw.replace('#b','#ref-id-b')
+        self.body = body_raw.replace('#b','#ref-id-b') #Fix anchors
 
     def get_references(self):
+        """Get references list"""
         self.references = self.soup.find('section',class_='bibliography')
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+register_publisher(ScienceDirect)
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
new file mode 100644
index 0000000..bdd0354
--- /dev/null
+++ b/article_epub/publishers/springer.py
@@ -0,0 +1,57 @@
+from article_epub.publisher import Publisher, register_publisher
+
+class Springer(Publisher):
+    """Class for Springer articles"""
+
+    domains = ["link.springer.com","springer.com","www.springer.com"]
+    
+    def get_title(self):
+        """Get article title"""
+        self.title = self.soup.find('h1',class_='ArticleTitle').text
+
+    def get_authors(self):
+        """Get author given names and surnames"""
+        author_raw = self.soup.find_all('span',class_='authors__name')
+        self.author_surnames = []
+        self.author_givennames = []
+        for i in author_raw:
+            name = i.text.split('\xa0')
+            self.author_surnames.append(name[-1])
+            self.author_givennames.append(' '.join(name[:-1]))
+
+    def get_abstract(self):
+        """Get article abstract"""
+        self.abstract = self.soup.find('section',class_='Abstract')
+
+    def get_keywords(self):
+        """Get article keywords"""
+        keywords_raw = self.soup.find_all('span',class_='Keyword')
+        self.keywords = []
+        for i in keywords_raw:
+            self.keywords.append(i.text.replace('\xa0',''))
+
+    def get_metadata(self):
+        """Get assortment of other metadata"""
+        if self.doi == None:
+            doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
+            self.doi = doi_raw[-2]+'/'+doi_raw[-1]
+
+        self.journal = self.soup.find('span',class_="JournalTitle").text
+
+        self.year = self.soup.find('time')['datetime'].split('-')[0]
+
+        self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
+            .text[:-2].split(' ')[-1]
+
+        self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
+            .text.split(' ')[-1]
+
+    def get_body(self):
+        """Get body of article"""
+        self.body = self.soup.find('div',{"id":"body"})
+
+    def get_references(self):
+        """Get references list"""
+        self.references = self.soup.find('section',{"id":"Bib1"})
+
+register_publisher(Springer)
diff --git a/sci-scraper.py b/sci-scraper.py
deleted file mode 100755
index 6bb5861..0000000
--- a/sci-scraper.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/python3
-from article_epub.publishers import ScienceDirect
-import sys
-import requests
-
-def main():
-    if sys.argv[1] == '-d': 
-        url = requests.get('https://doi.org/'+sys.argv[2]).url
-        art = ScienceDirect(url=url,doi=sys.argv[2])
-    else:
-        url = sys.argv[1]
-        art = ScienceDirect(url=url)
-    print('Downloading content...')
-    art.soupify()
-    art.extract_data()
-    art.epubify()
-
-
-main()
-#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
-
-#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
-
-#test.soupify()
-#test.extract_data()
-#test.epubify()
-
-#####
-
-#import urllib.request
-
-
-#def final_url(url=None,doi=None):
-#    if url !=None:
-#        response = requests.get(url)
-#    elif doi !=None:
-#        response = request.get('https://doi.org/'+doi)
-    
-
author	Ken Kellner <ken@kenkellner.com>	2018-04-04 16:23:43 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-04 16:23:43 -0400
commit	b37149bedcfcac84f74d8057f1de843c42b85753 (patch)
tree	589b795b823ffcfe2a1dd8c9f0a66ebd4872e5e1
parent	9366bac2de4c94fb01c7c67e191e55bd39b78aba (diff)