aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-04-04 16:23:43 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-04-04 16:23:43 -0400
commit b37149bedcfcac84f74d8057f1de843c42b85753 (patch)
tree 589b795b823ffcfe2a1dd8c9f0a66ebd4872e5e1
parent 9366bac2de4c94fb01c7c67e191e55bd39b78aba (diff)
Reorganize and add Springer recipe
-rwxr-xr-x article-epub.py 23
-rw-r--r-- article_epub/publisher.py (renamed from article_epub/sciarticle.py) 17
-rw-r--r-- article_epub/publishers/__init__.py 1
-rw-r--r-- article_epub/publishers/sciencedirect.py 35
-rw-r--r-- article_epub/publishers/springer.py 57
-rwxr-xr-x sci-scraper.py 39
6 files changed, 113 insertions, 59 deletions
diff --git a/article-epub.py b/article-epub.py
new file mode 100755
index 0000000..431c330
--- /dev/null
+++ b/article-epub.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python3
+import article_epub
+import sys
+import requests
+
+def main():
+ if sys.argv[1] == '-d':
+ url = requests.get('https://doi.org/'+sys.argv[2]).url
+ else:
+ url = sys.argv[1]
+
+ domain = url.split("//")[-1].split("/")[0].split('?')[0]
+
+ art = article_epub.publisher.get_publishers()[domain](url=url)
+
+ print('Downloading content...')
+ art.soupify()
+ art.extract_data()
+ art.epubify()
+
+
+main()
+
diff --git a/article_epub/sciarticle.py b/article_epub/publisher.py
index cd828f0..f8e5424 100644
--- a/article_epub/sciarticle.py
+++ b/article_epub/publisher.py
@@ -7,7 +7,10 @@ import pypandoc
from time import sleep
import subprocess
-class SciArticle(object):
+_publishers = list()
+_publisher_domains = dict()
+
+class Publisher(object):
def __init__(self, url, doi=None, out_format='epub'):
self.url = url
@@ -85,7 +88,7 @@ class SciArticle(object):
args.append('author="'+all_authors+'"')
args.append('--parse-raw')
- self.output = self.author_surnames[0]+self.year+'.epub'
+ self.output = self.author_surnames[0]+'_'+self.year+'.epub'
output_raw = '/tmp/raw.epub'
combined = ''
@@ -101,6 +104,16 @@ class SciArticle(object):
subprocess.check_output(['ebook-convert',output_raw,self.output])
+def register_publisher(publisher):
+ _publishers.append(publisher)
+ for d in publisher.domains:
+ _publisher_domains[d] = publisher
+
+def get_publishers():
+ return _publisher_domains
+
+
+
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index a51d157..939ee39 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -1 +1,2 @@
from article_epub.publishers.sciencedirect import ScienceDirect
+from article_epub.publishers.springer import Springer
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 218cb98..26ca971 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -1,11 +1,17 @@
-from article_epub.sciarticle import SciArticle
+from article_epub.publisher import Publisher, register_publisher
-class ScienceDirect(SciArticle):
+class ScienceDirect(Publisher):
+ """Class for Science Direct (Elsevier) articles"""
+
+ domains = ["sciencedirect.com","www.sciencedirect.com",
+ "linkinghub.elsevier.com"]
def get_title(self):
+ """Get article title"""
self.title = self.soup.find('span',class_='title-text').text
def get_authors(self):
+ """Get author given names and surnames"""
author_raw = self.soup.find('div',class_='author-group') \
.find_all('span',class_='text surname')
self.author_surnames = []
@@ -19,9 +25,11 @@ class ScienceDirect(SciArticle):
self.author_givennames.append(i.text)
def get_abstract(self):
+ """Get article abstract"""
self.abstract = self.soup.find('div',class_='abstract author')
def get_keywords(self):
+ """Get article keywords"""
keys_raw = self.soup.find('div',class_='Keywords') \
.find_all('div',class_='keyword')
self.keywords = []
@@ -29,6 +37,7 @@ class ScienceDirect(SciArticle):
self.keywords.append(i.text)
def get_metadata(self):
+ """Get assortment of other metadata"""
if self.doi == None:
doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
self.doi = doi_raw[3]+'/'+doi_raw[4]
@@ -39,27 +48,17 @@ class ScienceDirect(SciArticle):
pubdate_raw = self.soup.find('div',class_='publication-volume') \
.find('span',class_='size-m').text.split(',')
- self.year = pubdate_raw[1].split(' ')[-1]
+ self.year = pubdate_raw[-2].split(' ')[-1]
self.volume = pubdate_raw[0].split(' ')[1]
- self.pages = pubdate_raw[2].split(' ')[2]
+ self.pages = pubdate_raw[-1].split(' ')[2]
def get_body(self):
+ """Get body of article"""
body_raw = str(self.soup.find('div',class_='Body'))
- self.body = body_raw.replace('#b','#ref-id-b')
+ self.body = body_raw.replace('#b','#ref-id-b') #Fix anchors
def get_references(self):
+ """Get references list"""
self.references = self.soup.find('section',class_='bibliography')
-
-
-
-
-
-
-
-
-
-
-
-
-
+register_publisher(ScienceDirect)
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
new file mode 100644
index 0000000..bdd0354
--- /dev/null
+++ b/article_epub/publishers/springer.py
@@ -0,0 +1,57 @@
+from article_epub.publisher import Publisher, register_publisher
+
+class Springer(Publisher):
+ """Class for Springer articles"""
+
+ domains = ["link.springer.com","springer.com","www.springer.com"]
+
+ def get_title(self):
+ """Get article title"""
+ self.title = self.soup.find('h1',class_='ArticleTitle').text
+
+ def get_authors(self):
+ """Get author given names and surnames"""
+ author_raw = self.soup.find_all('span',class_='authors__name')
+ self.author_surnames = []
+ self.author_givennames = []
+ for i in author_raw:
+ name = i.text.split('\xa0')
+ self.author_surnames.append(name[-1])
+ self.author_givennames.append(' '.join(name[:-1]))
+
+ def get_abstract(self):
+ """Get article abstract"""
+ self.abstract = self.soup.find('section',class_='Abstract')
+
+ def get_keywords(self):
+ """Get article keywords"""
+ keywords_raw = self.soup.find_all('span',class_='Keyword')
+ self.keywords = []
+ for i in keywords_raw:
+ self.keywords.append(i.text.replace('\xa0',''))
+
+ def get_metadata(self):
+ """Get assortment of other metadata"""
+ if self.doi == None:
+ doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
+ self.doi = doi_raw[-2]+'/'+doi_raw[-1]
+
+ self.journal = self.soup.find('span',class_="JournalTitle").text
+
+ self.year = self.soup.find('time')['datetime'].split('-')[0]
+
+ self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
+ .text[:-2].split(' ')[-1]
+
+ self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
+ .text.split(' ')[-1]
+
+ def get_body(self):
+ """Get body of article"""
+ self.body = self.soup.find('div',{"id":"body"})
+
+ def get_references(self):
+ """Get references list"""
+ self.references = self.soup.find('section',{"id":"Bib1"})
+
+register_publisher(Springer)
diff --git a/sci-scraper.py b/sci-scraper.py
deleted file mode 100755
index 6bb5861..0000000
--- a/sci-scraper.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/python3
-from article_epub.publishers import ScienceDirect
-import sys
-import requests
-
-def main():
- if sys.argv[1] == '-d':
- url = requests.get('https://doi.org/'+sys.argv[2]).url
- art = ScienceDirect(url=url,doi=sys.argv[2])
- else:
- url = sys.argv[1]
- art = ScienceDirect(url=url)
- print('Downloading content...')
- art.soupify()
- art.extract_data()
- art.epubify()
-
-
-main()
-#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X')
-
-#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763')
-
-#test.soupify()
-#test.extract_data()
-#test.epubify()
-
-#####
-
-#import urllib.request
-
-
-#def final_url(url=None,doi=None):
-# if url !=None:
-# response = requests.get(url)
-# elif doi !=None:
-# response = request.get('https://doi.org/'+doi)
-
-