author    Ken Kellner <ken@kenkellner.com>  2018-04-05 08:20:40 -0400
committer Ken Kellner <ken@kenkellner.com>  2018-04-05 08:20:40 -0400
commit    7aacb78f566a97c90b8f460bf33e075a022060c1
tree      09ccb0001e6e6a9716379d287e4193265472228e
parent    52d5cc23a30b76f43522b2d37229d4449f099d73
Add Wiley recipe and get info from DOI where possible
-rwxr-xr-x  article-epub.py                             3
-rw-r--r--  article_epub/publisher.py                  62
-rw-r--r--  article_epub/publishers/__init__.py         1
-rw-r--r--  article_epub/publishers/sciencedirect.py   70
-rw-r--r--  article_epub/publishers/springer.py        63
-rw-r--r--  article_epub/publishers/wiley.py           47
6 files changed, 174 insertions(+), 72 deletions(-)
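The heart of this commit is the new doi2json()/get_metadata() path in publisher.py: instead of scraping title, authors, journal, volume, pages, and year out of each publisher's HTML, the resolver at dx.doi.org is asked for the metadata directly via content negotiation. A minimal standalone sketch of that request (the DOI below is a placeholder for illustration, not one taken from this repository):

    import requests

    doi = "10.1000/xyz123"  # placeholder DOI for illustration
    # Requesting application/json from dx.doi.org triggers content
    # negotiation and returns citation metadata as JSON, as doi2json()
    # does in the diff below.
    r = requests.get("http://dx.doi.org/" + doi,
                     headers={"accept": "application/json"})
    meta = r.json()

    title = meta['title']
    surnames = [a['family'] for a in meta['author']]
    journal = meta['container-title']
    # Print articles carry volume/pages under 'published-print';
    # online-only articles fall back to 'published-online', which is
    # the case get_metadata() branches on.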
diff --git a/article-epub.py b/article-epub.py
index 7b301e9..2a5d339 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -11,7 +11,8 @@ def main():
url = sys.argv[1]
doi = None
- domain = url.split("//")[-1].split("/")[0].split('?')[0]
+ domain = ".".join(url.split("//")[-1].split("/")[0] \
+ .split('?')[0].split('.')[-2:])
art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi)
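The reworked parsing above keeps only the last two dot-separated labels of the hostname, so publisher subdomains collapse onto the bare registered domain and the per-publisher domains lists can be shortened accordingly. A worked example (hostname chosen for illustration):

    url = "https://onlinelibrary.wiley.com/doi/abs/10.1000/xyz123"
    domain = ".".join(url.split("//")[-1].split("/")[0]
                      .split('?')[0].split('.')[-2:])
    # domain == "wiley.com"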
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index f8abfc8..0ea9259 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -6,6 +6,8 @@ import sys
import pypandoc
from time import sleep
import subprocess
+import requests
+import json
_publishers = list()
_publisher_domains = dict()
@@ -16,8 +18,12 @@ class Publisher(object):
self.url = url
self.doi = doi
+ def get_final_url(self):
+ pass
+
def soupify(self):
"""Get HTML from article's page"""
+ self.get_final_url()
os.environ['MOZ_HEADLESS'] = '1'
binary = FirefoxBinary('/usr/bin/firefox')
try:
@@ -33,12 +39,44 @@ class Publisher(object):
if self.doi != None:
print('Waiting for redirects..')
sleep(5) #To allow redirects
-
+
+ sleep(5)
self.url = driver.current_url
self.soup = BeautifulSoup(driver.page_source,'html.parser')
driver.quit()
+ def doi2json(self):
+ """
+ Get a dictionary of metadata for a given DOI.
+ """
+ url = "http://dx.doi.org/" + self.doi
+ headers = {"accept": "application/json"}
+ r = requests.get(url, headers = headers)
+ self.meta = r.json()
+
+ def get_metadata(self):
+ self.doi2json()
+
+ self.title = self.meta['title']
+
+ self.author_surnames = []
+ self.author_givennames = []
+ for i in self.meta['author']:
+ self.author_surnames.append(i['family'])
+ self.author_givennames.append(i['given'])
+
+ self.journal = self.meta['container-title']
+
+ if 'published-print' in self.meta.keys():
+ self.volume = str(self.meta['volume'])
+ self.pages = str(self.meta['page'])
+ self.year = str(self.meta['published-print']['date-parts'][0][0])
+ else:
+ self.volume = ''
+ self.pages = ''
+ self.year = str(self.meta['published-online']['date-parts'][0][0])
+
def get_citation(self):
all_authors = ''
@@ -52,19 +90,23 @@ class Publisher(object):
else:
cap = '. '
- self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
- +self.journal+' '+self.volume+': '+self.pages+'.' \
- +' doi: '+self.doi
+ if self.volume != '':
+ self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+ +self.journal+' '+self.volume+': '+self.pages+'.' \
+ +' doi: '+self.doi
+ else:
+ self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+ +self.journal+'. '+' doi: '+self.doi
def extract_data(self):
- self.get_title()
- self.get_authors()
+ #self.get_title()
+ #self.get_authors()
+ self.get_doi()
+ self.get_metadata()
self.get_abstract()
self.get_keywords()
- self.get_metadata()
self.get_body()
self.get_references()
- self.get_citation()
def epubify(self):
"""Convert data into epub format"""
@@ -75,7 +117,9 @@ class Publisher(object):
all_authors += self.author_surnames[i]
if(i != (len(self.author_surnames) - 1)):
all_authors += ', '
-
+
+ self.get_citation()
+
args = []
args.append('-M')
args.append('title="'+self.title+'"')
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 939ee39..730fab3 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -1,2 +1,3 @@
from article_epub.publishers.sciencedirect import ScienceDirect
from article_epub.publishers.springer import Springer
+from article_epub.publishers.wiley import Wiley
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 26ca971..34fecfd 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -3,26 +3,30 @@ from article_epub.publisher import Publisher, register_publisher
class ScienceDirect(Publisher):
"""Class for Science Direct (Elsevier) articles"""
- domains = ["sciencedirect.com","www.sciencedirect.com",
- "linkinghub.elsevier.com"]
+ domains = ["sciencedirect.com","elsevier.com"]
- def get_title(self):
- """Get article title"""
- self.title = self.soup.find('span',class_='title-text').text
+ #def get_title(self):
+ # """Get article title"""
+ # self.title = self.soup.find('span',class_='title-text').text
- def get_authors(self):
-        """Get author given and surnames"""
- author_raw = self.soup.find('div',class_='author-group') \
- .find_all('span',class_='text surname')
- self.author_surnames = []
- for i in author_raw:
- self.author_surnames.append(i.text)
-
- author_raw = self.soup.find('div',class_='author-group') \
- .find_all('span',class_='text given-name')
- self.author_givennames = []
- for i in author_raw:
- self.author_givennames.append(i.text)
+ #def get_authors(self):
+    #    """Get author given and surnames"""
+ # author_raw = self.soup.find('div',class_='author-group') \
+ # .find_all('span',class_='text surname')
+ # self.author_surnames = []
+ # for i in author_raw:
+ # self.author_surnames.append(i.text)
+ #
+ # author_raw = self.soup.find('div',class_='author-group') \
+ # .find_all('span',class_='text given-name')
+ # self.author_givennames = []
+ # for i in author_raw:
+ # self.author_givennames.append(i.text)
+
+ def get_doi(self):
+ if self.doi == None:
+ doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
+ self.doi = str(doi_raw[3]+'/'+doi_raw[4])
def get_abstract(self):
"""Get article abstract"""
@@ -36,21 +40,21 @@ class ScienceDirect(Publisher):
for i in keys_raw:
self.keywords.append(i.text)
- def get_metadata(self):
- """Get assortment of other metadata"""
- if self.doi == None:
- doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
- self.doi = doi_raw[3]+'/'+doi_raw[4]
-
- self.journal = self.soup.find('div',class_='publication-volume') \
- .find('span',class_='size-xl').text
-
- pubdate_raw = self.soup.find('div',class_='publication-volume') \
- .find('span',class_='size-m').text.split(',')
-
- self.year = pubdate_raw[-2].split(' ')[-1]
- self.volume = pubdate_raw[0].split(' ')[1]
- self.pages = pubdate_raw[-1].split(' ')[2]
+ #def get_metadata(self):
+ # """Get assortment of other metadata"""
+ # if self.doi == None:
+ # doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
+ # self.doi = doi_raw[3]+'/'+doi_raw[4]
+ #
+ # self.journal = self.soup.find('div',class_='publication-volume') \
+ # .find('span',class_='size-xl').text
+ #
+ # pubdate_raw = self.soup.find('div',class_='publication-volume') \
+ # .find('span',class_='size-m').text.split(',')
+ #
+ # self.year = pubdate_raw[-2].split(' ')[-1]
+ # self.volume = pubdate_raw[0].split(' ')[1]
+ # self.pages = pubdate_raw[-1].split(' ')[2]
def get_body(self):
"""Get body of article"""
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
index bdd0354..77930d4 100644
--- a/article_epub/publishers/springer.py
+++ b/article_epub/publishers/springer.py
@@ -3,21 +3,26 @@ from article_epub.publisher import Publisher, register_publisher
class Springer(Publisher):
"""Class for Springer articles"""
- domains = ["link.springer.com","springer.com","www.springer.com"]
+ domains = ["springer.com"]
- def get_title(self):
- """Get article title"""
- self.title = self.soup.find('h1',class_='ArticleTitle').text
-
- def get_authors(self):
-        """Get author given and surnames"""
- author_raw = self.soup.find_all('span',class_='authors__name')
- self.author_surnames = []
- self.author_givennames = []
- for i in author_raw:
- name = i.text.split('\xa0')
- self.author_surnames.append(name[-1])
- self.author_givennames.append(' '.join(name[:-1]))
+ #def get_title(self):
+ # """Get article title"""
+ # self.title = self.soup.find('h1',class_='ArticleTitle').text
+
+ #def get_authors(self):
+    #    """Get author given and surnames"""
+ # author_raw = self.soup.find_all('span',class_='authors__name')
+ # self.author_surnames = []
+ # self.author_givennames = []
+ # for i in author_raw:
+ # name = i.text.split('\xa0')
+ # self.author_surnames.append(name[-1])
+ # self.author_givennames.append(' '.join(name[:-1]))
+
+ def get_doi(self):
+ if self.doi == None:
+ doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
+ self.doi = str(doi_raw[-2]+'/'+doi_raw[-1])
def get_abstract(self):
"""Get article abstract"""
@@ -30,21 +35,21 @@ class Springer(Publisher):
for i in keywords_raw:
self.keywords.append(i.text.replace('\xa0',''))
- def get_metadata(self):
- """Get assortment of other metadata"""
- if self.doi == None:
- doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
- self.doi = doi_raw[-2]+'/'+doi_raw[-1]
-
- self.journal = self.soup.find('span',class_="JournalTitle").text
-
- self.year = self.soup.find('time')['datetime'].split('-')[0]
-
- self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
- .text[:-2].split(' ')[-1]
-
- self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
- .text.split(' ')[-1]
+ #def get_metadata(self):
+ # """Get assortment of other metadata"""
+ # if self.doi == None:
+ # doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
+ # self.doi = doi_raw[-2]+'/'+doi_raw[-1]
+ #
+ # self.journal = self.soup.find('span',class_="JournalTitle").text
+ #
+ # self.year = self.soup.find('time')['datetime'].split('-')[0]
+ #
+ # self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
+ # .text[:-2].split(' ')[-1]
+ #
+ # self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
+ # .text.split(' ')[-1]
def get_body(self):
"""Get body of article"""
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
new file mode 100644
index 0000000..0133c1d
--- /dev/null
+++ b/article_epub/publishers/wiley.py
@@ -0,0 +1,47 @@
+from article_epub.publisher import Publisher, register_publisher
+
+class Wiley(Publisher):
+    """Class for Wiley articles"""
+
+ domains = ["wiley.com"]
+
+ def get_final_url(self):
+ if '/abs/' in self.url:
+ self.url = self.url.replace('/abs/','/full/')
+
+ def get_doi(self):
+ if self.doi == None:
+ doi_raw = self.soup.find('a',class_='epub-doi').text.split('/')
+ self.doi = str(doi_raw[3]+'/'+doi_raw[4])
+
+ def get_abstract(self):
+ """Get article abstract"""
+ self.abstract = self.soup.find('section',
+ class_='article-section__abstract')
+
+ def get_keywords(self):
+ """Get article keywords"""
+ keywords_raw = self.soup.find('section',class_='keywords') \
+ .find_all('a',class_='badge-type')
+ self.keywords = []
+ for i in keywords_raw:
+ self.keywords.append(i.text.replace('\n','').replace('\u200a',''))
+
+ def get_body(self):
+ """Get body of article"""
+ body_raw = self.soup.find_all('div',class_='article-section__content')
+ body_raw = body_raw[1:]
+ self.body = ''
+ for i in body_raw:
+ self.body += str(i)
+
+ def get_references(self):
+ """Get references list"""
+ references_raw = str(self.soup.find('section',
+ {'id':'references-section'}))
+ references_raw = references_raw.replace('"display: none;"','')
+ references_raw = references_raw.replace('Literature Cited','')
+ references_raw = references_raw.replace('data-bib-id','id')
+ self.references = '<h2>Literature Cited</h2>\n'+references_raw
+
+register_publisher(Wiley)
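For context, register_publisher() maps each class's domains list onto the module-level lookup that article-epub.py consults via get_publishers(); its body is not part of this diff, so the following is a hypothetical reconstruction inferred from the call sites:

    # Hypothetical sketch of the registry in article_epub/publisher.py,
    # inferred from get_publishers()[domain] in article-epub.py.
    _publishers = list()
    _publisher_domains = dict()

    def register_publisher(publisher):
        _publishers.append(publisher)
        for domain in publisher.domains:
            _publisher_domains[domain] = publisher

    def get_publishers():
        return _publisher_domains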