Add Wiley recipe and get info from DOI where possible

author: Ken Kellner <ken@kenkellner.com> 2018-04-05 08:20:40 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-05 08:20:40 -0400
commit: 7aacb78f566a97c90b8f460bf33e075a022060c1 (patch)
tree: 09ccb0001e6e6a9716379d287e4193265472228e
parent: 52d5cc23a30b76f43522b2d37229d4449f099d73 (diff)
6 files changed, 174 insertions, 72 deletions
diff --git a/article-epub.py b/article-epub.py
index 7b301e9..2a5d339 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -11,7 +11,8 @@ def main():
         url = sys.argv[1]
         doi = None
     
-    domain = url.split("//")[-1].split("/")[0].split('?')[0]
+    domain = ".".join(url.split("//")[-1].split("/")[0] \
+            .split('?')[0].split('.')[-2:])
 
     art = article_epub.publisher.get_publishers()[domain](url=url,doi=doi)
 
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index f8abfc8..0ea9259 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -6,6 +6,8 @@ import sys
 import pypandoc
 from time import sleep
 import subprocess
+import requests
+import json
 
 _publishers = list()
 _publisher_domains = dict()
@@ -16,8 +18,12 @@ class Publisher(object):
         self.url = url
         self.doi = doi
 
+    def get_final_url(self):
+        pass
+
     def soupify(self):
         """Get HTML from article's page"""
+        self.get_final_url()
         os.environ['MOZ_HEADLESS'] = '1'
         binary = FirefoxBinary('/usr/bin/firefox')
         try:
@@ -33,12 +39,44 @@ class Publisher(object):
         if self.doi != None:
             print('Waiting for redirects..')
             sleep(5) #To allow redirects
-
+        
+        sleep(5)
         self.url = driver.current_url
         
         self.soup = BeautifulSoup(driver.page_source,'html.parser')
         driver.quit()
 
+    def doi2json(self):
+        """
+        Get a dictionary of metadata for a given DOI.
+        """
+        url = "http://dx.doi.org/" + self.doi
+        headers = {"accept": "application/json"}
+        r = requests.get(url, headers = headers)
+        self.meta = r.json()
+
+    def get_metadata(self):
+        self.doi2json()
+
+        self.title = self.meta['title']
+
+        self.author_surnames = []
+        self.author_givennames = []
+        for i in self.meta['author']:
+            self.author_surnames.append(i['family'])
+            self.author_givennames.append(i['given'])
+
+        self.journal = self.meta['container-title']
+
+        if 'published-print' in self.meta.keys():
+            self.volume = str(self.meta['volume'])
+            self.pages = str(self.meta['page'])
+            self.year = str(self.meta['published-print']['date-parts'][0][0])
+        else:
+            self.volume = ''
+            self.pages = ''
+            self.year = str(self.meta['published-online']['date-parts'][0][0])
+
     def get_citation(self):
         
         all_authors = ''
@@ -52,19 +90,23 @@ class Publisher(object):
         else:
             cap = '. '
         
-        self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
-                +self.journal+' '+self.volume+': '+self.pages+'.' \
-                +' doi: '+self.doi
+        if self.volume != '':
+            self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+                    +self.journal+' '+self.volume+': '+self.pages+'.' \
+                    +' doi: '+self.doi
+        else:
+            self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+                    +self.journal+'. '+' doi: '+self.doi
     
     def extract_data(self):
-        self.get_title()
-        self.get_authors()
+        #self.get_title()
+        #self.get_authors()
+        self.get_doi()
+        self.get_metadata()
         self.get_abstract()
         self.get_keywords()
-        self.get_metadata()
         self.get_body()
         self.get_references()
-        self.get_citation()
 
     def epubify(self):
         """Convert data into epub format"""
@@ -75,7 +117,9 @@ class Publisher(object):
             all_authors += self.author_surnames[i]
             if(i != (len(self.author_surnames) - 1)):
                 all_authors += ', '
-        
+       
+        self.get_citation()
+
         args = []
         args.append('-M')
         args.append('title="'+self.title+'"')
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 939ee39..730fab3 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -1,2 +1,3 @@
 from article_epub.publishers.sciencedirect import ScienceDirect
 from article_epub.publishers.springer import Springer
+from article_epub.publishers.wiley import Wiley
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 26ca971..34fecfd 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -3,26 +3,30 @@ from article_epub.publisher import Publisher, register_publisher
 class ScienceDirect(Publisher):
     """Class for Science Direct (Elsevier) articles"""
 
-    domains = ["sciencedirect.com","www.sciencedirect.com",
-            "linkinghub.elsevier.com"]
+    domains = ["sciencedirect.com","elsevier.com"]
     
-    def get_title(self):
-        """Get article title"""
-        self.title = self.soup.find('span',class_='title-text').text
+    #def get_title(self):
+    #    """Get article title"""
+    #    self.title = self.soup.find('span',class_='title-text').text
 
-    def get_authors(self):
-        """Get author given and surnammes"""
-        author_raw = self.soup.find('div',class_='author-group') \
-            .find_all('span',class_='text surname')
-        self.author_surnames = []
-        for i in author_raw:
-            self.author_surnames.append(i.text)
-
-        author_raw = self.soup.find('div',class_='author-group') \
-            .find_all('span',class_='text given-name')
-        self.author_givennames = []
-        for i in author_raw:
-            self.author_givennames.append(i.text)
+    #def get_authors(self):
+    #    """Get author given names and surnames"""
+    #    author_raw = self.soup.find('div',class_='author-group') \
+    #        .find_all('span',class_='text surname')
+    #    self.author_surnames = []
+    #    for i in author_raw:
+    #        self.author_surnames.append(i.text)
+    #
+    #    author_raw = self.soup.find('div',class_='author-group') \
+    #        .find_all('span',class_='text given-name')
+    #    self.author_givennames = []
+    #    for i in author_raw:
+    #        self.author_givennames.append(i.text)
+    
+    def get_doi(self):
+        if self.doi == None:
+            doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
+            self.doi = str(doi_raw[3]+'/'+doi_raw[4])
 
     def get_abstract(self):
         """Get article abstract"""
@@ -36,21 +40,21 @@ class ScienceDirect(Publisher):
         for i in keys_raw:
             self.keywords.append(i.text)
 
-    def get_metadata(self):
-        """Get assortment of other metadata"""
-        if self.doi == None:
-            doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
-            self.doi = doi_raw[3]+'/'+doi_raw[4]
-
-        self.journal = self.soup.find('div',class_='publication-volume') \
-            .find('span',class_='size-xl').text
-
-        pubdate_raw = self.soup.find('div',class_='publication-volume') \
-            .find('span',class_='size-m').text.split(',')
-
-        self.year = pubdate_raw[-2].split(' ')[-1]
-        self.volume = pubdate_raw[0].split(' ')[1]
-        self.pages = pubdate_raw[-1].split(' ')[2]
+    #def get_metadata(self):
+    #    """Get assortment of other metadata"""
+    #    if self.doi == None:
+    #        doi_raw = self.soup.find('a',class_='doi').get('href').split('/')
+    #        self.doi = doi_raw[3]+'/'+doi_raw[4]
+    #
+    #    self.journal = self.soup.find('div',class_='publication-volume') \
+    #        .find('span',class_='size-xl').text
+    #
+    #    pubdate_raw = self.soup.find('div',class_='publication-volume') \
+    #        .find('span',class_='size-m').text.split(',')
+    #
+    #    self.year = pubdate_raw[-2].split(' ')[-1]
+    #    self.volume = pubdate_raw[0].split(' ')[1]
+    #    self.pages = pubdate_raw[-1].split(' ')[2]
 
     def get_body(self):
         """Get body of article"""
diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py
index bdd0354..77930d4 100644
--- a/article_epub/publishers/springer.py
+++ b/article_epub/publishers/springer.py
@@ -3,21 +3,26 @@ from article_epub.publisher import Publisher, register_publisher
 class Springer(Publisher):
     """Class for Springer articles"""
 
-    domains = ["link.springer.com","springer.com","www.springer.com"]
+    domains = ["springer.com"]
     
-    def get_title(self):
-        """Get article title"""
-        self.title = self.soup.find('h1',class_='ArticleTitle').text
-
-    def get_authors(self):
-        """Get author given and surnammes"""
-        author_raw = self.soup.find_all('span',class_='authors__name')
-        self.author_surnames = []
-        self.author_givennames = []
-        for i in author_raw:
-            name = i.text.split('\xa0')
-            self.author_surnames.append(name[-1])
-            self.author_givennames.append(' '.join(name[:-1]))
+    #def get_title(self):
+    #    """Get article title"""
+    #    self.title = self.soup.find('h1',class_='ArticleTitle').text
+
+    #def get_authors(self):
+    #    """Get author given names and surnames"""
+    #    author_raw = self.soup.find_all('span',class_='authors__name')
+    #    self.author_surnames = []
+    #    self.author_givennames = []
+    #    for i in author_raw:
+    #        name = i.text.split('\xa0')
+    #        self.author_surnames.append(name[-1])
+    #        self.author_givennames.append(' '.join(name[:-1]))
+
+    def get_doi(self):
+        if self.doi == None:
+            doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
+            self.doi = str(doi_raw[-2]+'/'+doi_raw[-1])
 
     def get_abstract(self):
         """Get article abstract"""
@@ -30,21 +35,21 @@ class Springer(Publisher):
         for i in keywords_raw:
             self.keywords.append(i.text.replace('\xa0',''))
 
-    def get_metadata(self):
-        """Get assortment of other metadata"""
-        if self.doi == None:
-            doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
-            self.doi = doi_raw[-2]+'/'+doi_raw[-1]
-
-        self.journal = self.soup.find('span',class_="JournalTitle").text
-
-        self.year = self.soup.find('time')['datetime'].split('-')[0]
-
-        self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
-            .text[:-2].split(' ')[-1]
-
-        self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
-            .text.split(' ')[-1]
+    #def get_metadata(self):
+    #   """Get assortment of other metadata"""
+    #    if self.doi == None:
+    #        doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/')
+    #        self.doi = doi_raw[-2]+'/'+doi_raw[-1]
+    #
+    #    self.journal = self.soup.find('span',class_="JournalTitle").text
+    #
+    #    self.year = self.soup.find('time')['datetime'].split('-')[0]
+    #
+    #    self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \
+    #        .text[:-2].split(' ')[-1]
+    #
+    #    self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \
+    #        .text.split(' ')[-1]
 
     def get_body(self):
         """Get body of article"""
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
new file mode 100644
index 0000000..0133c1d
--- /dev/null
+++ b/article_epub/publishers/wiley.py
@@ -0,0 +1,47 @@
+from article_epub.publisher import Publisher, register_publisher
+
+class Wiley(Publisher):
+    """Class for Wiley articles"""
+
+    domains = ["wiley.com"]
+
+    def get_final_url(self):
+        if '/abs/' in self.url:
+            self.url = self.url.replace('/abs/','/full/')
+   
+    def get_doi(self):
+        if self.doi == None:
+            doi_raw = self.soup.find('a',class_='epub-doi').text.split('/')
+            self.doi = str(doi_raw[3]+'/'+doi_raw[4])
+
+    def get_abstract(self):
+        """Get article abstract"""
+        self.abstract = self.soup.find('section',
+                class_='article-section__abstract')
+
+    def get_keywords(self):
+        """Get article keywords"""
+        keywords_raw = self.soup.find('section',class_='keywords') \
+            .find_all('a',class_='badge-type')
+        self.keywords = []
+        for i in keywords_raw:
+            self.keywords.append(i.text.replace('\n','').replace('\u200a',''))
+
+    def get_body(self):
+        """Get body of article"""
+        body_raw = self.soup.find_all('div',class_='article-section__content') 
+        body_raw = body_raw[1:]
+        self.body = ''
+        for i in body_raw:
+            self.body += str(i)
+    
+    def get_references(self):
+        """Get references list"""
+        references_raw = str(self.soup.find('section',
+            {'id':'references-section'}))
+        references_raw = references_raw.replace('"display: none;"','')
+        references_raw = references_raw.replace('Literature Cited','')
+        references_raw = references_raw.replace('data-bib-id','id')
+        self.references = '<h2>Literature Cited</h2>\n'+references_raw
+
+register_publisher(Wiley)
author	Ken Kellner <ken@kenkellner.com>	2018-04-05 08:20:40 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-05 08:20:40 -0400
commit	7aacb78f566a97c90b8f460bf33e075a022060c1 (patch)
tree	09ccb0001e6e6a9716379d287e4193265472228e
parent	52d5cc23a30b76f43522b2d37229d4449f099d73 (diff)