Skip keywords when they cannot be extracted. Add ability to search by article title

author: Ken Kellner <ken@kenkellner.com> 2018-04-11 19:55:48 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-11 19:55:48 -0400
commit: 9dcfc2b9b581b4bc8f0ed6343206399aefe18acc (patch)
tree: 5e1448ba40d4a344236cfc12a6253cc77c2dc284
parent: e7494fe7d2647ef61f1136606f4f3fa890625ded (diff)
6 files changed, 84 insertions, 34 deletions
diff --git a/article-epub.py b/article-epub.py
index 9ccab6d..bfbed1b 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -3,36 +3,44 @@ import article_epub
 import sys
 import requests
 import argparse
+from bs4 import BeautifulSoup
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument("-u","--url",type=str,help='URL of article',default=None)
-parser.add_argument("-d","--doi",type=str,help='DOI of article',default=None)
-parser.add_argument("-o","--out",type=str,help='Name of output file',
+parser.add_argument("-u",type=str,help='URL of article',
+        default=None,metavar='URL')
+parser.add_argument("-d",type=str,help='DOI of article'
+        ,default=None,metavar='DOI')
+parser.add_argument("-t",type=str,help='Title of article',
+        default=None,metavar='TITLE')
+parser.add_argument("-o",type=str,help='Name of output file',
         default=None,metavar='FILE')
-parser.add_argument("-p","--publishers",help='List supported publishers',
+parser.add_argument("-p",help='List supported publishers',
         action="store_true")
 args = parser.parse_args()
 
 def main():
-
-    if args.publishers:
+    if args.p:
         pubs = article_epub.publisher.list_publishers()
         print('Available publishers:')
         for i in pubs:
             print('• '+i.__name__)
         sys.exit()
 
-    if args.doi == None and args.url == None:
-        sys.exit('Must provide either URL or DOI')
+    if args.d == None and args.u == None and args.t == None:
+        sys.exit('Must provide URL, DOI or title')
 
-    if args.doi != None:
+    if args.d != None:
         print("Getting URL from DOI........",end='',flush=True)
-        url = requests.get('https://doi.org/'+args.doi).url
-        doi = args.doi
+        url = requests.get('https://doi.org/'+args.d,
+                headers={'User-Agent':'Mozilla/5.0'}).url
+        doi = args.d
         print('done')
+    elif args.t != None:
+        url = url_from_title(args.t)
+        doi = None
     else:
-        url = args.url
+        url = args.u
         doi = None
     
     domain = ".".join(url.split("//")[-1].split("/")[0] \
@@ -46,10 +54,33 @@ def main():
 
     art.soupify()
     art.extract_data()
-    art.epubify(args.out)
-    print('\nCitation: '+art.citation)
+    art.epubify(args.o)
+    print('\nCitation: '+art.get_citation())
     print('Filename: '+art.output)
 
+def url_from_title(title):
+    print("Getting URL from title......")
+    try:
+        url_stem = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C49&q="'
+        search = title.replace(' ','+').replace('\n','')
+        full_url = url_stem+search+'"'
+        out = requests.get(full_url,headers={'User-Agent':'Mozilla/5.0'})
+        soup = BeautifulSoup(out.content,'html.parser')
+        result = soup.find('div',class_='gs_scl') \
+            .find('div',class_='gs_ri').find('a')
+        possible_title = result.text
+        possible_link = result['href']
+
+        print('Found following article:')
+        print(possible_title)
+        choice = input("Is this correct (y/n)? ")
+        if choice == 'y':
+            return(possible_link)
+        else:
+            sys.exit('Getting URL from title failed')
+    except:
+        sys.exit('Getting URL from title failed')
+
 
 main()
 
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 4f728f5..89bb8b3 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -88,7 +88,7 @@ class Publisher(object):
         except:
             self.pages = ''
 
-    def get_citation(self):
+    def get_citation(self,link=False):
         
         all_authors = ''
         for i in range(0,len(self.author_surnames)):
@@ -101,13 +101,18 @@ class Publisher(object):
         else:
             cap = '. '
         
+        if link:
+            doi = '<a href="https://dx.doi.org/'+self.doi+'">'+self.doi+'</a>'
+        else:
+            doi = self.doi
+
         if self.volume != '':
-            self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+            return(all_authors+cap+self.year+'. '+self.title+'. ' \
                     +self.journal+' '+self.volume+': '+self.pages+'.' \
-                    +' doi: '+self.doi
+                    +' doi: '+doi) 
         else:
-            self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
-                    +self.journal+'. '+' doi: '+self.doi
+            return(all_authors+cap+self.year+'. '+self.title+'. ' \
+                    +self.journal+'. '+' doi: '+doi)
     
     def extract_data(self):
         self.check_fulltext()
@@ -138,6 +143,7 @@ class Publisher(object):
         args.append('-M')
         args.append('author="'+all_authors+'"')
         args.append('--parse-raw')
+        args.append('--webtex')
         
         if output == None:
             self.output = self.author_surnames[0]+'_'+self.year+'.epub'
@@ -147,7 +153,7 @@ class Publisher(object):
         output_raw = '/tmp/raw.epub'
 
         combined = ''
-        combined += str(self.citation)
+        combined += str(self.get_citation(link=True))
         combined += str(self.abstract)
         combined += str(self.body)
         combined += str(self.references)
@@ -156,7 +162,8 @@ class Publisher(object):
         epubout = pypandoc.convert_text(combined,format='html',to='epub',
                 extra_args=args,
                 outputfile=output_raw)
-        subprocess.check_output(['ebook-convert',output_raw,self.output])
+        subprocess.check_output(['ebook-convert',output_raw,self.output,
+            '--no-default-epub-cover'])
         print('done')
 
 def register_publisher(publisher):
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
index 4defe52..4d112fa 100644
--- a/article_epub/publishers/bioone.py
+++ b/article_epub/publishers/bioone.py
@@ -46,7 +46,7 @@ class BioOne(Publisher):
             except:
                 pass
         
-        print('Downloading higher-quality images...')
+        #print('Downloading higher-quality images...')
         imgs_old = body_full.find_all('div',class_='articleImage')
         for i in imgs_old:
             try:
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index b4481f6..ee72f11 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -26,10 +26,14 @@ class Oxford(Publisher):
 
     def get_keywords(self):
         """Get article keywords"""
-        keywords_raw = self.soup.find('div',class_='kwd-group').find_all('a')
         self.keywords = []
-        for i in keywords_raw:
-            self.keywords.append(i.text)
+        try:
+            keywords_raw = self.soup.find('div',class_='kwd-group') \
+                .find_all('a')
+            for i in keywords_raw:
+                self.keywords.append(i.text)
+        except:
+            pass
 
     def get_body(self):
         """Get body of article"""
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 27b3a70..3ffcad1 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -23,11 +23,15 @@ class ScienceDirect(Publisher):
 
     def get_keywords(self):
         """Get article keywords"""
-        keys_raw = self.soup.find('div',class_='Keywords') \
-            .find_all('div',class_='keyword')
         self.keywords = []
-        for i in keys_raw:
-            self.keywords.append(i.text)
+        try:
+            keys_raw = self.soup.find('div',class_='Keywords') \
+                .find_all('div',class_='keyword')
+            self.keywords = []
+            for i in keys_raw:
+                self.keywords.append(i.text)
+        except:
+            pass
 
     def get_body(self):
         """Get body of article"""
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
index 00066df..d5fbdc7 100644
--- a/article_epub/publishers/wiley.py
+++ b/article_epub/publishers/wiley.py
@@ -12,7 +12,7 @@ class Wiley(Publisher):
    
     def check_fulltext(self):
         test = self.soup.find_all('div',class_='article-section__content')
-        if len(test) < 3:
+        if len(test) < 4:
             sys.exit('Error: Can\'t access fulltext of article')
         else:
             return(True)
@@ -29,11 +29,15 @@ class Wiley(Publisher):
 
     def get_keywords(self):
         """Get article keywords"""
-        keywords_raw = self.soup.find('section',class_='keywords') \
-            .find_all('a',class_='badge-type')
         self.keywords = []
-        for i in keywords_raw:
-            self.keywords.append(i.text.replace('\n','').replace('\u200a',''))
+        try:
+            keywords_raw = self.soup.find('section',class_='keywords') \
+                .find_all('a',class_='badge-type')
+            for i in keywords_raw:
+                self.keywords.append(i.text.replace('\n','') \
+                        .replace('\u200a',''))
+        except:
+            pass
 
     def get_body(self):
         """Get body of article"""
author	Ken Kellner <ken@kenkellner.com>	2018-04-11 19:55:48 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-11 19:55:48 -0400
commit	9dcfc2b9b581b4bc8f0ed6343206399aefe18acc (patch)
tree	5e1448ba40d4a344236cfc12a6253cc77c2dc284
parent	e7494fe7d2647ef61f1136606f4f3fa890625ded (diff)