diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-11 19:55:48 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-11 19:55:48 -0400 |
commit | 9dcfc2b9b581b4bc8f0ed6343206399aefe18acc (patch) | |
tree | 5e1448ba40d4a344236cfc12a6253cc77c2dc284 | |
parent | e7494fe7d2647ef61f1136606f4f3fa890625ded (diff) |
Ignore keywords if necessary. Add ability to search by article title
-rwxr-xr-x | article-epub.py | 59 | ||||
-rw-r--r-- | article_epub/publisher.py | 21 | ||||
-rw-r--r-- | article_epub/publishers/bioone.py | 2 | ||||
-rw-r--r-- | article_epub/publishers/oxford.py | 10 | ||||
-rw-r--r-- | article_epub/publishers/sciencedirect.py | 12 | ||||
-rw-r--r-- | article_epub/publishers/wiley.py | 14 |
6 files changed, 84 insertions, 34 deletions
diff --git a/article-epub.py b/article-epub.py index 9ccab6d..bfbed1b 100755 --- a/article-epub.py +++ b/article-epub.py @@ -3,36 +3,44 @@ import article_epub import sys import requests import argparse +from bs4 import BeautifulSoup parser = argparse.ArgumentParser() -parser.add_argument("-u","--url",type=str,help='URL of article',default=None) -parser.add_argument("-d","--doi",type=str,help='DOI of article',default=None) -parser.add_argument("-o","--out",type=str,help='Name of output file', +parser.add_argument("-u",type=str,help='URL of article', + default=None,metavar='URL') +parser.add_argument("-d",type=str,help='DOI of article' + ,default=None,metavar='DOI') +parser.add_argument("-t",type=str,help='Title of article', + default=None,metavar='TITLE') +parser.add_argument("-o",type=str,help='Name of output file', default=None,metavar='FILE') -parser.add_argument("-p","--publishers",help='List supported publishers', +parser.add_argument("-p",help='List supported publishers', action="store_true") args = parser.parse_args() def main(): - - if args.publishers: + if args.p: pubs = article_epub.publisher.list_publishers() print('Available publishers:') for i in pubs: print('• '+i.__name__) sys.exit() - if args.doi == None and args.url == None: - sys.exit('Must provide either URL or DOI') + if args.d == None and args.u == None and args.t == None: + sys.exit('Must provide URL, DOI or title') - if args.doi != None: + if args.d != None: print("Getting URL from DOI........",end='',flush=True) - url = requests.get('https://doi.org/'+args.doi).url - doi = args.doi + url = requests.get('https://doi.org/'+args.d, + headers={'User-Agent':'Mozilla/5.0'}).url + doi = args.d print('done') + elif args.t != None: + url = url_from_title(args.t) + doi = None else: - url = args.url + url = args.u doi = None domain = ".".join(url.split("//")[-1].split("/")[0] \ @@ -46,10 +54,33 @@ def main(): art.soupify() art.extract_data() - art.epubify(args.out) - print('\nCitation: '+art.citation) + art.epubify(args.o) + print('\nCitation: '+art.get_citation()) print('Filename: '+art.output) +def url_from_title(title): + print("Getting URL from title......") + try: + url_stem = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C49&q="' + search = title.replace(' ','+').replace('\n','') + full_url = url_stem+search+'"' + out = requests.get(full_url,headers={'User-Agent':'Mozilla/5.0'}) + soup = BeautifulSoup(out.content,'html.parser') + result = soup.find('div',class_='gs_scl') \ + .find('div',class_='gs_ri').find('a') + possible_title = result.text + possible_link = result['href'] + + print('Found following article:') + print(possible_title) + choice = input("Is this correct (y/n)? ") + if choice == 'y': + return(possible_link) + else: + sys.exit('Getting URL from title failed') + except: + sys.exit('Getting URL from title failed') + main() diff --git a/article_epub/publisher.py b/article_epub/publisher.py index 4f728f5..89bb8b3 100644 --- a/article_epub/publisher.py +++ b/article_epub/publisher.py @@ -88,7 +88,7 @@ class Publisher(object): except: self.pages = '' - def get_citation(self): + def get_citation(self,link=False): all_authors = '' for i in range(0,len(self.author_surnames)): @@ -101,13 +101,18 @@ class Publisher(object): else: cap = '. ' + if link: + doi = '<a href="https://dx.doi.org/'+self.doi+'">'+self.doi+'</a>' + else: + doi = self.doi + if self.volume != '': - self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \ + return(all_authors+cap+self.year+'. '+self.title+'. ' \ +self.journal+' '+self.volume+': '+self.pages+'.' \ - +' doi: '+self.doi + +' doi: '+doi) else: - self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \ - +self.journal+'. '+' doi: '+self.doi + return(all_authors+cap+self.year+'. '+self.title+'. ' \ + +self.journal+'. '+' doi: '+doi) def extract_data(self): self.check_fulltext() @@ -138,6 +143,7 @@ class Publisher(object): args.append('-M') args.append('author="'+all_authors+'"') args.append('--parse-raw') + args.append('--webtex') if output == None: self.output = self.author_surnames[0]+'_'+self.year+'.epub' @@ -147,7 +153,7 @@ class Publisher(object): output_raw = '/tmp/raw.epub' combined = '' - combined += str(self.citation) + combined += str(self.get_citation(link=True)) combined += str(self.abstract) combined += str(self.body) combined += str(self.references) @@ -156,7 +162,8 @@ class Publisher(object): epubout = pypandoc.convert_text(combined,format='html',to='epub', extra_args=args, outputfile=output_raw) - subprocess.check_output(['ebook-convert',output_raw,self.output]) + subprocess.check_output(['ebook-convert',output_raw,self.output, + '--no-default-epub-cover']) print('done') def register_publisher(publisher): diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py index 4defe52..4d112fa 100644 --- a/article_epub/publishers/bioone.py +++ b/article_epub/publishers/bioone.py @@ -46,7 +46,7 @@ class BioOne(Publisher): except: pass - print('Downloading higher-quality images...') + #print('Downloading higher-quality images...') imgs_old = body_full.find_all('div',class_='articleImage') for i in imgs_old: try: diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py index b4481f6..ee72f11 100644 --- a/article_epub/publishers/oxford.py +++ b/article_epub/publishers/oxford.py @@ -26,10 +26,14 @@ class Oxford(Publisher): def get_keywords(self): """Get article keywords""" - keywords_raw = self.soup.find('div',class_='kwd-group').find_all('a') self.keywords = [] - for i in keywords_raw: - self.keywords.append(i.text) + try: + keywords_raw = self.soup.find('div',class_='kwd-group') \ + .find_all('a') + for i in keywords_raw: + self.keywords.append(i.text) + except: + pass def get_body(self): """Get body of article""" diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py index 27b3a70..3ffcad1 100644 --- a/article_epub/publishers/sciencedirect.py +++ b/article_epub/publishers/sciencedirect.py @@ -23,11 +23,15 @@ class ScienceDirect(Publisher): def get_keywords(self): """Get article keywords""" - keys_raw = self.soup.find('div',class_='Keywords') \ - .find_all('div',class_='keyword') self.keywords = [] - for i in keys_raw: - self.keywords.append(i.text) + try: + keys_raw = self.soup.find('div',class_='Keywords') \ + .find_all('div',class_='keyword') + self.keywords = [] + for i in keys_raw: + self.keywords.append(i.text) + except: + pass def get_body(self): """Get body of article""" diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py index 00066df..d5fbdc7 100644 --- a/article_epub/publishers/wiley.py +++ b/article_epub/publishers/wiley.py @@ -12,7 +12,7 @@ class Wiley(Publisher): def check_fulltext(self): test = self.soup.find_all('div',class_='article-section__content') - if len(test) < 3: + if len(test) < 4: sys.exit('Error: Can\'t access fulltext of article') else: return(True) @@ -29,11 +29,15 @@ class Wiley(Publisher): def get_keywords(self): """Get article keywords""" - keywords_raw = self.soup.find('section',class_='keywords') \ - .find_all('a',class_='badge-type') self.keywords = [] - for i in keywords_raw: - self.keywords.append(i.text.replace('\n','').replace('\u200a','')) + try: + keywords_raw = self.soup.find('section',class_='keywords') \ + .find_all('a',class_='badge-type') + for i in keywords_raw: + self.keywords.append(i.text.replace('\n','') \ + .replace('\u200a','')) + except: + pass def get_body(self): """Get body of article""" |