about summary refs log tree commit diff
diff options
context:
space:
mode:
authorKen Kellner <ken@kenkellner.com>2018-04-11 19:55:48 -0400
committerKen Kellner <ken@kenkellner.com>2018-04-11 19:55:48 -0400
commit9dcfc2b9b581b4bc8f0ed6343206399aefe18acc (patch)
tree5e1448ba40d4a344236cfc12a6253cc77c2dc284
parente7494fe7d2647ef61f1136606f4f3fa890625ded (diff)
Ignore keywords if necessary. Add ability to search by article title
-rwxr-xr-xarticle-epub.py59
-rw-r--r--article_epub/publisher.py21
-rw-r--r--article_epub/publishers/bioone.py2
-rw-r--r--article_epub/publishers/oxford.py10
-rw-r--r--article_epub/publishers/sciencedirect.py12
-rw-r--r--article_epub/publishers/wiley.py14
6 files changed, 84 insertions, 34 deletions
diff --git a/article-epub.py b/article-epub.py
index 9ccab6d..bfbed1b 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -3,36 +3,44 @@ import article_epub
import sys
import requests
import argparse
+from bs4 import BeautifulSoup
parser = argparse.ArgumentParser()
-parser.add_argument("-u","--url",type=str,help='URL of article',default=None)
-parser.add_argument("-d","--doi",type=str,help='DOI of article',default=None)
-parser.add_argument("-o","--out",type=str,help='Name of output file',
+parser.add_argument("-u",type=str,help='URL of article',
+ default=None,metavar='URL')
+parser.add_argument("-d",type=str,help='DOI of article'
+ ,default=None,metavar='DOI')
+parser.add_argument("-t",type=str,help='Title of article',
+ default=None,metavar='TITLE')
+parser.add_argument("-o",type=str,help='Name of output file',
default=None,metavar='FILE')
-parser.add_argument("-p","--publishers",help='List supported publishers',
+parser.add_argument("-p",help='List supported publishers',
action="store_true")
args = parser.parse_args()
def main():
-
- if args.publishers:
+ if args.p:
pubs = article_epub.publisher.list_publishers()
print('Available publishers:')
for i in pubs:
print('• '+i.__name__)
sys.exit()
- if args.doi == None and args.url == None:
- sys.exit('Must provide either URL or DOI')
+ if args.d == None and args.u == None and args.t == None:
+ sys.exit('Must provide URL, DOI or title')
- if args.doi != None:
+ if args.d != None:
print("Getting URL from DOI........",end='',flush=True)
- url = requests.get('https://doi.org/'+args.doi).url
- doi = args.doi
+ url = requests.get('https://doi.org/'+args.d,
+ headers={'User-Agent':'Mozilla/5.0'}).url
+ doi = args.d
print('done')
+ elif args.t != None:
+ url = url_from_title(args.t)
+ doi = None
else:
- url = args.url
+ url = args.u
doi = None
domain = ".".join(url.split("//")[-1].split("/")[0] \
@@ -46,10 +54,33 @@ def main():
art.soupify()
art.extract_data()
- art.epubify(args.out)
- print('\nCitation: '+art.citation)
+ art.epubify(args.o)
+ print('\nCitation: '+art.get_citation())
print('Filename: '+art.output)
+def url_from_title(title):
+ print("Getting URL from title......")
+ try:
+ url_stem = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C49&q="'
+ search = title.replace(' ','+').replace('\n','')
+ full_url = url_stem+search+'"'
+ out = requests.get(full_url,headers={'User-Agent':'Mozilla/5.0'})
+ soup = BeautifulSoup(out.content,'html.parser')
+ result = soup.find('div',class_='gs_scl') \
+ .find('div',class_='gs_ri').find('a')
+ possible_title = result.text
+ possible_link = result['href']
+
+ print('Found following article:')
+ print(possible_title)
+ choice = input("Is this correct (y/n)? ")
+ if choice == 'y':
+ return(possible_link)
+ else:
+ sys.exit('Getting URL from title failed')
+ except:
+ sys.exit('Getting URL from title failed')
+
main()
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 4f728f5..89bb8b3 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -88,7 +88,7 @@ class Publisher(object):
except:
self.pages = ''
- def get_citation(self):
+ def get_citation(self,link=False):
all_authors = ''
for i in range(0,len(self.author_surnames)):
@@ -101,13 +101,18 @@ class Publisher(object):
else:
cap = '. '
+ if link:
+ doi = '<a href="https://dx.doi.org/'+self.doi+'">'+self.doi+'</a>'
+ else:
+ doi = self.doi
+
if self.volume != '':
- self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
+ return(all_authors+cap+self.year+'. '+self.title+'. ' \
+self.journal+' '+self.volume+': '+self.pages+'.' \
- +' doi: '+self.doi
+ +' doi: '+doi)
else:
- self.citation = all_authors+cap+self.year+'. '+self.title+'. ' \
- +self.journal+'. '+' doi: '+self.doi
+ return(all_authors+cap+self.year+'. '+self.title+'. ' \
+ +self.journal+'. '+' doi: '+doi)
def extract_data(self):
self.check_fulltext()
@@ -138,6 +143,7 @@ class Publisher(object):
args.append('-M')
args.append('author="'+all_authors+'"')
args.append('--parse-raw')
+ args.append('--webtex')
if output == None:
self.output = self.author_surnames[0]+'_'+self.year+'.epub'
@@ -147,7 +153,7 @@ class Publisher(object):
output_raw = '/tmp/raw.epub'
combined = ''
- combined += str(self.citation)
+ combined += str(self.get_citation(link=True))
combined += str(self.abstract)
combined += str(self.body)
combined += str(self.references)
@@ -156,7 +162,8 @@ class Publisher(object):
epubout = pypandoc.convert_text(combined,format='html',to='epub',
extra_args=args,
outputfile=output_raw)
- subprocess.check_output(['ebook-convert',output_raw,self.output])
+ subprocess.check_output(['ebook-convert',output_raw,self.output,
+ '--no-default-epub-cover'])
print('done')
def register_publisher(publisher):
diff --git a/article_epub/publishers/bioone.py b/article_epub/publishers/bioone.py
index 4defe52..4d112fa 100644
--- a/article_epub/publishers/bioone.py
+++ b/article_epub/publishers/bioone.py
@@ -46,7 +46,7 @@ class BioOne(Publisher):
except:
pass
- print('Downloading higher-quality images...')
+ #print('Downloading higher-quality images...')
imgs_old = body_full.find_all('div',class_='articleImage')
for i in imgs_old:
try:
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
index b4481f6..ee72f11 100644
--- a/article_epub/publishers/oxford.py
+++ b/article_epub/publishers/oxford.py
@@ -26,10 +26,14 @@ class Oxford(Publisher):
def get_keywords(self):
"""Get article keywords"""
- keywords_raw = self.soup.find('div',class_='kwd-group').find_all('a')
self.keywords = []
- for i in keywords_raw:
- self.keywords.append(i.text)
+ try:
+ keywords_raw = self.soup.find('div',class_='kwd-group') \
+ .find_all('a')
+ for i in keywords_raw:
+ self.keywords.append(i.text)
+ except:
+ pass
def get_body(self):
"""Get body of article"""
diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py
index 27b3a70..3ffcad1 100644
--- a/article_epub/publishers/sciencedirect.py
+++ b/article_epub/publishers/sciencedirect.py
@@ -23,11 +23,15 @@ class ScienceDirect(Publisher):
def get_keywords(self):
"""Get article keywords"""
- keys_raw = self.soup.find('div',class_='Keywords') \
- .find_all('div',class_='keyword')
self.keywords = []
- for i in keys_raw:
- self.keywords.append(i.text)
+ try:
+ keys_raw = self.soup.find('div',class_='Keywords') \
+ .find_all('div',class_='keyword')
+ self.keywords = []
+ for i in keys_raw:
+ self.keywords.append(i.text)
+ except:
+ pass
def get_body(self):
"""Get body of article"""
diff --git a/article_epub/publishers/wiley.py b/article_epub/publishers/wiley.py
index 00066df..d5fbdc7 100644
--- a/article_epub/publishers/wiley.py
+++ b/article_epub/publishers/wiley.py
@@ -12,7 +12,7 @@ class Wiley(Publisher):
def check_fulltext(self):
test = self.soup.find_all('div',class_='article-section__content')
- if len(test) < 3:
+ if len(test) < 4:
sys.exit('Error: Can\'t access fulltext of article')
else:
return(True)
@@ -29,11 +29,15 @@ class Wiley(Publisher):
def get_keywords(self):
"""Get article keywords"""
- keywords_raw = self.soup.find('section',class_='keywords') \
- .find_all('a',class_='badge-type')
self.keywords = []
- for i in keywords_raw:
- self.keywords.append(i.text.replace('\n','').replace('\u200a',''))
+ try:
+ keywords_raw = self.soup.find('section',class_='keywords') \
+ .find_all('a',class_='badge-type')
+ for i in keywords_raw:
+ self.keywords.append(i.text.replace('\n','') \
+ .replace('\u200a',''))
+ except:
+ pass
def get_body(self):
"""Get body of article"""