diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-04 16:23:43 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-04 16:23:43 -0400 |
commit | b37149bedcfcac84f74d8057f1de843c42b85753 (patch) | |
tree | 589b795b823ffcfe2a1dd8c9f0a66ebd4872e5e1 | |
parent | 9366bac2de4c94fb01c7c67e191e55bd39b78aba (diff) |
Reorganize and add Springer recipe
-rwxr-xr-x | article-epub.py | 23 | ||||
-rw-r--r-- | article_epub/publisher.py (renamed from article_epub/sciarticle.py) | 17 | ||||
-rw-r--r-- | article_epub/publishers/__init__.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/sciencedirect.py | 35 | ||||
-rw-r--r-- | article_epub/publishers/springer.py | 57 | ||||
-rwxr-xr-x | sci-scraper.py | 39 |
6 files changed, 113 insertions, 59 deletions
diff --git a/article-epub.py b/article-epub.py new file mode 100755 index 0000000..431c330 --- /dev/null +++ b/article-epub.py @@ -0,0 +1,23 @@ +#!/usr/bin/python3 +import article_epub +import sys +import requests + +def main(): + if sys.argv[1] == '-d': + url = requests.get('https://doi.org/'+sys.argv[2]).url + else: + url = sys.argv[1] + + domain = url.split("//")[-1].split("/")[0].split('?')[0] + + art = article_epub.publisher.get_publishers()[domain](url=url) + + print('Downloading content...') + art.soupify() + art.extract_data() + art.epubify() + + +main() + diff --git a/article_epub/sciarticle.py b/article_epub/publisher.py index cd828f0..f8e5424 100644 --- a/article_epub/sciarticle.py +++ b/article_epub/publisher.py @@ -7,7 +7,10 @@ import pypandoc from time import sleep import subprocess -class SciArticle(object): +_publishers = list() +_publisher_domains = dict() + +class Publisher(object): def __init__(self, url, doi=None, out_format='epub'): self.url = url @@ -85,7 +88,7 @@ class SciArticle(object): args.append('author="'+all_authors+'"') args.append('--parse-raw') - self.output = self.author_surnames[0]+self.year+'.epub' + self.output = self.author_surnames[0]+'_'+self.year+'.epub' output_raw = '/tmp/raw.epub' combined = '' @@ -101,6 +104,16 @@ class SciArticle(object): subprocess.check_output(['ebook-convert',output_raw,self.output]) +def register_publisher(publisher): + _publishers.append(publisher) + for d in publisher.domains: + _publisher_domains[d] = publisher + +def get_publishers(): + return _publisher_domains + + + diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py index a51d157..939ee39 100644 --- a/article_epub/publishers/__init__.py +++ b/article_epub/publishers/__init__.py @@ -1 +1,2 @@ from article_epub.publishers.sciencedirect import ScienceDirect +from article_epub.publishers.springer import Springer diff --git a/article_epub/publishers/sciencedirect.py b/article_epub/publishers/sciencedirect.py index 218cb98..26ca971 100644 --- a/article_epub/publishers/sciencedirect.py +++ b/article_epub/publishers/sciencedirect.py @@ -1,11 +1,17 @@ -from article_epub.sciarticle import SciArticle +from article_epub.publisher import Publisher, register_publisher -class ScienceDirect(SciArticle): +class ScienceDirect(Publisher): + """Class for Science Direct (Elsevier) articles""" + + domains = ["sciencedirect.com","www.sciencedirect.com", + "linkinghub.elsevier.com"] def get_title(self): + """Get article title""" self.title = self.soup.find('span',class_='title-text').text def get_authors(self): + """Get author given and surnammes""" author_raw = self.soup.find('div',class_='author-group') \ .find_all('span',class_='text surname') self.author_surnames = [] @@ -19,9 +25,11 @@ class ScienceDirect(SciArticle): self.author_givennames.append(i.text) def get_abstract(self): + """Get article abstract""" self.abstract = self.soup.find('div',class_='abstract author') def get_keywords(self): + """Get article keywords""" keys_raw = self.soup.find('div',class_='Keywords') \ .find_all('div',class_='keyword') self.keywords = [] @@ -29,6 +37,7 @@ class ScienceDirect(SciArticle): self.keywords.append(i.text) def get_metadata(self): + """Get assortment of other metadata""" if self.doi == None: doi_raw = self.soup.find('a',class_='doi').get('href').split('/') self.doi = doi_raw[3]+'/'+doi_raw[4] @@ -39,27 +48,17 @@ class ScienceDirect(SciArticle): pubdate_raw = self.soup.find('div',class_='publication-volume') \ .find('span',class_='size-m').text.split(',') - self.year = pubdate_raw[1].split(' ')[-1] + self.year = pubdate_raw[-2].split(' ')[-1] self.volume = pubdate_raw[0].split(' ')[1] - self.pages = pubdate_raw[2].split(' ')[2] + self.pages = pubdate_raw[-1].split(' ')[2] def get_body(self): + """Get body of article""" body_raw = str(self.soup.find('div',class_='Body')) - self.body = body_raw.replace('#b','#ref-id-b') + self.body = body_raw.replace('#b','#ref-id-b') #Fix anchors def get_references(self): + """Get references list""" self.references = self.soup.find('section',class_='bibliography') - - - - - - - - - - - - - +register_publisher(ScienceDirect) diff --git a/article_epub/publishers/springer.py b/article_epub/publishers/springer.py new file mode 100644 index 0000000..bdd0354 --- /dev/null +++ b/article_epub/publishers/springer.py @@ -0,0 +1,57 @@ +from article_epub.publisher import Publisher, register_publisher + +class Springer(Publisher): + """Class for Springer articles""" + + domains = ["link.springer.com","springer.com","www.springer.com"] + + def get_title(self): + """Get article title""" + self.title = self.soup.find('h1',class_='ArticleTitle').text + + def get_authors(self): + """Get author given and surnammes""" + author_raw = self.soup.find_all('span',class_='authors__name') + self.author_surnames = [] + self.author_givennames = [] + for i in author_raw: + name = i.text.split('\xa0') + self.author_surnames.append(name[-1]) + self.author_givennames.append(' '.join(name[:-1])) + + def get_abstract(self): + """Get article abstract""" + self.abstract = self.soup.find('section',class_='Abstract') + + def get_keywords(self): + """Get article keywords""" + keywords_raw = self.soup.find_all('span',class_='Keyword') + self.keywords = [] + for i in keywords_raw: + self.keywords.append(i.text.replace('\xa0','')) + + def get_metadata(self): + """Get assortment of other metadata""" + if self.doi == None: + doi_raw = self.soup.find('span',{"id":"doi-url"}).text.split('/') + self.doi = doi_raw[-2]+'/'+doi_raw[-1] + + self.journal = self.soup.find('span',class_="JournalTitle").text + + self.year = self.soup.find('time')['datetime'].split('-')[0] + + self.volume = self.soup.find('span',class_="ArticleCitation_Volume") \ + .text[:-2].split(' ')[-1] + + self.pages = self.soup.find('span',class_="ArticleCitation_Pages") \ + .text.split(' ')[-1] + + def get_body(self): + """Get body of article""" + self.body = self.soup.find('div',{"id":"body"}) + + def get_references(self): + """Get references list""" + self.references = self.soup.find('section',{"id":"Bib1"}) + +register_publisher(Springer) diff --git a/sci-scraper.py b/sci-scraper.py deleted file mode 100755 index 6bb5861..0000000 --- a/sci-scraper.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -from article_epub.publishers import ScienceDirect -import sys -import requests - -def main(): - if sys.argv[1] == '-d': - url = requests.get('https://doi.org/'+sys.argv[2]).url - art = ScienceDirect(url=url,doi=sys.argv[2]) - else: - url = sys.argv[1] - art = ScienceDirect(url=url) - print('Downloading content...') - art.soupify() - art.extract_data() - art.epubify() - - -main() -#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S037811271630723X') - -#test = ScienceDirect(url='https://www.sciencedirect.com/science/article/pii/S0946672X17308763') - -#test.soupify() -#test.extract_data() -#test.epubify() - -##### - -#import urllib.request - - -#def final_url(url=None,doi=None): -# if url !=None: -# response = requests.get(url) -# elif doi !=None: -# response = request.get('https://doi.org/'+doi) - - |