diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-06 14:30:01 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-06 14:30:01 -0400 |
commit | cdbb1518012a239ebe31dfd1032ad7116c8c8c36 (patch) | |
tree | c6a25a7dc53ffabcaf6c7b6922b1bbe99614ae2c | |
parent | a5ae056da7a2b739a7412c854f76f33958c13e4a (diff) |
Add NIH/NCBI support
-rw-r--r-- | article_epub/publishers/__init__.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/nih.py | 57 |
2 files changed, 58 insertions, 0 deletions
```diff
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 6aa2f43..77fe75c 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -4,3 +4,4 @@ from article_epub.publishers.wiley import Wiley
 from article_epub.publishers.bioone import BioOne
 from article_epub.publishers.plosone import PLoSONE
 from article_epub.publishers.oxford import Oxford
+from article_epub.publishers.nih import NIH
diff --git a/article_epub/publishers/nih.py b/article_epub/publishers/nih.py
new file mode 100644
index 0000000..60294aa
--- /dev/null
+++ b/article_epub/publishers/nih.py
@@ -0,0 +1,57 @@
+from article_epub.publisher import Publisher, register_publisher
+import requests
+import subprocess
+from bs4 import BeautifulSoup
+
+class NIH(Publisher):
+    """Class for NIH NCBI articles"""
+
+    domains = ["nih.gov"]
+
+    def soupify(self):
+        print('Loading page................',end="",flush=True)
+        req = requests.get(self.url,headers={'User-Agent':'Mozilla/5.0'})
+        self.soup = BeautifulSoup(req.content,'html.parser')
+        print('done')
+
+    def get_doi(self):
+        if self.doi == None:
+            try:
+                self.doi = self.soup.find('span',class_='doi').find('a').text
+            except:
+                self.doi = ''
+
+    def extract_data(self):
+        print('Extracting data from HTML...',end='',flush=True)
+        self.get_doi()
+        self.get_metadata()
+        self.get_citation()
+        print('done')
+
+    def epubify(self):
+
+        all_authors = ''
+        for i in range(0,len(self.author_surnames)):
+            all_authors += self.author_givennames[i] + ' '
+            all_authors += self.author_surnames[i]
+            if(i != (len(self.author_surnames) - 1)):
+                all_authors += ', '
+
+        self.output = self.author_surnames[0]+'_'+self.year+'.epub'
+        output_raw = '/tmp/raw.epub'
+
+        pdf_link = self.soup.find('div',class_='format-menu') \
+            .find_all('a')[2]['href']
+        epub_link = 'http://ncbi.nlm.nih.gov'+str(pdf_link) \
+            .replace('pdf','epub')
+
+        print('Generating epub.............',end='',flush=True)
+        epub = requests.get(epub_link,headers={'User-Agent':'Mozilla/5.0'})
+        with open(output_raw, 'wb') as f:
+            f.write(epub.content)
+            f.close()
+        subprocess.check_output(['ebook-convert',output_raw,self.output,
+            '--authors',all_authors])
+        print('done')
+
+register_publisher(NIH)
```