aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-04-06 14:30:01 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-04-06 14:30:01 -0400
commit cdbb1518012a239ebe31dfd1032ad7116c8c8c36 (patch)
tree c6a25a7dc53ffabcaf6c7b6922b1bbe99614ae2c
parent a5ae056da7a2b739a7412c854f76f33958c13e4a (diff)
Add NIH/NCBI support
-rw-r--r-- article_epub/publishers/__init__.py 1
-rw-r--r-- article_epub/publishers/nih.py 57
2 files changed, 58 insertions, 0 deletions
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 6aa2f43..77fe75c 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -4,3 +4,4 @@ from article_epub.publishers.wiley import Wiley
from article_epub.publishers.bioone import BioOne
from article_epub.publishers.plosone import PLoSONE
from article_epub.publishers.oxford import Oxford
+from article_epub.publishers.nih import NIH
diff --git a/article_epub/publishers/nih.py b/article_epub/publishers/nih.py
new file mode 100644
index 0000000..60294aa
--- /dev/null
+++ b/article_epub/publishers/nih.py
@@ -0,0 +1,57 @@
+import os
+import subprocess
+import tempfile
+
+import requests
+from bs4 import BeautifulSoup
+
+from article_epub.publisher import Publisher, register_publisher
+
+class NIH(Publisher):
+    """Publisher backend for NIH NCBI articles (``nih.gov``).
+
+    The article page is fetched and parsed with BeautifulSoup; the epub is
+    then downloaded from a link derived from the page's ``format-menu``
+    block and run through ``ebook-convert`` to set the author list.
+
+    NOTE(review): ``self.url``, ``self.doi``, ``self.year``,
+    ``self.author_surnames``, ``self.author_givennames``, ``get_metadata``
+    and ``get_citation`` are not defined in this class; presumably they are
+    provided by ``Publisher`` -- confirm against article_epub/publisher.py.
+    """
+
+    # NOTE(review): presumably matched against the article URL when picking
+    # a publisher class -- confirm in register_publisher.
+    domains = ["nih.gov"]
+
+    def soupify(self):
+        """Fetch ``self.url`` and store the parsed HTML in ``self.soup``."""
+        print('Loading page................',end="",flush=True)
+        req = requests.get(self.url,headers={'User-Agent':'Mozilla/5.0'})
+        self.soup = BeautifulSoup(req.content,'html.parser')
+        print('done')
+
+    def get_doi(self):
+        """Fill in ``self.doi`` from the page when it is still ``None``.
+
+        The DOI is the text of the link inside ``<span class="doi">``.
+        If that markup is missing, ``self.doi`` is set to an empty string.
+        """
+        if self.doi is None:
+            try:
+                self.doi = self.soup.find('span',class_='doi').find('a').text
+            except AttributeError:
+                # A find() call returned None: no DOI span, or no link in it.
+                self.doi = ''
+
+    def extract_data(self):
+        """Populate DOI, metadata and citation from the parsed page."""
+        print('Extracting data from HTML...',end='',flush=True)
+        self.get_doi()
+        self.get_metadata()
+        self.get_citation()
+        print('done')
+
+    def epubify(self):
+        """Download the article's epub and write it to ``self.output``.
+
+        ``self.output`` is set to ``<first surname>_<year>.epub``. The raw
+        epub is saved in a private temporary directory and converted with
+        ``ebook-convert``, passing the authors as "Given Surname" entries
+        joined by ", ".
+
+        Raises:
+            IndexError: if there are no authors, or fewer given names than
+                surnames.
+            requests.HTTPError: if the epub download returns an error status.
+            subprocess.CalledProcessError: if ``ebook-convert`` exits with a
+                non-zero status.
+        """
+        # Index the given names explicitly so a length mismatch still raises
+        # instead of silently dropping authors.
+        all_authors = ', '.join(
+            self.author_givennames[i] + ' ' + surname
+            for i, surname in enumerate(self.author_surnames))
+
+        self.output = self.author_surnames[0]+'_'+self.year+'.epub'
+
+        # NOTE(review): relies on the third link of the format menu being
+        # the PDF link -- confirm against the current NCBI page layout.
+        format_menu = self.soup.find('div',class_='format-menu')
+        pdf_link = format_menu.find_all('a')[2]['href']
+        # Only the path is rewritten; the host prefix is left untouched.
+        epub_path = str(pdf_link).replace('pdf','epub')
+        epub_link = 'http://ncbi.nlm.nih.gov'+epub_path
+
+        print('Generating epub.............',end='',flush=True)
+        epub = requests.get(epub_link,headers={'User-Agent':'Mozilla/5.0'})
+        # Fail here rather than handing an HTML error page to ebook-convert.
+        epub.raise_for_status()
+
+        # A private temporary directory avoids clashes between concurrent
+        # runs on a fixed /tmp path and removes the raw file afterwards.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            output_raw = os.path.join(tmp_dir,'raw.epub')
+            with open(output_raw, 'wb') as f:
+                f.write(epub.content)
+            subprocess.check_output(['ebook-convert',output_raw,self.output,
+                                     '--authors',all_authors])
+        print('done')
+
+# Runs at import time. NOTE(review): presumably adds NIH to the set of
+# publisher classes selectable by domain -- confirm in
+# article_epub/publisher.py.
+register_publisher(NIH)