Add NIH/NCBI support

author: Ken Kellner <ken@kenkellner.com> 2018-04-06 14:30:01 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-06 14:30:01 -0400
commit: cdbb1518012a239ebe31dfd1032ad7116c8c8c36 (patch)
tree: c6a25a7dc53ffabcaf6c7b6922b1bbe99614ae2c
parent: a5ae056da7a2b739a7412c854f76f33958c13e4a (diff)
2 files changed, 58 insertions, 0 deletions
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 6aa2f43..77fe75c 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -4,3 +4,4 @@ from article_epub.publishers.wiley import Wiley
 from article_epub.publishers.bioone import BioOne
 from article_epub.publishers.plosone import PLoSONE
 from article_epub.publishers.oxford import Oxford
+from article_epub.publishers.nih import NIH
diff --git a/article_epub/publishers/nih.py b/article_epub/publishers/nih.py
new file mode 100644
index 0000000..60294aa
--- /dev/null
+++ b/article_epub/publishers/nih.py
@@ -0,0 +1,57 @@
+from article_epub.publisher import Publisher, register_publisher
+import requests
+import subprocess
+from bs4 import BeautifulSoup
+
+class NIH(Publisher):
+    """Publisher for NIH/NCBI articles (URLs on nih.gov)."""
+
+    domains = ["nih.gov"]
+
+    def soupify(self):
+        """Download self.url and parse the response into self.soup."""
+        print('Loading page................',end="",flush=True)
+        req = requests.get(self.url,headers={'User-Agent':'Mozilla/5.0'})
+        self.soup = BeautifulSoup(req.content,'html.parser')
+        print('done')
+
+    def get_doi(self):
+        """Set self.doi from the page's span.doi link, or '' if absent."""
+        if self.doi is None:
+            try:
+                self.doi = self.soup.find('span',class_='doi').find('a').text
+            except AttributeError:  # find() returned None: no DOI on page
+                self.doi = ''
+
+    def extract_data(self):
+        """Run get_doi, get_metadata and get_citation, in that order."""
+        print('Extracting data from HTML...',end='',flush=True)
+        self.get_doi()
+        self.get_metadata()
+        self.get_citation()
+        print('done')
+
+    def epubify(self):
+        """Fetch NCBI's EPUB and run ebook-convert to write self.output."""
+        all_authors = ', '.join(g + ' ' + s for g, s in
+            zip(self.author_givennames, self.author_surnames))
+        self.output = self.author_surnames[0]+'_'+self.year+'.epub'
+        output_raw = '/tmp/raw.epub'
+
+        # Assumes the 3rd format-menu link is the PDF; the EPUB URL is the
+        # same href with 'pdf' replaced by 'epub'.
+        pdf_link = self.soup.find('div',class_='format-menu') \
+            .find_all('a')[2]['href']
+        epub_link = 'http://ncbi.nlm.nih.gov'+str(pdf_link) \
+            .replace('pdf','epub')
+
+        print('Generating epub.............',end='',flush=True)
+        epub = requests.get(epub_link,headers={'User-Agent':'Mozilla/5.0'})
+        epub.raise_for_status()  # never hand an HTTP error page to ebook-convert
+        with open(output_raw, 'wb') as f:
+            f.write(epub.content)
+        subprocess.check_output(['ebook-convert',output_raw,self.output,
+            '--authors',all_authors])
+        print('done')
+
+register_publisher(NIH)  # NOTE(review): presumably registers NIH for lookup by its domains - confirm in publisher.py
author	Ken Kellner <ken@kenkellner.com>	2018-04-06 14:30:01 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-06 14:30:01 -0400
commit	cdbb1518012a239ebe31dfd1032ad7116c8c8c36 (patch)
tree	c6a25a7dc53ffabcaf6c7b6922b1bbe99614ae2c
parent	a5ae056da7a2b739a7412c854f76f33958c13e4a (diff)