Add NRC Research Press support

author: Ken Kellner <ken@kenkellner.com> 2018-04-06 15:47:49 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-06 15:47:49 -0400
commit: 6e7a18a32d1573340b474792bae24ed9622d81d9 (patch)
tree: 770fe24da6a9e60390a080592e25c5c853d90124
parent: cdbb1518012a239ebe31dfd1032ad7116c8c8c36 (diff)
2 files changed, 81 insertions, 0 deletions
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 77fe75c..955ea3a 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -5,3 +5,4 @@ from article_epub.publishers.bioone import BioOne
 from article_epub.publishers.plosone import PLoSONE
 from article_epub.publishers.oxford import Oxford
 from article_epub.publishers.nih import NIH
+from article_epub.publishers.nrc import NRC
diff --git a/article_epub/publishers/nrc.py b/article_epub/publishers/nrc.py
new file mode 100644
index 0000000..cad59e9
--- /dev/null
+++ b/article_epub/publishers/nrc.py
@@ -0,0 +1,80 @@
+from article_epub.publisher import Publisher, register_publisher
+import copy
+
+class NRC(Publisher):
+    """Class for NRC Research Press articles"""
+
+    domains = ["nrcresearchpress.com"]
+
+    def get_doi(self):
+        """Get DOI from the citation line link, unless already set"""
+        if self.doi is None:
+            link = self.soup.find('p',class_='citationLine').find('a')
+            # DOI = 4th and 5th '/'-separated pieces of the link text
+            doi_raw = link.text.split('/')
+            self.doi = doi_raw[3]+'/'+doi_raw[4]
+
+    def get_abstract(self):
+        """Get article abstract"""
+        abstract_raw = self.soup.find('div',class_='abstractSection')
+        self.abstract = '<h2>Abstract</h2>\n'+str(abstract_raw)
+
+    def get_keywords(self):
+        """Get article keywords"""
+        keywords_raw = self.soup.find('font',{'size':'-1'}).find_all('a')
+        self.keywords = [i.text for i in keywords_raw]
+
+    def get_body(self):
+        """Get body of article
+
+        All edits are made on a copy of self.soup; result goes in self.body.
+        """
+        base = 'http://nrcresearchpress.com'
+        body_raw = copy.copy(self.soup)
+        for i in body_raw.find_all('form'):
+            i.decompose()
+
+        # Figures: absolute link to the 'medium' image, drop the <p> inside
+        figs = body_raw.find_all('a',class_='openFigLayer')
+        for i in figs:
+            img = i.find('img')
+            img['src'] = base+img['src'].replace('small','medium')
+            i.find('p').decompose()
+
+        if len(figs) > 0:
+            # Table image URLs are built from the last figure URL, assumed
+            # to end like '<prefix>f1.gif'. Cut at the second-to-last 'f'
+            # so an earlier 'f' in the URL no longer truncates the prefix
+            template = figs[-1].find('img')['src'].rsplit('f',2)[0]
+            for i in body_raw.find_all('div',class_='short-legend'):
+                i.decompose()
+
+            for i in body_raw.find_all('a',class_='openTablesLayer'):
+                img = i.find('img')
+                img['src'] = template+i['id']+'.gif'
+                i.find('p').decompose()
+                for attr in ('width','height','align','border'):
+                    img[attr] = ''
+        else:
+            print('Unable to get table images')
+
+        # Turn these links into in-document anchors (#itemid / #rid)
+        for i in body_raw.find_all('a',class_='openLayerForItem'):
+            i['href'] = '#'+i['itemid']
+        for i in body_raw.find_all('a',class_='tooltip'):
+            i['href'] = '#'+i['rid']
+
+        body_parts = body_raw.find_all('div',class_='NLM_sec_level_1')
+        self.body = ''.join(str(i) for i in body_parts)
+
+    def get_references(self):
+        """Get references list"""
+        references_title = '<h2>References</h2>\n'
+        references_raw = self.soup.find('ul',class_='no-bullet')
+        # Remove every link inside the reference items (edits self.soup)
+        for i in references_raw.find_all('li'):
+            for j in i.find_all('a'):
+                j.decompose()
+        self.references = references_title+str(references_raw)
+
+register_publisher(NRC)
author	Ken Kellner <ken@kenkellner.com>	2018-04-06 15:47:49 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-06 15:47:49 -0400
commit	6e7a18a32d1573340b474792bae24ed9622d81d9 (patch)
tree	770fe24da6a9e60390a080592e25c5c853d90124
parent	cdbb1518012a239ebe31dfd1032ad7116c8c8c36 (diff)