diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-06 15:47:49 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-06 15:47:49 -0400 |
commit | 6e7a18a32d1573340b474792bae24ed9622d81d9 (patch) | |
tree | 770fe24da6a9e60390a080592e25c5c853d90124 | |
parent | cdbb1518012a239ebe31dfd1032ad7116c8c8c36 (diff) |
Add NRC research press support
-rw-r--r-- | article_epub/publishers/__init__.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/nrc.py | 80 |
2 files changed, 81 insertions, 0 deletions
# Contents of article_epub/publishers/nrc.py, reconstructed from the
# collapsed commit diff. The same commit also adds the line
# ``from article_epub.publishers.nrc import NRC`` to
# article_epub/publishers/__init__.py.
import copy

from article_epub.publisher import Publisher, register_publisher

# Prefix put in front of the site-relative image paths found in the page.
_BASE_URL = 'http://nrcresearchpress.com'


def _table_image_template(figure_url):
    """Return the URL prefix shared by an article's figure/table images.

    The prefix is everything before the second-to-last ``'f'`` in
    *figure_url*; the caller appends ``<table id> + '.gif'`` to it.
    NOTE(review): this presumably relies on figure URLs ending in
    ``f<N>.gif`` (one ``'f'`` for the figure marker, one in ``.gif``) --
    confirm against live pages.

    Returns ``None`` when *figure_url* contains fewer than two ``'f'``
    characters, because no prefix can be derived in that case.
    """
    if figure_url.count('f') < 2:
        return None
    # rsplit keeps any earlier 'f' characters (e.g. inside a journal code)
    # in the prefix, instead of cutting the URL at its first 'f'.
    return figure_url.rsplit('f', 2)[0]


class NRC(Publisher):
    """Class for NRC Research Press articles"""

    domains = ["nrcresearchpress.com"]

    def get_doi(self):
        """Set ``self.doi`` from the citation line unless it is already set.

        The text of the first link inside ``<p class="citationLine">`` is
        split on ``'/'`` and the 4th and 5th pieces are joined with ``'/'``.
        """
        if self.doi is None:
            citation = self.soup.find('p', class_='citationLine')
            doi_parts = citation.find('a').text.split('/')
            self.doi = str(doi_parts[3] + '/' + doi_parts[4])

    def get_abstract(self):
        """Get article abstract"""
        abstract_raw = self.soup.find('div', class_='abstractSection')
        self.abstract = '<h2>Abstract</h2>\n' + str(abstract_raw)

    def get_keywords(self):
        """Get article keywords"""
        # Keywords are the link texts inside the first <font size="-1">.
        keyword_links = self.soup.find('font', {'size': '-1'}).find_all('a')
        self.keywords = [link.text for link in keyword_links]

    def get_body(self):
        """Get body of article"""
        # Work on a copy so the edits below do not touch self.soup.
        body_raw = copy.copy(self.soup)
        for form in body_raw.find_all('form'):
            form.decompose()

        # Point every figure at its 'medium' image with an absolute URL and
        # drop the <p> inside the figure link.
        figure_url = None
        for fig in body_raw.find_all('a', class_='openFigLayer'):
            img = fig.find('img')
            figure_url = _BASE_URL + img['src'].replace('small', 'medium')
            img['src'] = figure_url
            fig.find('p').decompose()

        # Table image URLs are derived from the last figure's URL, so
        # without any figure there is nothing to derive them from.
        template = None
        if figure_url is not None:
            for legend in body_raw.find_all('div', class_='short-legend'):
                legend.decompose()
            template = _table_image_template(figure_url)

        if template is None:
            print('Unable to get table images')
        else:
            for table in body_raw.find_all('a', class_='openTablesLayer'):
                img = table.find('img')
                img['src'] = template + table['id'] + '.gif'
                table.find('p').decompose()
                # Blank out the original sizing/layout attributes.
                for attr in ('width', 'height', 'align', 'border'):
                    img[attr] = ''

        # Rewrite pop-up links as in-document anchors.
        for link in body_raw.find_all('a', class_='openLayerForItem'):
            link['href'] = '#' + link['itemid']

        for link in body_raw.find_all('a', class_='tooltip'):
            link['href'] = '#' + link['rid']

        sections = body_raw.find_all('div', class_='NLM_sec_level_1')
        self.body = ''.join(str(section) for section in sections)

    def get_references(self):
        """Get references list"""
        references_title = '<h2>References</h2>\n'
        references_raw = self.soup.find('ul', class_='no-bullet')
        # Remove every link from each reference entry.
        for entry in references_raw.find_all('li'):
            for link in entry.find_all('a'):
                link.decompose()

        self.references = references_title + str(references_raw)


register_publisher(NRC)