# Provenance: reconstructed from the patch "Add U Chicago Press support"
# (commit ace395fc372ead82216af906aee1c97dfea821ca, Ken Kellner,
# Tue, 24 Apr 2018 11:45:28 -0400). That patch created
# article_epub/publishers/uchicago.py (this module) and added the line
#     from article_epub.publishers.uchicago import UChicago
# to article_epub/publishers/__init__.py.
from article_epub.publisher import Publisher, register_publisher
import sys
import copy

# Site root prepended to the image paths found in the article markup.
_BASE_URL = 'https://www.journals.uchicago.edu'

# Label that precedes the keyword list in the keyword block's text.
_KEYWORDS_LABEL = 'Keywords:'


def _absolute_url(src):
    """Return *src* prefixed with the site root unless it is already absolute."""
    if src.startswith(('http://', 'https://')):
        return src
    return _BASE_URL + src


class UChicago(Publisher):
    """Class for University of Chicago Press articles"""

    # NOTE(review): every method reads ``self.soup`` and uses it like a
    # BeautifulSoup document (find / find_all / decompose); it is presumably
    # populated by ``Publisher`` -- confirm against the base class.

    name = "University of Chicago Press"
    domains = ["uchicago.edu"]

    def check_fulltext(self):
        """Return True if the full-text container is present in the page.

        Exits the program via ``sys.exit`` with an error message when the
        ``hlFld-Fulltext`` div cannot be found.
        """
        if self.soup.find('div', class_='hlFld-Fulltext') is None:
            sys.exit('Error: Can\'t access fulltext of article')
        return True

    def get_doi(self):
        """Fill in ``self.doi`` from the page's DOI meta tag if it is unset.

        An already-set ``self.doi`` is left untouched.  If the page has no
        ``<meta scheme="doi">`` tag the subscript below raises ``TypeError``,
        exactly as the original code did.
        """
        if self.doi is None:
            doi_tag = self.soup.find('meta', {'scheme': 'doi'})
            self.doi = str(doi_tag['content'])

    def get_abstract(self):
        """Get article abstract

        Stores a heading followed by the markup of the ``abstractSection``
        div in ``self.abstract``.  When the page has no such div the
        abstract is set to an empty string instead of embedding the text
        ``'None'``.
        """
        abstract_raw = self.soup.find('div', class_='abstractSection')
        if abstract_raw is None:
            self.abstract = ''
            return

        # NOTE(review): the heading markup was lost when this patch was
        # rendered to text; '<h2>' is reconstructed to match the tag that
        # get_body assigns to section headings -- confirm against upstream.
        abstract_title = '<h2>Abstract</h2>\n'
        self.abstract = abstract_title + str(abstract_raw)

    def get_keywords(self):
        """Get article keywords

        Stores a list of keyword strings in ``self.keywords``; the list is
        empty when the page has no ``hlFld-KeywordText`` div.
        """
        self.keywords = []

        keyword_block = self.soup.find('div', class_='hlFld-KeywordText')
        if keyword_block is None:
            return

        text = keyword_block.text.strip()
        # Remove the label as a prefix.  str.strip('Keywords: ') would treat
        # its argument as a *set of characters* and also eat leading and
        # trailing letters of the keywords themselves.
        if text.startswith(_KEYWORDS_LABEL):
            text = text[len(_KEYWORDS_LABEL):]
        text = text.strip().strip('.')

        self.keywords = [
            word.strip() for word in text.split(',') if word.strip()
        ]

    def get_body(self):
        """Get body of article

        Works on a copy of the ``hlFld-Fulltext`` div, rewrites it for
        standalone use and stores the resulting markup in ``self.body``.
        """
        body_raw = copy.copy(self.soup.find('div', class_='hlFld-Fulltext'))

        # Turn the publisher's heading divs into real heading tags.
        for heading in body_raw.find_all('div', class_='sectionHeading'):
            heading.name = 'h2'

        for jump_link in body_raw.find_all('div', class_='sectionJumpTo'):
            jump_link.decompose()

        for subheading in body_raw.find_all('div', class_='head-b'):
            subheading.name = 'h3'

        # Point figure links at the in-document anchor named by data-id;
        # links without that attribute are left as they are.
        for anchor in body_raw.find_all('a', class_='showFiguresEEvent'):
            try:
                anchor['href'] = '#' + anchor['data-id']
            except KeyError:
                pass

        # Make figure image paths absolute and request the 'medium' rendition
        # in place of 'small'.
        for figure in body_raw.find_all('img', {'alt': 'figure'}):
            src = figure.get('src')
            if src is None:
                continue
            figure['src'] = _absolute_url(src).replace('small', 'medium')

        for table in body_raw.find_all('div', class_='htmlTable'):
            table.decompose()

        # Inline graphics only need an absolute path; spans without a usable
        # <img> are skipped rather than aborting the conversion.
        for span in body_raw.find_all('span', class_='NLM_inline-graphic'):
            img = span.find('img')
            if img is None or img.get('src') is None:
                continue
            img['src'] = _absolute_url(img['src'])

        self.body = str(body_raw)

    def get_references(self):
        """Get references list

        Removes every link from the ``ref_layout`` divs (this edits
        ``self.soup`` in place) and stores a heading followed by their
        markup in ``self.references``.
        """
        refs_raw = self.soup.find_all('div', class_='ref_layout')

        for ref in refs_raw:
            for link in ref.find_all('a'):
                link.decompose()

        # NOTE(review): heading markup reconstructed as '<h2>' for the same
        # reason as in get_abstract -- confirm against upstream.
        heading = '<h2>Literature Cited</h2>\n'
        self.references = heading + ''.join(str(ref) for ref in refs_raw)


register_publisher(UChicago)