diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-24 11:45:28 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-24 11:45:28 -0400 |
commit | ace395fc372ead82216af906aee1c97dfea821ca (patch) | |
tree | 3afd265c65cb6a524a55919c04740ad94ed296fd | |
parent | 4eed12039fca7322305b550933b94856a408fed2 (diff) |
Add U Chicago Press support
-rw-r--r-- | article_epub/publishers/__init__.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/uchicago.py | 86 |
2 files changed, 87 insertions, 0 deletions
# Reconstructed from the commit "Add U Chicago Press support"
# (new file: article_epub/publishers/uchicago.py).
# The same commit also adds the following line to
# article_epub/publishers/__init__.py:
#     from article_epub.publishers.uchicago import UChicago
from article_epub.publisher import Publisher, register_publisher
import sys
import copy

# Host that serves the article pages; image paths in the page are
# concatenated onto it to form absolute URLs.
_BASE_URL = 'https://www.journals.uchicago.edu'

# Label that precedes the comma-separated keyword list on the page.
_KEYWORDS_PREFIX = 'Keywords:'


class UChicago(Publisher):
    """Class for University of Chicago Press articles"""

    name = "University of Chicago Press"
    domains = ["uchicago.edu"]

    def check_fulltext(self):
        """Return True when the full-text container is present in the page.

        Exits the program via ``sys.exit`` with an error message when the
        ``hlFld-Fulltext`` div cannot be found.
        """
        if self.soup.find('div', class_='hlFld-Fulltext') is None:
            sys.exit('Error: Can\'t access fulltext of article')
        return True

    def get_doi(self):
        """Set ``self.doi`` from the page's DOI meta tag if not already set."""
        if self.doi is None:
            doi_tag = self.soup.find('meta', {'scheme': 'doi'})
            self.doi = str(doi_tag['content'])

    def get_abstract(self):
        """Get article abstract

        Stores the abstract HTML, preceded by an "Abstract" heading, in
        ``self.abstract``. When the page has no abstract section an empty
        string is stored instead of the literal text "None".
        """
        abstract_raw = self.soup.find('div', class_='abstractSection')
        if abstract_raw is None:
            self.abstract = ''
            return
        self.abstract = '<h2>Abstract</h2>\n' + str(abstract_raw)

    def get_keywords(self):
        """Get article keywords

        Stores a list of keyword strings in ``self.keywords``; the list is
        empty when the page has no keyword section.
        """
        self.keywords = []
        keywords_div = self.soup.find('div', class_='hlFld-KeywordText')
        if keywords_div is None:
            return

        text = keywords_div.text.strip()
        # Remove the label as a prefix. str.strip('Keywords: ') would treat
        # the argument as a set of characters and also eat leading/trailing
        # letters of the keywords themselves (e.g. "seed" or a final "s").
        if text.startswith(_KEYWORDS_PREFIX):
            text = text[len(_KEYWORDS_PREFIX):]
        text = text.strip().rstrip('.')

        self.keywords = [kw.strip() for kw in text.split(',') if kw.strip()]

    def get_body(self):
        """Get body of article

        Works on a copy of the full-text div, rewrites headings, figure
        links and image URLs, drops jump-to menus and HTML tables, and
        stores the resulting HTML string in ``self.body``.
        """
        body_raw = copy.copy(self.soup.find('div', class_='hlFld-Fulltext'))

        # Section headings are plain divs on the page; promote them.
        for heading in body_raw.find_all('div', class_='sectionHeading'):
            heading.name = 'h2'

        for jump_menu in body_raw.find_all('div', class_='sectionJumpTo'):
            jump_menu.decompose()

        for subheading in body_raw.find_all('div', class_='head-b'):
            subheading.name = 'h3'

        # Point figure links at the in-document anchor named by data-id.
        # Links without a data-id attribute are left untouched.
        for anchor in body_raw.find_all('a', class_='showFiguresEEvent'):
            if anchor.has_attr('data-id'):
                anchor['href'] = '#' + anchor['data-id']

        # Make figure image URLs absolute and request the "medium" variant.
        for figure_img in body_raw.find_all('img', {'alt': 'figure'}):
            link = _BASE_URL + figure_img['src']
            link = link.replace('small', 'medium')
            figure_img['src'] = link

        for table in body_raw.find_all('div', class_='htmlTable'):
            table.decompose()

        # Inline graphics: make the image URL absolute. Spans without an
        # <img> (or an <img> without src) are skipped rather than crashing.
        for span in body_raw.find_all('span', class_='NLM_inline-graphic'):
            img = span.find('img')
            if img is None or not img.has_attr('src'):
                continue
            img['src'] = _BASE_URL + img['src']

        self.body = str(body_raw)

    def get_references(self):
        """Get references list

        Removes every link from each reference entry and stores the
        entries, preceded by a "Literature Cited" heading, in
        ``self.references``.
        """
        refs_raw = self.soup.find_all('div', class_='ref_layout')

        for ref in refs_raw:
            for link in ref.find_all('a'):
                link.decompose()

        self.references = ('<h2>Literature Cited</h2>\n'
                           + ''.join(str(ref) for ref in refs_raw))


register_publisher(UChicago)