Add U Chicago Press support

author: Ken Kellner <ken@kenkellner.com> 2018-04-24 11:45:28 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-24 11:45:28 -0400
commit: ace395fc372ead82216af906aee1c97dfea821ca (patch)
tree: 3afd265c65cb6a524a55919c04740ad94ed296fd
parent: 4eed12039fca7322305b550933b94856a408fed2 (diff)
2 files changed, 87 insertions, 0 deletions
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 206b7bc..790140c 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -10,3 +10,4 @@ from article_epub.publishers.royalsociety import RoyalSociety
 from article_epub.publishers.tandf import TandF
 from article_epub.publishers.annualreviews import AnnualReviews
 from article_epub.publishers.nature import Nature
+from article_epub.publishers.uchicago import UChicago
diff --git a/article_epub/publishers/uchicago.py b/article_epub/publishers/uchicago.py
new file mode 100644
index 0000000..9492e75
--- /dev/null
+++ b/article_epub/publishers/uchicago.py
@@ -0,0 +1,86 @@
+from article_epub.publisher import Publisher, register_publisher
+import sys
+import copy
+
+class UChicago(Publisher):
+    """Class for University of Chicago Press articles (parsed from self.soup)"""
+
+    name = "University of Chicago Press"
+    domains = ["uchicago.edu"]
+
+    def check_fulltext(self):
+        """Return True if the fulltext div exists, otherwise exit with an error"""
+        if self.soup.find('div',class_='hlFld-Fulltext') is None:
+            sys.exit('Error: Can\'t access fulltext of article')
+        return True
+
+    def get_doi(self):
+        """Set self.doi from the doi <meta> tag unless it is already set"""
+        if self.doi is None:
+            self.doi = str(self.soup.find('meta',{'scheme':'doi'})['content'])
+
+    def get_abstract(self):
+        """Get article abstract"""
+        abstract_raw = self.soup.find('div',class_='abstractSection')
+        # Avoid embedding the literal text "None" when there is no abstract
+        abstract_body = '' if abstract_raw is None else str(abstract_raw)
+        self.abstract = '<h2>Abstract</h2>\n'+abstract_body
+
+    def get_keywords(self):
+        """Get article keywords as a list of whitespace-trimmed strings"""
+        self.keywords = []
+        keywords_div = self.soup.find('div',class_='hlFld-KeywordText')
+        if keywords_div is None:
+            return
+        text = keywords_div.text.strip()
+        # str.strip('Keywords: ') removes a character set, not a prefix, so it
+        # also ate letters off the first and last keyword; slice the label off.
+        if text.startswith('Keywords:'):
+            text = text[len('Keywords:'):]
+        text = text.strip().strip('.')
+        self.keywords = [k.strip() for k in text.split(',') if k.strip()]
+
+    def get_body(self):
+        """Get body of article"""
+        base_url = 'https://www.journals.uchicago.edu'
+        body_raw = copy.copy(self.soup.find('div',class_='hlFld-Fulltext'))
+
+        for i in body_raw.find_all('div',class_='sectionHeading'):
+            i.name = 'h2'
+
+        for i in body_raw.find_all('div',class_='sectionJumpTo'):
+            i.decompose()
+
+        for i in body_raw.find_all('div',class_='head-b'):
+            i.name = 'h3'
+
+        # Point figure links at in-document anchors when a target id exists
+        for i in body_raw.find_all('a',class_='showFiguresEEvent'):
+            if i.has_attr('data-id'):
+                i['href'] = '#'+i['data-id']
+
+        for i in body_raw.find_all('img',{'alt':'figure'}):
+            i['src'] = (base_url+i['src']).replace('small','medium')
+
+        for i in body_raw.find_all('div',class_='htmlTable'):
+            i.decompose()
+
+        for i in body_raw.find_all('span',class_='NLM_inline-graphic'):
+            img = i.find('img')
+            if img is not None:  # skip spans without an image
+                img['src'] = base_url+img['src']
+
+        self.body = str(body_raw)
+
+    def get_references(self):
+        """Get references list (links are removed from self.soup in place)"""
+        refs_raw = self.soup.find_all('div',class_='ref_layout')
+
+        for i in refs_raw:
+            for j in i.find_all('a'):
+                j.decompose()
+
+        refs_title = '<h2>Literature Cited</h2>\n'
+        self.references = refs_title+''.join(str(i) for i in refs_raw)
+
+register_publisher(UChicago)  # executed at import; helper comes from article_epub.publisher
author	Ken Kellner <ken@kenkellner.com>	2018-04-24 11:45:28 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-24 11:45:28 -0400
commit	ace395fc372ead82216af906aee1c97dfea821ca (patch)
tree	3afd265c65cb6a524a55919c04740ad94ed296fd
parent	4eed12039fca7322305b550933b94856a408fed2 (diff)