Add Royal Society support

author: Ken Kellner <ken@kenkellner.com> 2018-04-12 15:28:05 -0400
committer: Ken Kellner <ken@kenkellner.com> 2018-04-12 15:28:05 -0400
commit: 946396dea633572b6b83dd6f568cb4ab4e8395d4 (patch)
tree: 336a74525456435a0c6e1f4f6d1aaa524c580cd2
parent: 9dcfc2b9b581b4bc8f0ed6343206399aefe18acc (diff)
3 files changed, 94 insertions, 1 deletion
diff --git a/article-epub.py b/article-epub.py
index bfbed1b..21c80cf 100755
--- a/article-epub.py
+++ b/article-epub.py
@@ -70,7 +70,8 @@ def url_from_title(title):
             .find('div',class_='gs_ri').find('a')
         possible_title = result.text
         possible_link = result['href']
-
+        print('Provided title:')
+        print(title)
         print('Found following article:')
         print(possible_title)
         choice = input("Is this correct (y/n)? ")
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 955ea3a..8fcb412 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -6,3 +6,4 @@ from article_epub.publishers.plosone import PLoSONE
 from article_epub.publishers.oxford import Oxford
 from article_epub.publishers.nih import NIH
 from article_epub.publishers.nrc import NRC
+from article_epub.publishers.royalsociety import RoyalSociety
diff --git a/article_epub/publishers/royalsociety.py b/article_epub/publishers/royalsociety.py
new file mode 100644
index 0000000..2849956
--- /dev/null
+++ b/article_epub/publishers/royalsociety.py
@@ -0,0 +1,91 @@
+from article_epub.publisher import Publisher, register_publisher
+import sys
+import requests
+import re
+from bs4 import BeautifulSoup
+
+class RoyalSociety(Publisher):
+    """Class for Royal Society Publishing articles (reads ``self.soup``)"""
+
+    domains = ["royalsocietypublishing.org"]
+
+    def check_fulltext(self):
+        """Return True if the 'sec-1' div exists; otherwise exit"""
+        if self.soup.find('div', {'id': 'sec-1'}) is None:
+            sys.exit('Error: Can\'t access fulltext of article')
+        return True
+
+    def get_doi(self):
+        """Set the DOI from the citation metadata unless already known"""
+        if self.doi is None:
+            cite = self.soup.find('span', class_='highwire-cite-metadata-doi')
+            self.doi = str(cite.text.split(' ')[1])
+
+    def get_abstract(self):
+        """Get article abstract"""
+        self.abstract = str(self.soup.find('div', class_='section abstract'))
+
+    def get_keywords(self):
+        """Get article keywords (empty list if the keyword pane is absent)"""
+        pane = self.soup.find('div',
+            class_='pane-node-field-highwire-article-keyword')
+        links = [] if pane is None else pane.find_all('a')
+        self.keywords = [a.text for a in links]
+
+    def get_body(self):
+        """Get body of article, with figure images and tables embedded"""
+        body_raw = self.soup.find_all('div', {'id': re.compile('sec-.*')})
+
+        # Point each figure thumbnail at the image its link targets.
+        for link in self.soup.find_all('a', class_='fragment-images'):
+            span = link.find('span')
+            img = span.find('img') if span is not None else None
+            if img is None:
+                continue  # nothing to rewrite for this link
+            img['src'] = link['href']
+            img['width'] = ''
+            img['height'] = ''
+
+        # Drop the new-tab / download-ppt entries from figure link lists.
+        for links in self.soup.find_all('ul', class_="highwire-figure-links"):
+            for css in ('new-tab', 'download-ppt'):
+                item = links.find('li', class_=css)
+                if item is not None:
+                    item.decompose()
+
+        for div in self.soup.find_all('div', class_='table'):
+            # NOTE(review): host is fixed to the rstb journal - confirm this
+            # works for articles from other Royal Society journals.
+            src = 'http://rstb.royalsocietypublishing.org' + \
+                div.find('a')['data-table-url']
+            dat = requests.get(src, headers={'User-Agent': 'Mozilla/5.0'})
+            table = BeautifulSoup(dat.content, 'html.parser').find('table')
+            if table is not None:
+                div.append(table)
+            callout = div.find('div', class_='table-callout-links')
+            if callout is not None:
+                callout.decompose()
+
+        self.body = ''.join(str(sec) for sec in body_raw)
+
+    def get_references(self):
+        """Get references list as an <h2> heading followed by an <ol>"""
+        items = self.soup.find('div', {'id': 'ref-list-1'}).find('ol') \
+            .find_all('li', recursive=False)
+        parts = ['<h2>References</h2>', '<ol>']
+        for item in items:
+            anchor = item.find('a')
+            if anchor is not None and anchor.has_attr('id'):
+                # Keep the anchor id as the entry id, then drop every link.
+                parts.append('<li id="' + anchor['id'] + '">')
+                for a in item.find_all('a'):
+                    a.decompose()
+                parts.append(item.text.replace('↵', '').replace('()', ''))
+            else:
+                parts.append('<li>' + item.text.replace('↵', ''))
+            parts.append('</li>')
+        # Close the list so the generated fragment is well-formed HTML.
+        parts.append('</ol>')
+        self.references = ''.join(parts)
+
+register_publisher(RoyalSociety)
author	Ken Kellner <ken@kenkellner.com>	2018-04-12 15:28:05 -0400
committer	Ken Kellner <ken@kenkellner.com>	2018-04-12 15:28:05 -0400
commit	946396dea633572b6b83dd6f568cb4ab4e8395d4 (patch)
tree	336a74525456435a0c6e1f4f6d1aaa524c580cd2
parent	9dcfc2b9b581b4bc8f0ed6343206399aefe18acc (diff)