diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-12 15:28:05 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-12 15:28:05 -0400 |
commit | 946396dea633572b6b83dd6f568cb4ab4e8395d4 (patch) | |
tree | 336a74525456435a0c6e1f4f6d1aaa524c580cd2 | |
parent | 9dcfc2b9b581b4bc8f0ed6343206399aefe18acc (diff) |
Add Royal Society support
-rwxr-xr-x | article-epub.py | 3 | ||||
-rw-r--r-- | article_epub/publishers/__init__.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/royalsociety.py | 91 |
3 files changed, 94 insertions, 1 deletions
diff --git a/article-epub.py b/article-epub.py index bfbed1b..21c80cf 100755 --- a/article-epub.py +++ b/article-epub.py @@ -70,7 +70,8 @@ def url_from_title(title): .find('div',class_='gs_ri').find('a') possible_title = result.text possible_link = result['href'] - + print('Provided title:') + print(title) print('Found following article:') print(possible_title) choice = input("Is this correct (y/n)? ") diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py index 955ea3a..8fcb412 100644 --- a/article_epub/publishers/__init__.py +++ b/article_epub/publishers/__init__.py @@ -6,3 +6,4 @@ from article_epub.publishers.plosone import PLoSONE from article_epub.publishers.oxford import Oxford from article_epub.publishers.nih import NIH from article_epub.publishers.nrc import NRC +from article_epub.publishers.royalsociety import RoyalSociety diff --git a/article_epub/publishers/royalsociety.py b/article_epub/publishers/royalsociety.py new file mode 100644 index 0000000..2849956 --- /dev/null +++ b/article_epub/publishers/royalsociety.py @@ -0,0 +1,91 @@ +from article_epub.publisher import Publisher, register_publisher +import sys +import requests +import re +from bs4 import BeautifulSoup + +class RoyalSociety(Publisher): + """Class for Royal Society Publishing articles""" + + domains = ["royalsocietypublishing.org"] + + def check_fulltext(self): + if self.soup.find('div',{'id':'sec-1'}) == None: + sys.exit('Error: Can\'t access fulltext of article') + else: + return(True) + + def get_doi(self): + if self.doi == None: + self.doi = str(self.soup.find('span', + class_='highwire-cite-metadata-doi').text.split(' ')[1]) + + def get_abstract(self): + """Get article abstract""" + self.abstract = str(self.soup.find('div',class_='section abstract')) + + def get_keywords(self): + """Get article keywords""" + self.keywords = [] + try: + keywords_raw = self.soup.find('div', + class_='pane-node-field-highwire-article-keyword') \ + .find_all('a') + self.keywords = [] + for i in keywords_raw: + self.keywords.append(i.text) + except: + pass + + def get_body(self): + """Get body of article""" + body_raw = self.soup.find_all('div',{'id': re.compile('sec-.*')}) + + figs = self.soup.find_all('a',class_='fragment-images') + for j in figs: + i = j.find('span') + lnk = j['href'] + i.find('img')['src'] = lnk + i.find('img')['width'] = '' + i.find('img')['height'] = '' + + tags = self.soup.find_all('ul',class_="highwire-figure-links") + for i in tags: + i.find('li',class_='new-tab').decompose() + i.find('li',class_='download-ppt').decompose() + + tables = self.soup.find_all('div',class_='table') + for i in tables: + src = 'http://rstb.royalsocietypublishing.org'+ \ + i.find('a')['data-table-url'] + dat = requests.get(src, headers={'User-Agent':'Mozilla/5.0'}) + tabsoup = BeautifulSoup(dat.content,'html.parser') \ + .find('table') + i.append(tabsoup) + i.find('div',class_='table-callout-links').decompose() + + self.body = '' + for i in body_raw: + self.body += str(i) + + def get_references(self): + """Get references list""" + references_raw = self.soup.find('div',{'id':'ref-list-1'}).find('ol') \ + .find_all('li',recursive=False) + ref_title = '<h2>References</h2>' + + reflist = '<ol>' + for i in references_raw: + try: + tag = i.find('a')['id'] + reflist += '<li id="'+tag+'">' + for j in i.find_all('a'): + j.decompose() + reflist += i.text.replace('↵','').replace('()','') + reflist += '</li>' + except: + reflist += i.text.replace('↵','') + + self.references = ref_title + reflist + +register_publisher(RoyalSociety) |