diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-05 17:17:22 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-05 17:17:22 -0400 |
commit | 59de534956c5dcacdd641afd0c5399c6279445fe (patch) | |
tree | cbc14fdc43a4b61a75410a5c39624ec8fda4a848 | |
parent | 26a784ccd482f4e7ac995e3147cae6185df65b7f (diff) |
Add Oxford support. Still have issues with bold font and lit cited looking bad
-rw-r--r-- | article_epub/publisher.py | 10 | ||||
-rw-r--r-- | article_epub/publishers/__init__.py | 1 | ||||
-rw-r--r-- | article_epub/publishers/oxford.py | 45 | ||||
-rw-r--r-- | article_epub/publishers/plosone.py | 2 |
4 files changed, 53 insertions, 5 deletions
```diff
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 7af9a47..7465f7f 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -72,13 +72,17 @@ class Publisher(object):
         self.journal = self.meta['container-title']
 
         if 'published-print' in self.meta.keys():
-            self.volume = str(self.meta['volume'])
-            self.pages = str(self.meta['page'])
             self.year = str(self.meta['published-print']['date-parts'][0][0])
         else:
+            self.year = str(self.meta['published-online']['date-parts'][0][0])
+        try:
+            self.volume = str(self.meta['volume'])
+        except:
             self.volume = ''
+        try:
+            self.pages = str(self.meta['page'])
+        except:
             self.pages = ''
-            self.year = str(self.meta['published-online']['date-parts'][0][0])
 
     def get_citation(self):
 
diff --git a/article_epub/publishers/__init__.py b/article_epub/publishers/__init__.py
index 3e975c4..6aa2f43 100644
--- a/article_epub/publishers/__init__.py
+++ b/article_epub/publishers/__init__.py
@@ -3,3 +3,4 @@ from article_epub.publishers.springer import Springer
 from article_epub.publishers.wiley import Wiley
 from article_epub.publishers.bioone import BioOne
 from article_epub.publishers.plosone import PLoSONE
+from article_epub.publishers.oxford import Oxford
diff --git a/article_epub/publishers/oxford.py b/article_epub/publishers/oxford.py
new file mode 100644
index 0000000..f8a95a1
--- /dev/null
+++ b/article_epub/publishers/oxford.py
@@ -0,0 +1,45 @@
+from article_epub.publisher import Publisher, register_publisher
+import copy
+
+class Oxford(Publisher):
+    """Class for Oxford articles"""
+
+    domains = ["oup.com"]
+
+    def get_doi(self):
+        if self.doi == None:
+            doi_raw = self.soup.find('div',class_='ww-citation-primary') \
+                .find('a')['href'].split('/')
+            self.doi = str(doi_raw[3]+'/'+doi_raw[4])
+
+    def get_abstract(self):
+        """Get article abstract"""
+        abstract_raw = self.soup.find('section',class_='abstract')
+        self.abstract = '<h2>Abstract<h2>\n'+str(abstract_raw)
+
+    def get_keywords(self):
+        """Get article keywords"""
+        keywords_raw = self.soup.find('div',class_='kwd-group').find_all('a')
+        self.keywords = []
+        for i in keywords_raw:
+            self.keywords.append(i.text)
+
+    def get_body(self):
+        """Get body of article"""
+        body_raw = copy.copy(self.soup.find(
+            'div',{'data-widgetname':'ArticleFulltext'}))
+        body_raw.find('h2',class_='abstract-title').decompose()
+        body_raw.find('div',class_='article-metadata-panel').decompose()
+        body_raw.find('div',class_='ref-list').decompose()
+        body_raw.find('span',{'id':'UserHasAccess'}).decompose()
+        body_raw.find('div',class_='copyright').decompose()
+        body_raw.find('h2',class_='backreferences-title').decompose()
+        self.body = body_raw
+
+    def get_references(self):
+        """Get references list"""
+        references_title = self.soup.find('h2',class_='backreferences-title')
+        references_raw = self.soup.find('div',class_='ref-list')
+        self.references = str(references_title)+str(references_raw)
+
+register_publisher(Oxford)
diff --git a/article_epub/publishers/plosone.py b/article_epub/publishers/plosone.py
index 578826e..f096641 100644
--- a/article_epub/publishers/plosone.py
+++ b/article_epub/publishers/plosone.py
@@ -1,6 +1,4 @@
 from article_epub.publisher import Publisher, register_publisher
-import requests
-from bs4 import BeautifulSoup
 
 class PLoSONE(Publisher):
     """Class for PLoS ONE articles"""
```