1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
from article_epub.publisher import Publisher, register_publisher
import sys
import copy
class Nature(Publisher):
    """Class for Nature Publishing articles"""

    # Human-readable publisher name.
    name = "Nature Publishing"
    # Domains whose article pages this class knows how to parse.
    domains = ["nature.com"]

    @staticmethod
    def _drop_first(parent, *args, **kwargs):
        """Remove the first descendant of ``parent`` matching a ``find()`` query.

        ``args``/``kwargs`` are passed straight to ``parent.find``. Does
        nothing when ``parent`` is None or when nothing matches, so callers
        can strip optional page elements without wrapping each call in
        ``try``/``except``.
        """
        if parent is None:
            return
        match = parent.find(*args, **kwargs)
        if match is not None:
            match.decompose()

    def check_fulltext(self):
        """Return True if the full text is accessible; exit otherwise.

        The page is treated as inaccessible when it contains a link whose
        ``data-track-action`` attribute is ``subscribe``.
        """
        paywall_link = self.soup.find('a', {'data-track-action': 'subscribe'})
        if paywall_link is not None:
            sys.exit('Error: Can\'t access fulltext of article')
        return True

    def get_doi(self):
        """Set ``self.doi`` from the page's DOI meta tag if it is not set yet.

        Exits with an error message when the meta tag is missing instead of
        failing with an opaque ``TypeError`` on ``None['content']``.
        """
        if self.doi is None:
            doi_tag = self.soup.find('meta', {'name': 'DOI'})
            if doi_tag is None:
                sys.exit('Error: Can\'t find DOI of article')
            self.doi = str(doi_tag['content'])

    def get_abstract(self):
        """Get article abstract"""
        abstract_raw = self.soup.find('div', {'id': 'abstract-section'})
        # Strip the first <span> of the section, if there is one. This edits
        # self.soup in place (no copy is taken here).
        self._drop_first(abstract_raw, 'span')
        # NOTE(review): when the section is missing this stores the literal
        # string 'None' (str(None)); kept as-is for backward compatibility.
        self.abstract = str(abstract_raw)

    def get_keywords(self):
        """Get article keywords"""
        self.keywords = [
            tag.text
            for tag in self.soup.find_all('a', class_='subject-tag-link')
        ]

    def get_body(self):
        """Get body of article

        Works on a copy of the ``article-body`` div so that ``self.soup`` is
        left intact, removes sections that are not part of the main text and
        rewrites links, then stores the HTML in ``self.body``. Exits with an
        error message when the page has no ``article-body`` div.
        """
        body_node = self.soup.find('div', class_='article-body')
        if body_node is None:
            sys.exit('Error: Can\'t find body of article')
        body_raw = copy.copy(body_node)

        # Each section is removed independently: a missing one must not
        # prevent the removal of the ones after it.
        for label in ('abstract', 'references', 'author-information',
                      'rightslink', 'article-comments'):
            self._drop_first(body_raw, 'section', {'aria-labelledby': label})

        for title_label in body_raw.find_all(
                'span', class_='js-section-title-label'):
            title_label.decompose()

        # Table and figure links are site-relative; prefix the site root.
        for action in ('view table', 'view figure'):
            for anchor in body_raw.find_all(
                    'a', {'data-track-action': action}):
                href = anchor.get('href')
                if href is not None:
                    anchor['href'] = 'https://www.nature.com' + href

        # Reference anchors are reduced to a bare '#fragment' link.
        for anchor in body_raw.find_all(
                'a', {'data-track-action': 'reference anchor'}):
            href = anchor.get('href')
            if href is not None and '#' in href:
                anchor['href'] = '#' + href.split('#')[1]

        self.body = str(body_raw)

    def get_references(self):
        """Get references list

        Stores the HTML of the ``references-section`` div in
        ``self.references`` after removing the section title label and, for
        each top-level list item, its first ``<span>`` and its
        ``js-ref-links`` list. Stores an empty string when the page has no
        references section.
        """
        ref_all = self.soup.find('div', {'id': 'references-section'})
        if ref_all is None:
            self.references = ''
            return

        self._drop_first(ref_all, 'span', class_='js-section-title-label')

        ref_list = ref_all.find('ol')
        if ref_list is not None:
            for item in ref_list.find_all('li', recursive=False):
                # Independent removals: an item without a <span> still gets
                # its js-ref-links list stripped.
                self._drop_first(item, 'span')
                self._drop_first(item, 'ul', class_='js-ref-links')

        self.references = str(ref_all)
# Hand the Nature class to article_epub's register_publisher hook at import time.
register_publisher(Nature)
|