1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
import os
import subprocess
import tempfile

import requests
from bs4 import BeautifulSoup

from article_epub.publisher import Publisher, register_publisher
class NIH(Publisher):
    """Publisher implementation for NIH NCBI articles.

    Loads the article page, reads the DOI out of the HTML, and produces the
    final EPUB by downloading the EPUB rendition linked from the page and
    passing it through the external ``ebook-convert`` command.
    """

    name = "NIH-NCBI"
    domains = ["nih.gov"]

    def soupify(self):
        """Fetch ``self.url`` and store the parsed HTML in ``self.soup``."""
        print('Loading page................', end="", flush=True)
        # The request is sent with a browser-like User-Agent header.
        req = requests.get(self.url, headers={'User-Agent': 'Mozilla/5.0'})
        self.soup = BeautifulSoup(req.content, 'html.parser')
        print('done')

    def get_doi(self):
        """Populate ``self.doi`` from the page if it is still ``None``.

        The DOI is taken from the text of the first ``<a>`` inside the first
        ``<span class="doi">``. When either tag is missing, ``self.doi`` is
        set to the empty string instead of raising.
        """
        if self.doi is None:
            try:
                self.doi = self.soup.find('span', class_='doi') \
                    .find('a').text
            except AttributeError:
                # find() returns None when a tag is absent, so the chained
                # attribute access fails; treat that as "no DOI on the page".
                self.doi = ''

    def extract_data(self):
        """Run the DOI, metadata and citation extraction steps in order.

        ``get_metadata`` and ``get_citation`` are not defined in this class;
        presumably they are inherited from ``Publisher`` -- confirm there.
        """
        print('Extracting data from HTML...', end='', flush=True)
        self.get_doi()
        self.get_metadata()
        self.get_citation()
        print('done')

    def epubify(self):
        """Download the article's EPUB and convert it into ``self.output``.

        Reads ``self.author_givennames``, ``self.author_surnames``,
        ``self.year`` and ``self.soup``. Sets ``self.output`` to
        ``<first surname>_<year>.epub`` and writes that file via
        ``ebook-convert``, tagging it with the full author list.

        Raises:
            IndexError: if ``self.author_surnames`` is empty or the format
                menu holds fewer than three links.
            requests.HTTPError: if the EPUB download returns an error status.
            subprocess.CalledProcessError: if ``ebook-convert`` fails.
        """
        # "Given Surname, Given Surname, ..." -- pairs are matched by
        # position; zip stops at the shorter of the two lists.
        all_authors = ', '.join(
            given + ' ' + surname
            for given, surname in zip(self.author_givennames,
                                      self.author_surnames)
        )
        self.output = self.author_surnames[0] + '_' + self.year + '.epub'

        # The third link of the format menu is used as the PDF link; the
        # EPUB URL is derived from it by swapping 'pdf' for 'epub'.
        pdf_link = self.soup.find('div', class_='format-menu') \
            .find_all('a')[2]['href']
        epub_link = ('http://ncbi.nlm.nih.gov'
                     + str(pdf_link).replace('pdf', 'epub'))

        print('Generating epub.............', end='', flush=True)
        epub = requests.get(epub_link, headers={'User-Agent': 'Mozilla/5.0'})
        # Fail here rather than feeding an HTML error page to ebook-convert.
        epub.raise_for_status()

        # A private scratch directory avoids clashes between concurrent runs
        # on a fixed /tmp path and is removed once conversion has finished.
        with tempfile.TemporaryDirectory() as workdir:
            output_raw = os.path.join(workdir, 'raw.epub')
            with open(output_raw, 'wb') as f:
                f.write(epub.content)
            subprocess.check_output(['ebook-convert', output_raw,
                                     self.output, '--authors', all_authors])
        print('done')
# Pass the NIH class to register_publisher (imported from
# article_epub.publisher) at import time; presumably this is what makes the
# class selectable for its ``domains`` -- confirm in the publisher module.
register_publisher(NIH)
|