about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Ken Kellner <ken@kenkellner.com> 2018-04-13 21:52:14 -0400
committer Ken Kellner <ken@kenkellner.com> 2018-04-13 21:52:14 -0400
commit a8dff81b75c46b49c72d6015d4cce680012339b8 (patch)
tree b5e083d328e11353c5a5b1980dd25a5c0bbc602d
parent 0f2a5d0502320721d81670e1be96d22c689f6963 (diff)
Add a bit of documentation
-rw-r--r-- README.md 55
-rw-r--r-- article_epub/publisher.py 11
2 files changed, 61 insertions, 5 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d34c9b9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,55 @@
+article-epub
+============
+
+Description
+-----------
+
+A command-line tool written in Python to convert scientific articles available as HTML into ePub form for reading on a supported e-reader.
+Uses a plugin system with a "recipe" for each supported scientific publisher.
+Takes an article URL, title, or (ideally) DOI as input.
+
+Obviously, you need to be able to legally access any article you want to convert, e.g. via a university library.
+
+Like most web scraping applications, the provided recipes are liable to break frequently.
+
+Currently, the following publishers are supported:
+
+* ScienceDirect (Elsevier)
+* Springer
+* Wiley
+* Oxford
+* BioOne
+* Royal Society
+* PLoS ONE
+* National Institutes of Health (NIH)
+* NRC Research Press
+
+Dependencies
+------------
+
+* Linux environment required
+* [Calibre](https://calibre-ebook.com/) (to access `ebook-convert`)
+* Firefox with headless support
+* [Geckodriver](https://github.com/mozilla/geckodriver/releases) installed somewhere in `$PATH`
+* [Pandoc](http://pandoc.org/)
+
+Python packages (available with `pip`):
+
+* [Selenium](http://selenium-python.readthedocs.io/)
+* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
+* [pypandoc](https://github.com/bebraw/pypandoc)
+
+Usage
+-----
+
+```
+usage: article-epub [-h] [-u URL] [-d DOI] [-t TITLE] [-o FILE] [-p]
+
+optional arguments:
+  -h, --help  show this help message and exit
+  -u URL      URL of article
+  -d DOI      DOI of article
+  -t TITLE    Title of article
+  -o FILE     Name of output file
+  -p          List supported publishers
+```
diff --git a/article_epub/publisher.py b/article_epub/publisher.py
index 17ce39b..e8f0170 100644
--- a/article_epub/publisher.py
+++ b/article_epub/publisher.py
@@ -13,7 +13,8 @@ _publishers = list()
_publisher_domains = dict()
class Publisher(object):
-
+ """General class for scientific article publishers"""
+
def __init__(self, url, doi=None, out_format='epub'):
self.url = url
self.doi = doi
@@ -54,15 +55,14 @@ class Publisher(object):
driver.quit()
def doi2json(self):
- """
- Get a dictionary of metadata for a given DOI.
- """
+ """Get a dictionary of metadata for a given DOI."""
url = "http://dx.doi.org/" + self.doi
headers = {"accept": "application/json"}
r = requests.get(url, headers = headers)
self.meta = r.json()
def get_metadata(self):
+ """Extract metadata from DOI"""
self.doi2json()
self.title = self.meta['title']
@@ -89,7 +89,7 @@ class Publisher(object):
self.pages = ''
def get_citation(self,link=False):
-
+ """Generate a formatted citation from metadata"""
all_authors = ''
for i in range(0,len(self.author_surnames)):
all_authors += self.author_surnames[i] + ', '
@@ -178,6 +178,7 @@ def list_publishers():
return _publishers
def match_publisher(url,doi):
+ """Match a URL to a publisher class"""
domain = ".".join(url.split("//")[-1].split("/")[0] \
.split('?')[0].split('.')[-2:])
try: