diff options
| field | value | date |
|---|---|---|
| author | Ken Kellner <ken@kenkellner.com> | 2018-04-13 21:52:14 -0400 |
| committer | Ken Kellner <ken@kenkellner.com> | 2018-04-13 21:52:14 -0400 |
| commit | a8dff81b75c46b49c72d6015d4cce680012339b8 (patch) | |
| tree | b5e083d328e11353c5a5b1980dd25a5c0bbc602d | |
| parent | 0f2a5d0502320721d81670e1be96d22c689f6963 (diff) | |
Add a bit of documentation
-rw-r--r-- | README.md | 55 | ||||
-rw-r--r-- | article_epub/publisher.py | 11 |
2 files changed, 61 insertions, 5 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..d34c9b9 --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +article-epub +============ + +Description +----------- + +A command-line tool written in Python to convert scientific articles available as HTML into ePub form for reading on a supported e-reader. +Uses a plugin system with a "recipe" for each supported scientific publisher. +Takes an article URL, title, or (ideally) DOI as input. + +Obviously, you need to be able to legally access any article you want to convert, e.g. via a university library. + +Like most web scraping applications, the provided recipes are liable to break frequently. + +Currently, the following publishers are supported: + +* ScienceDirect (Elsevier) +* Springer +* Wiley +* Oxford +* BioOne +* Royal Society +* PLoS ONE +* National Institutes of Health (NIH) +* NRC Research Press + +Dependencies +------------ + +* Linux environment required +* [Calibre](https://calibre-ebook.com/) (to access `ebook-convert`) +* Firefox with headless support +* [Geckodriver](https://github.com/mozilla/geckodriver/releases) installed somewhere in `$PATH` +* [Pandoc](http://pandoc.org/) + +Python packages (available with `pip`): + +* [Selenium](http://selenium-python.readthedocs.io/) +* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) +* [pypandoc](https://github.com/bebraw/pypandoc) + +Usage +----- + +``` +usage: article-epub [-h] [-u URL] [-d DOI] [-t TITLE] [-o FILE] [-p] + +optional arguments: + -h, --help show this help message and exit + -u URL URL of article + -d DOI DOI of article + -t TITLE Title of article + -o FILE Name of output file + -p List supported publishers +``` diff --git a/article_epub/publisher.py b/article_epub/publisher.py index 17ce39b..e8f0170 100644 --- a/article_epub/publisher.py +++ b/article_epub/publisher.py @@ -13,7 +13,8 @@ _publishers = list() _publisher_domains = dict() class Publisher(object): - + """General class for scientific 
article publishers""" + def __init__(self, url, doi=None, out_format='epub'): self.url = url self.doi = doi @@ -54,15 +55,14 @@ class Publisher(object): driver.quit() def doi2json(self): - """ - Get a dictionary of metadata for a given DOI. - """ + """Get a dictionary of metadata for a given DOI.""" url = "http://dx.doi.org/" + self.doi headers = {"accept": "application/json"} r = requests.get(url, headers = headers) self.meta = r.json() def get_metadata(self): + """Extract metadata from DOI""" self.doi2json() self.title = self.meta['title'] @@ -89,7 +89,7 @@ class Publisher(object): self.pages = '' def get_citation(self,link=False): - + """Generate a formatted citation from metadata""" all_authors = '' for i in range(0,len(self.author_surnames)): all_authors += self.author_surnames[i] + ', ' @@ -178,6 +178,7 @@ def list_publishers(): return _publishers def match_publisher(url,doi): + """Match a URL to a publisher class""" domain = ".".join(url.split("//")[-1].split("/")[0] \ .split('?')[0].split('.')[-2:]) try: |