diff options
author | Ken Kellner <ken@kenkellner.com> | 2018-04-21 14:17:16 -0400 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2018-04-21 14:17:16 -0400 |
commit | 4eed12039fca7322305b550933b94856a408fed2 (patch) | |
tree | 74ca36dbe82649004a8370bbade157d34b293501 | |
parent | 8f61b92000c9a03a8d0740f268dc1befca66ad88 (diff) |
Handle Elsevier redirects better
-rw-r--r-- | article_epub/utilities.py | 14 |
1 files changed, 12 insertions, 2 deletions
```diff
diff --git a/article_epub/utilities.py b/article_epub/utilities.py
index 60a2d97..b9da761 100644
--- a/article_epub/utilities.py
+++ b/article_epub/utilities.py
@@ -1,6 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
 import sys
+from urllib.parse import unquote
 
 def url_from_title(title):
     print("Getting URL from title......")
@@ -33,7 +34,16 @@ def url_from_title(title):
 
 def url_from_doi(doi):
     print("Getting URL from DOI........",end='',flush=True)
-    url = requests.get('https://doi.org/'+doi,
-            headers={'User-Agent':'Mozilla/5.0'}).url
+    r = requests.get('https://doi.org/'+doi,
+            headers={'User-Agent':'Mozilla/5.0'})
+
+    #To handle Elsevier linkinghub redirects
+    soup = BeautifulSoup(r.content,'html.parser')
+    if soup.find('input',{'id':'redirectURL'}) is not None:
+        url_raw = soup.find('input',{'id':'redirectURL'})['value']
+        url = unquote(url_raw.split('_returnURL')[0])
+    else:
+        url = r.url
+
     print('done')
     return(url)
```