# url_from_doi as it stands in article_epub/utilities.py after commit
# 4eed120 ("Handle Elsevier redirects better", Ken Kellner, 2018-04-21).
def url_from_doi(doi, timeout=30):
    """Resolve a DOI to the URL of the page it points at.

    Requests ``https://doi.org/<doi>`` and normally returns the URL of the
    response that comes back. If that page contains an
    ``<input id="redirectURL">`` element (the Elsevier linkinghub
    interstitial), the destination is taken from that element's ``value``
    attribute instead: everything from ``_returnURL`` onwards is dropped
    and the remainder is percent-decoded.

    Args:
        doi: DOI string; appended verbatim to ``https://doi.org/``.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Pass ``None`` to wait indefinitely, which was the behaviour
            before this parameter existed.

    Returns:
        The resolved URL as a string.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (including ``Timeout`` when ``timeout``
            elapses).
    """
    # Imported here so the function is self-contained; harmless if the
    # enclosing module already imports the same names at top level.
    from urllib.parse import unquote

    import requests
    from bs4 import BeautifulSoup

    print("Getting URL from DOI........", end='', flush=True)
    response = requests.get('https://doi.org/' + doi,
                            headers={'User-Agent': 'Mozilla/5.0'},
                            timeout=timeout)

    # To handle Elsevier linkinghub redirects: the real target is carried
    # in a form field rather than in an HTTP redirect.
    soup = BeautifulSoup(response.content, 'html.parser')
    redirect_input = soup.find('input', {'id': 'redirectURL'})

    # .get() instead of ['value'] so an element without the attribute (or
    # with an empty one) falls back to the response URL instead of raising
    # KeyError or returning an empty string.
    url_raw = redirect_input.get('value') if redirect_input is not None else None

    if url_raw:
        url = unquote(url_raw.split('_returnURL')[0])
    else:
        url = response.url

    print('done')
    return url