import requests
from requests.utils import quote
import re
import json
import scrapers
from autologging import logged, traced
from csv import reader
from time import sleep
@logged
class ArticleDownloader:
def __init__(self, els_api_key=None, sleep_sec=1, timeout_sec=30):
'''
Initialize and set up API keys
:param els_api_key: API key for Elsevier (for Elsevier's API)
:type els_api_key: str
:param sleep_sec: Sleep time between API calls (default = 1s)
:type sleep_sec: int
:param timeout_sec: Max time before timeout (default = 30s)
:type timeout_sec: int
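
        Example -- a minimal setup sketch (the API key below is a placeholder)::

            downloader = ArticleDownloader(els_api_key='YOUR_ELSEVIER_KEY', sleep_sec=1, timeout_sec=30)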
'''
self.els_api_key = els_api_key
self.sleep_sec = sleep_sec
self.timeout_sec = timeout_sec
    @traced
def get_dois_from_search(self, query, rows=500, mailto="null@null.com"):
'''
Grabs a set of unique DOIs based on a search query using the CrossRef API
        :param query: the search string (already URL-encoded, e.g. as produced by load_queries_from_csv)
:type query: str
:param rows: the maximum number of DOIs to find
:type rows: int
:param mailto: mailto address for API
        :type mailto: str
:returns: the unique set of DOIs as a list
:rtype: list
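
        Example -- a hypothetical usage sketch (query and mailto address are placeholders; requires network access)::

            dois = downloader.get_dois_from_search('machine+learning', rows=100, mailto='you@example.com')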
'''
dois = []
base_url = 'https://api.crossref.org/works?query='
max_rows = 1000 #Defined by CrossRef API
headers = {
'Accept': 'application/json',
'User-agent': 'mailto:' + mailto
}
if rows <= max_rows: #No multi-query needed
search_url = base_url + query + '&rows=' + str(rows)
response = requests.get(search_url, headers=headers, timeout=self.timeout_sec).json()
for item in response["message"]["items"]:
dois.append(item["DOI"])
else: #Need to split queries
cursor = "*"
keep_paging = True
            while keep_paging:
                sleep(self.sleep_sec)
                r = requests.get(base_url + query + '&rows=' + str(max_rows) + '&cursor=' + cursor,
                    headers=headers, timeout=self.timeout_sec)
                response = r.json() #Parse the JSON once per page
                cursor = quote(response['message']['next-cursor'], safe='')
                if len(response['message']['items']) == 0:
                    keep_paging = False
                for item in response['message']['items']:
                    dois.append(item['DOI'])
return list(set(dois))
    @traced
def get_dois_from_journal_issn(self, issn, rows=500, pub_after=2000, mailto="null@null.com"):
'''
Grabs a set of unique DOIs based on a journal ISSN using the CrossRef API
:param issn: The ISSN of the journal
:type issn: str
:param rows: the maximum number of DOIs to find
:type rows: int
:param pub_after: the minimum publication year for DOIs returned
:type pub_after: int
:param mailto: mailto address for API
        :type mailto: str
:returns: the unique set of DOIs as a list
:rtype: list
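
        Example -- a hypothetical usage sketch (the ISSN below is illustrative; requires network access)::

            dois = downloader.get_dois_from_journal_issn('1476-4687', rows=200, pub_after=2010, mailto='you@example.com')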
'''
dois = []
base_url = 'https://api.crossref.org/journals/' + issn + '/works?filter=from-pub-date:' + str(pub_after)
max_rows = 1000 #Defined by CrossRef API
headers = {
'Accept': 'application/json',
'User-agent': 'mailto:' + mailto
}
if rows <= max_rows: #No multi-query needed
            search_url = base_url + '&rows=' + str(rows)
response = requests.get(search_url, headers=headers, timeout=self.timeout_sec).json()
for item in response["message"]["items"]:
dois.append(item["DOI"])
else: #Need to split queries
cursor = "*"
keep_paging = True
            while keep_paging:
                sleep(self.sleep_sec)
                r = requests.get(base_url + '&rows=' + str(max_rows) + '&cursor=' + cursor,
                    headers=headers, timeout=self.timeout_sec)
                response = r.json() #Parse the JSON once per page
                cursor = quote(response['message']['next-cursor'], safe='')
                if len(response['message']['items']) == 0:
                    keep_paging = False
                for item in response['message']['items']:
                    dois.append(item['DOI'])
return list(set(dois))
    @traced
def get_metadata_from_journal_issn(self, issn, rows=500, pub_after=2000, mailto="null@null.com"):
'''
Grabs metadata based on a journal ISSN using the CrossRef API
:param issn: The ISSN of the journal
:type issn: str
:param rows: the maximum number of DOIs to find
:type rows: int
:param pub_after: the minimum publication year for DOIs returned
:type pub_after: int
:param mailto: mailto address for API
        :type mailto: str
:returns: the metadata for the articles according to this ISSN
:rtype: list
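
        Example -- a hypothetical usage sketch (illustrative ISSN; each record is a dict keyed by doi, title, journal, etc.)::

            records = downloader.get_metadata_from_journal_issn('1476-4687', rows=50, mailto='you@example.com')
            titles = [record['title'] for record in records]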
'''
metadata_records = []
base_url = 'https://api.crossref.org/journals/' + issn + '/works?filter=from-pub-date:' + str(pub_after)
max_rows = 1000 #Defined by CrossRef API
headers = {
'Accept': 'application/json',
'User-agent': 'mailto:' + mailto
}
if rows <= max_rows: #No multi-query needed
            search_url = base_url + '&rows=' + str(rows)
response = requests.get(search_url, headers=headers, timeout=self.timeout_sec).json()
for item in response["message"]["items"]:
try:
metadata_records.append({
"doi": item["DOI"],
"issn": item["ISSN"],
"title": item["title"][0],
"prefix": item["prefix"],
"journal": item["container-title"][0],
"publisher": item["publisher"],
"volume": item["volume"],
"issue": item["issue"],
"page": item["page"],
})
                except (KeyError, IndexError): #Skip records missing an expected field
pass
else: #Need to split queries
cursor = "*"
keep_paging = True
            while keep_paging:
                sleep(self.sleep_sec)
                r = requests.get(base_url + '&rows=' + str(max_rows) + '&cursor=' + cursor,
                    headers=headers, timeout=self.timeout_sec)
                response = r.json() #Parse the JSON once per page
                cursor = quote(response['message']['next-cursor'], safe='')
                if len(response['message']['items']) == 0:
                    keep_paging = False
                for item in response['message']['items']:
try:
metadata_records.append({
"doi": item["DOI"],
"issn": item["ISSN"],
"title": item["title"][0],
"prefix": item["prefix"],
"journal": item["container-title"][0],
"publisher": item["publisher"],
"volume": item["volume"],
"issue": item["issue"],
"page": item["page"],
})
                    except (KeyError, IndexError): #Skip records missing an expected field
pass
return metadata_records
    @traced
def get_xml_from_doi(self, doi, writefile, mode):
'''
        Downloads and writes an XML article to a file, given a DOI and operating mode
:param doi: DOI string for the article we want to download
:type doi: str
:param writefile: file object to write to
:type writefile: file
:param mode: choose from {'elsevier' | 'aps'}, depending on how we wish to access the file
:type mode: str
:returns: True on successful write, False otherwise
:rtype: bool
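
        Example -- a hypothetical usage sketch (placeholder DOI; open the file in binary mode since chunks are bytes)::

            with open('article.xml', 'wb') as f:
                success = downloader.get_xml_from_doi('10.1016/j.example.2020.01.001', f, mode='elsevier')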
'''
if mode == 'elsevier':
try:
                xml_url = 'https://api.elsevier.com/content/article/doi/' + doi + '?view=FULL'
headers = {
'X-ELS-APIKEY': self.els_api_key,
'Accept': 'text/xml'
}
r = requests.get(xml_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
# API download limit exceeded
return False
return False
if mode == 'aps':
try:
                xml_url = 'http://harvest.aps.org/v2/journals/articles/' + doi
headers = {
'Accept': 'text/xml'
}
r = requests.get(xml_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
# API download limit exceeded
return False
return False
return False
    @traced
def get_html_from_doi(self, doi, writefile, mode):
'''
Downloads and writes an HTML article to a file, given a DOI and operating mode
:param doi: DOI string for the article we want to download
:type doi: str
:param writefile: file object to write to
:type writefile: file
:param mode: choose from {'elsevier' | 'springer' | 'acs' | 'ecs' | 'rsc' | 'nature' | 'wiley' | 'aaas' | 'emerald'}, depending on how we wish to access the file
:type mode: str
:returns: True on successful write, False otherwise
:rtype: bool
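
        Example -- a hypothetical usage sketch (placeholder DOI; open the file in binary mode since chunks are bytes)::

            with open('article.html', 'wb') as f:
                success = downloader.get_html_from_doi('10.1007/s00000-000-0000-0', f, mode='springer')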
'''
if mode == 'springer':
base_url = 'http://link.springer.com/'
api_url = base_url + doi + '.html'
try:
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
return False
return False
if mode == 'wiley':
base_url = 'http://onlinelibrary.wiley.com/doi/'
api_url = base_url + doi + '/full'
try:
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
return False
return False
if mode == 'acs':
base_url = 'http://pubs.acs.org/doi/full/'
api_url = base_url + doi
try:
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
return False
return False
if mode == 'emerald':
base_url = 'http://www.emeraldinsight.com/doi/full/'
api_url = base_url + doi
try:
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
return False
return False
if mode == 'rsc':
html_string = 'articlehtml'
download_url = 'https://doi.org/' + doi
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(download_url, headers=headers, timeout=self.timeout_sec)
            url_parts = r.url.split('/') #r.url is the landing page after following the DOI redirect
            url_parts[5] = html_string #Swap 'articlelanding' for 'articlehtml' in the path
            url = '/'.join(url_parts) #Note: bytes from .encode() cannot be split with a str under Python 3
r = requests.get(url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
try:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
                except Exception:
return False
return False
if mode == 'nature':
download_url = 'https://doi.org/' + doi
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
try:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
                except Exception:
return False
return False
if mode == 'aaas':
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
article_url = 'https://doi.org/' + doi
resp = requests.get(article_url, headers=headers, timeout=self.timeout_sec)
download_url = resp.url + '.full' #Capture fulltext from redirect
r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
try:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
                except Exception:
return False
return False
if mode == 'ecs':
headers = {
'Accept': 'text/html',
'User-agent': 'Mozilla/5.0'
}
article_url = 'https://doi.org/' + doi
resp = requests.get(article_url, headers=headers, timeout=self.timeout_sec)
download_url = resp.url + '.full' #Capture fulltext from redirect
r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
try:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
                except Exception:
return False
return False
return False
    @traced
def get_pdf_from_doi(self, doi, writefile, mode):
'''
Downloads and writes a PDF article to a file, given a DOI and operating mode
:param doi: DOI string for the article we want to download
:type doi: str
:param writefile: file object to write to
:type writefile: file
:param mode: choose from {'crossref' | 'elsevier' | 'rsc' | 'springer' | 'ecs' | 'nature' | 'acs'}, depending on how we wish to access the file
:type mode: str
:returns: True on successful write, False otherwise
:rtype: bool
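
        Example -- a hypothetical usage sketch (placeholder DOI; 'crossref' mode only succeeds when the publisher exposes a PDF link)::

            with open('article.pdf', 'wb') as f:
                success = downloader.get_pdf_from_doi('10.1038/srep00000', f, mode='crossref')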
'''
if mode == 'crossref':
            base_url = 'https://api.crossref.org/works/' #Use HTTPS, matching the other CrossRef calls
api_url = base_url + doi
headers = {
'Accept': 'application/json'
}
try:
response = json.loads(requests.get(api_url, headers=headers, timeout=self.timeout_sec).text)
pdf_url = response['message']['link'][0]['URL']
app_type = str(response['message']['link'][0]['content-type'])
if app_type in ['application/pdf', 'unspecified']:
headers['Accept'] = 'application/pdf'
                    r = requests.get(pdf_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
return False
return False
if mode == 'elsevier':
try:
                pdf_url = 'https://api.elsevier.com/content/article/doi/' + doi + '?view=FULL' #Same endpoint style as get_xml_from_doi
headers = {
'X-ELS-APIKEY': self.els_api_key,
'Accept': 'application/pdf'
}
r = requests.get(pdf_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
# API download limit exceeded
return False
return False
if mode == 'rsc':
scraper = scrapers.RSC()
scrape_url = 'https://doi.org/' + doi
download_url = None
r = requests.get(scrape_url, timeout=self.timeout_sec)
if r.status_code == 200:
                scraper.feed(r.text) #feed() expects a decoded string, not bytes, under Python 3
if scraper.download_link is not None:
download_url = scraper.download_link
if download_url is not None:
headers = {
'Accept': 'application/pdf'
}
r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
try:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
                    except Exception:
return False
return False
if mode == 'ecs':
scraper = scrapers.ECS()
scrape_url = 'https://doi.org/' + doi
download_url = None
r = requests.get(scrape_url, timeout=self.timeout_sec)
if r.status_code == 200:
                scraper.feed(r.text) #feed() expects a decoded string, not bytes, under Python 3
if scraper.download_link is not None:
download_url = scraper.download_link
if download_url is not None:
headers = {
'Accept': 'application/pdf'
}
r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
try:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
                    except Exception:
return False
return False
if mode == 'nature':
scraper = scrapers.Nature()
scrape_url = 'https://doi.org/' + doi
download_url = None
r = requests.get(scrape_url, timeout=self.timeout_sec)
if r.status_code == 200:
                scraper.feed(r.text) #feed() expects a decoded string, not bytes, under Python 3
if scraper.download_link is not None:
download_url = scraper.download_link
if download_url is not None:
headers = {
'Accept': 'application/pdf'
}
r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
try:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
                    except Exception:
return False
return False
if mode == 'acs':
base_url = 'http://pubs.acs.org/doi/pdf/'
api_url = base_url + doi
try:
headers = {
'Accept': 'application/pdf',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
return False
return False
if mode == 'springer':
base_url = 'http://link.springer.com/content/pdf/'
api_url = base_url + doi
try:
headers = {
'Accept': 'application/pdf',
'User-agent': 'Mozilla/5.0'
}
r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
for chunk in r.iter_content(2048):
writefile.write(chunk)
return True
            except Exception:
return False
return False
return False
    @traced
def get_abstract_from_doi(self, doi, mode):
'''
        Returns an abstract as a string, given a DOI
:param doi: DOI string for the article we want to grab metadata for
:type doi: str
:param mode: Only supports 'elsevier' for now
:type mode: str
:returns: An abstract (or None on failure)
        :rtype: str
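
        Example -- a hypothetical usage sketch (placeholder DOI; requires a valid Elsevier API key)::

            abstract = downloader.get_abstract_from_doi('10.1016/j.example.2020.01.001', mode='elsevier')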
'''
if mode == 'elsevier':
try:
                url = 'https://api.elsevier.com/content/article/doi/' + doi + '?view=FULL'
headers = {
'X-ELS-APIKEY': self.els_api_key,
'Accept': 'application/json'
}
r = requests.get(url, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
                    abstract = str(json.loads(r.text)['full-text-retrieval-response']['coredata']['dc:description'])
return abstract
            except Exception:
# API download limit exceeded or no abstract exists
return None
return None
    @traced
def get_title_from_doi(self, doi, mode):
'''
        Returns the title of an article as a string, given a DOI
:param doi: DOI string for the article we want to grab metadata for
:type doi: str
:param mode: Only supports 'crossref' for now
:type mode: str
:returns: A title (or None on failure)
        :rtype: str
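
        Example -- a hypothetical usage sketch (placeholder DOI)::

            title = downloader.get_title_from_doi('10.1038/srep00000', mode='crossref')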
'''
if mode == 'crossref':
try:
                url = 'https://api.crossref.org/works/' + doi
                headers = {
                    'Accept': 'application/json' #The Elsevier API key is not needed (or wanted) for CrossRef calls
                }
r = requests.get(url, headers=headers, timeout=self.timeout_sec)
if r.status_code == 200:
                    title = str(r.json()['message']['title'][0])
return title
            except Exception:
# API download limit exceeded or no title exists
return None
return None
    @traced
def load_queries_from_csv(self, csvf):
'''
Loads a list of queries from a CSV file
:param csvf: file object containing a CSV file with one query per line
:type csvf: file
:returns: a list of queries, processed to be insertable into REST API (GET) calls
:rtype: list
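
        Example -- a hypothetical usage sketch (assumes 'queries.csv' holds one search phrase per row, first column)::

            with open('queries.csv', 'r') as f:
                queries = downloader.load_queries_from_csv(f)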
'''
csvf.seek(0)
csvreader = reader(csvf, delimiter=',')
queries = []
for line in csvreader:
            #Build search query (assume 1st column is queries)
            #quote() encodes spaces as %20, so split into words first, then percent-encode each and join with '+'
            words = line[0].split()
            query = '+'.join([quote(word) for word in words])
            queries.append(query)
return queries