Source code for articledownloader.articledownloader

import requests
from requests.utils import quote
import re
import json
import scrapers
from autologging import logged, traced
from csv import reader
from time import sleep

@logged
class ArticleDownloader:

    def __init__(self, els_api_key=None, sleep_sec=1, timeout_sec=30):
        '''
        Initialize and set up API keys

        :param els_api_key: API key for Elsevier (for Elsevier's API)
        :type els_api_key: str

        :param sleep_sec: Sleep time between API calls (default = 1s)
        :type sleep_sec: int

        :param timeout_sec: Max time before timeout (default = 30s)
        :type timeout_sec: int
        '''
        self.els_api_key = els_api_key
        self.sleep_sec = sleep_sec
        self.timeout_sec = timeout_sec
    @traced
    def get_dois_from_journal_issn(self, issn, rows=500, pub_after=2000, mailto="null@null.com"):
        '''
        Grabs a set of unique DOIs based on a journal ISSN using the CrossRef API

        :param issn: The ISSN of the journal
        :type issn: str

        :param rows: the maximum number of DOIs to find
        :type rows: int

        :param pub_after: the minimum publication year for DOIs returned
        :type pub_after: int

        :param mailto: mailto address for API
        :type mailto: str

        :returns: the unique set of DOIs as a list
        :rtype: list
        '''
        dois = []
        base_url = 'https://api.crossref.org/journals/' + issn + '/works?filter=from-pub-date:' + str(pub_after)
        max_rows = 1000  # Defined by CrossRef API

        headers = {
            'Accept': 'application/json',
            'User-agent': 'mailto:' + mailto
        }

        if rows <= max_rows:  # No multi-query needed
            search_url = str(base_url) + '&rows=' + str(rows)
            response = requests.get(search_url, headers=headers, timeout=self.timeout_sec).json()

            for item in response["message"]["items"]:
                dois.append(item["DOI"])

        else:  # Need to split queries
            cursor = "*"
            keep_paging = True
            while keep_paging:
                sleep(self.sleep_sec)
                r = requests.get(base_url + "&rows=" + str(max_rows) + "&cursor=" + cursor,
                                 headers=headers, timeout=self.timeout_sec)
                cursor = quote(r.json()['message']['next-cursor'], safe='')
                if len(r.json()['message']['items']) == 0:
                    keep_paging = False

                for item in r.json()['message']['items']:
                    dois.append(item['DOI'])

        return list(set(dois))
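    # Illustrative usage sketch (not part of the original module). The ISSN and
    # mailto address below are placeholders; rows > 1000 triggers cursor-based
    # paging against the CrossRef API as implemented above.
    #
    #   downloader = ArticleDownloader(sleep_sec=1)
    #   dois = downloader.get_dois_from_journal_issn('1234-5678', rows=2000,
    #                                                pub_after=2010,
    #                                                mailto='you@example.com')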
    @traced
    def get_metadata_from_journal_issn(self, issn, rows=500, pub_after=2000, mailto="null@null.com"):
        '''
        Grabs metadata based on a journal ISSN using the CrossRef API

        :param issn: The ISSN of the journal
        :type issn: str

        :param rows: the maximum number of DOIs to find
        :type rows: int

        :param pub_after: the minimum publication year for DOIs returned
        :type pub_after: int

        :param mailto: mailto address for API
        :type mailto: str

        :returns: the metadata for the articles according to this ISSN
        :rtype: list
        '''
        metadata_records = []
        base_url = 'https://api.crossref.org/journals/' + issn + '/works?filter=from-pub-date:' + str(pub_after)
        max_rows = 1000  # Defined by CrossRef API

        headers = {
            'Accept': 'application/json',
            'User-agent': 'mailto:' + mailto
        }

        if rows <= max_rows:  # No multi-query needed
            search_url = str(base_url) + '&rows=' + str(rows)
            response = requests.get(search_url, headers=headers, timeout=self.timeout_sec).json()

            for item in response["message"]["items"]:
                try:
                    metadata_records.append({
                        "doi": item["DOI"],
                        "issn": item["ISSN"],
                        "title": item["title"][0],
                        "prefix": item["prefix"],
                        "journal": item["container-title"][0],
                        "publisher": item["publisher"],
                        "volume": item["volume"],
                        "issue": item["issue"],
                        "page": item["page"],
                    })
                except:
                    pass

        else:  # Need to split queries
            cursor = "*"
            keep_paging = True
            while keep_paging:
                sleep(self.sleep_sec)
                r = requests.get(base_url + "&rows=" + str(max_rows) + "&cursor=" + cursor,
                                 headers=headers, timeout=self.timeout_sec)
                cursor = quote(r.json()['message']['next-cursor'], safe='')
                if len(r.json()['message']['items']) == 0:
                    keep_paging = False

                for item in r.json()['message']['items']:
                    try:
                        metadata_records.append({
                            "doi": item["DOI"],
                            "issn": item["ISSN"],
                            "title": item["title"][0],
                            "prefix": item["prefix"],
                            "journal": item["container-title"][0],
                            "publisher": item["publisher"],
                            "volume": item["volume"],
                            "issue": item["issue"],
                            "page": item["page"],
                        })
                    except:
                        pass

        return metadata_records
    @traced
    def get_xml_from_doi(self, doi, writefile, mode):
        '''
        Downloads and writes an XML article to a file, given a DOI and operating mode

        :param doi: DOI string for the article we want to download
        :type doi: str

        :param writefile: file object to write to
        :type writefile: file

        :param mode: choose from {'elsevier' | 'aps'}, depending on how we wish to access the file
        :type mode: str

        :returns: True on successful write, False otherwise
        :rtype: bool
        '''
        if mode == 'elsevier':
            try:
                xml_url = 'https://api.elsevier.com/content/article/doi/' + doi + '?view=FULL'
                headers = {
                    'X-ELS-APIKEY': self.els_api_key,
                    'Accept': 'text/xml'
                }

                r = requests.get(xml_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                # API download limit exceeded
                return False

            return False

        if mode == 'aps':
            try:
                xml_url = 'http://harvest.aps.org/v2/journals/articles/' + doi
                headers = {
                    'Accept': 'text/xml'
                }

                r = requests.get(xml_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                # API download limit exceeded
                return False

            return False

        return False
    @traced
    def get_html_from_doi(self, doi, writefile, mode):
        '''
        Downloads and writes an HTML article to a file, given a DOI and operating mode

        :param doi: DOI string for the article we want to download
        :type doi: str

        :param writefile: file object to write to
        :type writefile: file

        :param mode: choose from {'elsevier' | 'springer' | 'acs' | 'ecs' | 'rsc' | 'nature' | 'wiley' | 'aaas' | 'emerald'}, depending on how we wish to access the file
        :type mode: str

        :returns: True on successful write, False otherwise
        :rtype: bool
        '''
        if mode == 'springer':
            base_url = 'http://link.springer.com/'
            api_url = base_url + doi + '.html'
            try:
                headers = {
                    'Accept': 'text/html',
                    'User-agent': 'Mozilla/5.0'
                }

                r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                return False

            return False

        if mode == 'wiley':
            base_url = 'http://onlinelibrary.wiley.com/doi/'
            api_url = base_url + doi + '/full'
            try:
                headers = {
                    'Accept': 'text/html',
                    'User-agent': 'Mozilla/5.0'
                }

                r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                return False

            return False

        if mode == 'acs':
            base_url = 'http://pubs.acs.org/doi/full/'
            api_url = base_url + doi
            try:
                headers = {
                    'Accept': 'text/html',
                    'User-agent': 'Mozilla/5.0'
                }

                r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                return False

            return False

        if mode == 'emerald':
            base_url = 'http://www.emeraldinsight.com/doi/full/'
            api_url = base_url + doi
            try:
                headers = {
                    'Accept': 'text/html',
                    'User-agent': 'Mozilla/5.0'
                }

                r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                return False

            return False

        if mode == 'rsc':
            html_string = 'articlehtml'
            download_url = 'https://doi.org/' + doi

            headers = {
                'Accept': 'text/html',
                'User-agent': 'Mozilla/5.0'
            }

            r = requests.get(download_url, headers=headers, timeout=self.timeout_sec)
            url = r.url
            url = url.encode('ascii')
            url = url.split('/')
            url = url[0] + '//' + url[2] + '/' + url[3] + '/' + url[4] + '/' + html_string + '/' + url[6] + '/' + url[7] + '/' + url[8]

            r = requests.get(url, stream=True, headers=headers, timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except:
                    return False

            return False

        if mode == 'nature':
            download_url = 'https://doi.org/' + doi

            headers = {
                'Accept': 'text/html',
                'User-agent': 'Mozilla/5.0'
            }

            r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except:
                    return False

            return False

        if mode == 'aaas':
            headers = {
                'Accept': 'text/html',
                'User-agent': 'Mozilla/5.0'
            }

            article_url = 'https://doi.org/' + doi
            resp = requests.get(article_url, headers=headers, timeout=self.timeout_sec)
            download_url = resp.url + '.full'  # Capture fulltext from redirect

            r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except:
                    return False

            return False

        if mode == 'ecs':
            headers = {
                'Accept': 'text/html',
                'User-agent': 'Mozilla/5.0'
            }

            article_url = 'https://doi.org/' + doi
            resp = requests.get(article_url, headers=headers, timeout=self.timeout_sec)
            download_url = resp.url + '.full'  # Capture fulltext from redirect

            r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except:
                    return False

            return False

        return False
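    # Illustrative usage sketch (not part of the original module). The DOI and
    # output filename are placeholders; the mode must match the article's publisher.
    #
    #   downloader = ArticleDownloader()
    #   with open('article.html', 'wb') as htmlfile:
    #       downloader.get_html_from_doi('10.1000/example.doi', htmlfile, 'springer')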
    @traced
    def get_pdf_from_doi(self, doi, writefile, mode):
        '''
        Downloads and writes a PDF article to a file, given a DOI and operating mode

        :param doi: DOI string for the article we want to download
        :type doi: str

        :param writefile: file object to write to
        :type writefile: file

        :param mode: choose from {'crossref' | 'elsevier' | 'rsc' | 'springer' | 'ecs' | 'nature' | 'acs'}, depending on how we wish to access the file
        :type mode: str

        :returns: True on successful write, False otherwise
        :rtype: bool
        '''
        if mode == 'crossref':
            base_url = 'http://api.crossref.org/works/'
            api_url = base_url + doi

            headers = {
                'Accept': 'application/json'
            }

            try:
                response = json.loads(requests.get(api_url, headers=headers, timeout=self.timeout_sec).text)
                pdf_url = response['message']['link'][0]['URL']
                app_type = str(response['message']['link'][0]['content-type'])

                if app_type in ['application/pdf', 'unspecified']:
                    headers['Accept'] = 'application/pdf'
                    r = requests.get(pdf_url, stream=True, headers=headers)
                    if r.status_code == 200:
                        for chunk in r.iter_content(2048):
                            writefile.write(chunk)
                        return True
            except:
                return False

            return False

        if mode == 'elsevier':
            try:
                pdf_url = 'http://api.elsevier.com/content/article/doi:' + doi + '?view=FULL'
                headers = {
                    'X-ELS-APIKEY': self.els_api_key,
                    'Accept': 'application/pdf'
                }

                r = requests.get(pdf_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                # API download limit exceeded
                return False

            return False

        if mode == 'rsc':
            scraper = scrapers.RSC()
            scrape_url = 'https://doi.org/' + doi
            download_url = None

            r = requests.get(scrape_url, timeout=self.timeout_sec)
            if r.status_code == 200:
                scraper.feed(r.content)

                if scraper.download_link is not None:
                    download_url = scraper.download_link

            if download_url is not None:
                headers = {
                    'Accept': 'application/pdf'
                }

                r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    try:
                        for chunk in r.iter_content(2048):
                            writefile.write(chunk)
                        return True
                    except:
                        return False

            return False

        if mode == 'ecs':
            scraper = scrapers.ECS()
            scrape_url = 'https://doi.org/' + doi
            download_url = None

            r = requests.get(scrape_url, timeout=self.timeout_sec)
            if r.status_code == 200:
                scraper.feed(r.content)

                if scraper.download_link is not None:
                    download_url = scraper.download_link

            if download_url is not None:
                headers = {
                    'Accept': 'application/pdf'
                }

                r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    try:
                        for chunk in r.iter_content(2048):
                            writefile.write(chunk)
                        return True
                    except:
                        return False

            return False

        if mode == 'nature':
            scraper = scrapers.Nature()
            scrape_url = 'https://doi.org/' + doi
            download_url = None

            r = requests.get(scrape_url, timeout=self.timeout_sec)
            if r.status_code == 200:
                scraper.feed(r.content)

                if scraper.download_link is not None:
                    download_url = scraper.download_link

            if download_url is not None:
                headers = {
                    'Accept': 'application/pdf'
                }

                r = requests.get(download_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    try:
                        for chunk in r.iter_content(2048):
                            writefile.write(chunk)
                        return True
                    except:
                        return False

            return False

        if mode == 'acs':
            base_url = 'http://pubs.acs.org/doi/pdf/'
            api_url = base_url + doi
            try:
                headers = {
                    'Accept': 'application/pdf',
                    'User-agent': 'Mozilla/5.0'
                }

                r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                return False

            return False

        if mode == 'springer':
            base_url = 'http://link.springer.com/content/pdf/'
            api_url = base_url + doi
            try:
                headers = {
                    'Accept': 'application/pdf',
                    'User-agent': 'Mozilla/5.0'
                }

                r = requests.get(api_url, stream=True, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except:
                return False

            return False

        return False
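    # Illustrative usage sketch (not part of the original module). The DOI and
    # output filename are placeholders; 'crossref' mode follows the PDF link
    # advertised in the CrossRef metadata, as implemented above.
    #
    #   downloader = ArticleDownloader()
    #   with open('article.pdf', 'wb') as pdffile:
    #       downloader.get_pdf_from_doi('10.1000/example.doi', pdffile, 'crossref')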
    @traced
    def get_abstract_from_doi(self, doi, mode):
        '''
        Returns abstract as a unicode string given a DOI

        :param doi: DOI string for the article we want to grab metadata for
        :type doi: str

        :param mode: Only supports 'elsevier' for now
        :type mode: str

        :returns: An abstract (or None on failure)
        :rtype: unicode
        '''
        if mode == 'elsevier':
            try:
                url = 'http://api.elsevier.com/content/article/doi/' + doi + '?view=FULL'
                headers = {
                    'X-ELS-APIKEY': self.els_api_key,
                    'Accept': 'application/json'
                }

                r = requests.get(url, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    abstract = unicode(json.loads(r.text)['full-text-retrieval-response']['coredata']['dc:description'])
                    return abstract
            except:
                # API download limit exceeded or no abstract exists
                return None

        return None
    @traced
    def get_title_from_doi(self, doi, mode):
        '''
        Returns title of an article as a unicode string given a DOI

        :param doi: DOI string for the article we want to grab metadata for
        :type doi: str

        :param mode: Only supports 'crossref' for now
        :type mode: str

        :returns: A title (or None on failure)
        :rtype: unicode
        '''
        if mode == 'crossref':
            try:
                url = 'http://api.crossref.org/works/' + doi
                headers = {
                    'X-ELS-APIKEY': self.els_api_key,
                    'Accept': 'application/json'
                }

                r = requests.get(url, headers=headers, timeout=self.timeout_sec)
                if r.status_code == 200:
                    title = unicode(r.json()['message']['title'][0])
                    return title
            except:
                # API download limit exceeded or no title exists
                return None

        return None
    @traced
    def load_queries_from_csv(self, csvf):
        '''
        Loads a list of queries from a CSV file

        :param csvf: file object containing a CSV file with one query per line
        :type csvf: file

        :returns: a list of queries, processed to be insertable into REST API (GET) calls
        :rtype: list
        '''
        csvf.seek(0)
        csvreader = reader(csvf, delimiter=',')
        queries = []
        for line in csvreader:
            # Build search query (assume 1st column is queries)
            query = quote(line[0])
            query = query.split()
            query = '+'.join(query)
            final_query = query

            queries.append(final_query)
        return queries
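
# Illustrative end-to-end usage sketch; not part of the original module. The API
# key, ISSN, DOI count, and mailto address below are placeholders, and network
# access is required for these calls to succeed.
if __name__ == '__main__':
    downloader = ArticleDownloader(els_api_key='YOUR_ELSEVIER_KEY', sleep_sec=1, timeout_sec=30)

    # Collect DOIs published after 2010 for a (placeholder) journal ISSN.
    dois = downloader.get_dois_from_journal_issn('1234-5678', rows=100, pub_after=2010,
                                                 mailto='you@example.com')

    # Try to fetch the first article as a PDF via the CrossRef-listed link.
    if dois:
        with open('article.pdf', 'wb') as pdf_file:
            success = downloader.get_pdf_from_doi(dois[0], pdf_file, 'crossref')
        print('Downloaded %s: %s' % (dois[0], success))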