# Source code for jsonstat.downloader

# -*- coding: utf-8 -*-
# This file is part of https://github.com/26fe/jsonstat.py
# Copyright (C) 2016-2017 gf <gf@26fe.com>
# See LICENSE file

# stdlib
from __future__ import print_function
from __future__ import unicode_literals
import hashlib
import io
import os
import time

# packages
import requests

# jsonstat
from jsonstat.exceptions import JsonStatException


class Downloader(object):
    """Helper class to download json stat files.

    It has a very simple cache mechanism: every downloaded page is stored as a
    UTF-8 text file inside ``cache_dir`` and served from there on later calls
    for as long as the file is younger than the applicable time to live.
    """

    def __init__(self, cache_dir="./data", time_to_live=None):
        """Initialize the downloader.

        :param cache_dir: directory where downloaded files are stored;
                          if None nothing is read from or written to disk
        :param time_to_live: default number of seconds a cached file stays valid;
                             None means forever, 0 (or less) means a cached
                             copy is never reused
        """
        if cache_dir is not None:
            self.__cache_dir = os.path.abspath(cache_dir)
        else:
            self.__cache_dir = None
        self.__time_to_live = time_to_live
        self.__session = requests.session()

    def cache_dir(self):
        """Return the absolute path of the cache directory (None if caching is disabled)."""
        return self.__cache_dir

    def download(self, url, filename=None, time_to_live=None):
        """Download url from internet.

        Store the downloaded content into <cache_dir>/file.
        If <cache_dir>/file exists and is still fresh, it returns content from disk.

        :param url: page to be downloaded
        :param filename: name of the file (inside cache_dir) where to store the content of url;
                         None to derive the name from the md5 of the url
        :param time_to_live: how many seconds a cached file stays valid for this call;
                             None uses the default given to the constructor,
                             0 doesn't use the cached version if any
        :returns: the content of url (text type, never bytes)
        :raises requests.HTTPError: when the server answers with an error status
        :raises JsonStatException: when cache_dir exists but is not a directory
        """
        pathname = self.__build_pathname(filename, url)

        # a per-call value overrides the default chosen at construction time
        if time_to_live is None:
            time_to_live = self.__time_to_live

        if self.__is_cached(pathname, time_to_live):
            try:
                return self.__read_page_from_file(pathname)
            except UnicodeDecodeError:
                # the file was written by an older version using a non UTF-8
                # platform encoding: treat it as a cache miss and refresh it
                pass

        response = self.__session.get(url)
        response.raise_for_status()
        # note: html must be a text type not a byte type
        html = response.text
        self.__write_page_to_cache(pathname, html)
        return html

    def __build_pathname(self, filename, url):
        """Return the cache pathname for url, or None when caching is disabled.

        :param filename: file name to use inside cache_dir, None to use the md5 of url
        :param url: url used to derive the file name when filename is None
        """
        if self.__cache_dir is None:
            return None
        if filename is None:
            filename = hashlib.md5(url.encode('utf-8')).hexdigest()
        return os.path.join(self.__cache_dir, filename)

    def __is_cached(self, pathname, time_to_live=None):
        """Check if pathname exists and is still fresh.

        :param pathname: file to check, None when caching is disabled
        :param time_to_live: maximum age in seconds; None means the file never expires,
                             0 (or less) means the file is never considered cached
        :returns: True if the file can be retrieved from the disk (cache)
        """
        if pathname is None:
            return False
        if time_to_live is not None and time_to_live <= 0:
            return False

        # stat directly instead of exists() + stat(): the file may disappear in between
        try:
            mtime = os.stat(pathname).st_mtime
        except OSError:
            return False

        if time_to_live is None:
            return True
        return time.time() - mtime < time_to_live

    def __write_page_to_cache(self, pathname, content):
        """Write content to pathname, creating the cache directory when needed.

        :param pathname: destination file, None when caching is disabled (nothing is written)
        :param content: text to store
        :raises JsonStatException: when cache_dir exists but is not a directory
        """
        if pathname is None:
            return

        cache_dir = self.__cache_dir
        # create cache directory only the first time it is needed
        if not os.path.isdir(cache_dir):
            if os.path.exists(cache_dir):
                msg = "cache_dir '{}' is not a directory".format(cache_dir)
                raise JsonStatException(msg)
            try:
                os.makedirs(cache_dir)
            except OSError:
                # another process may have created it in the meantime
                if not os.path.isdir(cache_dir):
                    raise

        # note:
        # the file is opened in text mode with an explicit encoding so that the
        # bytes on disk do not depend on the python version or on the platform locale
        with io.open(pathname, 'w', encoding='utf-8') as f:
            f.write(content)

    @staticmethod
    def __read_page_from_file(pathname):
        """Read and return the UTF-8 text stored in pathname.

        :param pathname: file to read
        :raises UnicodeDecodeError: when the file is not valid UTF-8
        """
        with io.open(pathname, 'r', encoding='utf-8') as f:
            return f.read()