Source code for km3flux.utils.km3flux

#!/usr/bin/env python3
"""
Updates the files in the data folder by scraping the publications.
Existing data files are not re-downloaded.

Usage:
    km3flux [-spx] update
    km3flux (-h | --help)
    km3flux --version

Options:
    -x    Overwrite existing files when updating.
    -s    Include seasonal flux data from Honda.
    -p    Include production height tables from Honda.
    -h --help    Show this screen.
    --version    Show the version.

Currently only the Honda fluxes are downloaded from
https://www.icrr.u-tokyo.ac.jp/~mhonda/
"""
import os
import re
import sys
from urllib.parse import urljoin

try:
    import requests
    from bs4 import BeautifulSoup
    from docopt import docopt
    from tqdm import tqdm
except ModuleNotFoundError:
    print(
        "Install the optional dependencies to be able to manage the archive:\n\n"
        "    pip install 'km3flux[all]'\n"
    )
    sys.exit(1)

import km3flux
from km3flux.data import basepath

URL = "https://www.icrr.u-tokyo.ac.jp/~mhonda/"

log = km3flux.logger.get_logger("km3flux")


def get_honda(include_seasonal=False, include_production_height=False, overwrite=False):
    """Grab all the Honda fluxes"""

    def archive_data(url, year, overwrite=False):
        """Archives a file from `url` under `year`.

        Currently, only Honda files are downloaded so there is no logic in
        place to manage multiple download target locations. Therefore
        "honda/" is hard-coded.
        """
        target_path = basepath / "honda" / year / os.path.basename(url)
        if not overwrite and os.path.exists(target_path):
            return
        os.makedirs(target_path.parent, exist_ok=True)
        r = requests.get(url)
        if not r.ok:
            log.error(
                "Unable to retrieve '%s', reason: '%s' (status code %d)",
                url,
                r.reason,
                r.status_code,
            )
        else:
            with open(target_path, "wb") as fobj:
                fobj.write(r.content)

    def get_all_data(url, year, overwrite=False, label=""):
        """Downloads all the datafiles from a given `url`"""
        p = requests.get(url)
        s = BeautifulSoup(p.content, "html.parser")
        hrefs = [a["href"] for a in s.find_all("a") if a["href"].endswith(".d.gz")]
        for href in tqdm(hrefs, label):
            data_url = urljoin(p.url, href)
            archive_data(data_url, year, overwrite)

    print("Updating Honda fluxes...")
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    for e in soup.find_all("a"):
        # yearly datasets
        m = re.search(r"(nflx(\d{4})/index.html)", e.attrs["href"])
        if m:
            suburl, year = m.groups()
            print(f"-> year {year}")
            get_all_data(urljoin(page.url, suburl), year, overwrite, "flux tables")

            if include_seasonal:
                p = requests.get(urljoin(page.url, suburl))
                s = BeautifulSoup(p.content, "html.parser")
                links = s.find_all("a")
                for _e in links:
                    ms = re.search(r"index-\d{4}.html", _e.attrs["href"])
                    if ms:
                        suburl = urljoin(p.url, _e.attrs["href"])
                        get_all_data(suburl, year, overwrite, "seasonal fluxes")

            if include_production_height:
                p = requests.get(urljoin(page.url, suburl))
                s = BeautifulSoup(p.content, "html.parser")
                links = s.find_all("a")
                for _e in links:
                    ms = re.search(r"index-height.html", _e.attrs["href"])
                    if ms:
                        suburl = urljoin(p.url, _e.attrs["href"])
                        get_all_data(
                            suburl, year, overwrite, "production height tables"
                        )
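
# Example (sketch): get_honda() can also be called programmatically, e.g. to
# mirror the complete archive including the optional tables:
#
#     get_honda(include_seasonal=True, include_production_height=True,
#               overwrite=True)
#
# Files are written below ``basepath / "honda" / <year>``; existing files are
# kept unless ``overwrite`` is set.
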

def main():
    args = docopt(__doc__, version=km3flux.version)
    get_honda(
        include_seasonal=args["-s"],
        include_production_height=args["-p"],
        overwrite=args["-x"],
    )


if __name__ == "__main__":
    main()
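
# Typical invocations, derived from the usage spec above:
#
#     km3flux update        # fetch the yearly flux tables only
#     km3flux -sp update    # also fetch seasonal and production height data
#     km3flux -x update     # re-download, overwriting existing files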