Coverage for src/km3flux/utils/km3flux.py: 0%
#!/usr/bin/env python3
"""
Updates the files in the data folder by scraping the publications.
Existing data files are not re-downloaded.

Usage:
    km3flux [-spx] update
    km3flux (-h | --help)
    km3flux --version

Options:
    -x  Overwrite existing files when updating.
    -s  Include seasonal flux data from Honda.
    -p  Include production height tables from Honda.
    -h  Show this screen.
    --version  Show the version.

Currently only the Honda fluxes are downloaded from
https://www.icrr.u-tokyo.ac.jp/~mhonda/
"""
import os
import re
from urllib.parse import urljoin

try:
    import requests
    from bs4 import BeautifulSoup
    from docopt import docopt
    from tqdm import tqdm
except ModuleNotFoundError:
    print(
        "Install the optional dependencies to be able to manage the archive:\n\n"
        "    pip install 'km3flux[all]'\n"
    )
    exit(1)

import km3flux
from km3flux.data import basepath

URL = "https://www.icrr.u-tokyo.ac.jp/~mhonda/"

log = km3flux.logger.get_logger("km3flux")


def get_honda(include_seasonal=False, include_production_height=False, overwrite=False):
    """Grab all the Honda fluxes"""

    def archive_data(url, year, overwrite=False):
        """Archives a file from `url` under `year`.

        Currently, only Honda files are downloaded so there is no logic in place
        to manage multiple download target locations. Therefore "honda/" is
        hard-coded.
        """
        target_path = basepath / "honda" / year / os.path.basename(url)
        if not overwrite and os.path.exists(target_path):
            return
        os.makedirs(target_path.parent, exist_ok=True)
        r = requests.get(url)
        if not r.ok:
            log.error(
                "Unable to retrieve '%s', reason: '%s' (status code %d)",
                url,
                r.reason,
                r.status_code,
            )
        else:
            with open(target_path, "wb") as fobj:
                fobj.write(r.content)

    def get_all_data(url, year, overwrite=False, label=""):
        """Downloads all the datafiles from a given `url`"""
        p = requests.get(url)
        s = BeautifulSoup(p.content, "html.parser")
        hrefs = [a["href"] for a in s.find_all("a") if a["href"].endswith(".d.gz")]
        for href in tqdm(hrefs, label):
            data_url = urljoin(p.url, href)
            archive_data(data_url, year, overwrite)

    print("Updating Honda fluxes...")
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")

    for e in soup.find_all("a"):
        # yearly datasets
        m = re.search(r"(nflx(\d{4})/index.html)", e.attrs["href"])
        if m:
            suburl, year = m.groups()
            print(f"-> year {year}")
            get_all_data(urljoin(page.url, suburl), year, overwrite, "flux tables")
            if include_seasonal:
                p = requests.get(urljoin(page.url, suburl))
                s = BeautifulSoup(p.content, "html.parser")
                links = s.find_all("a")
                for _e in links:
                    ms = re.search(r"index-\d{4}.html", _e.attrs["href"])
                    if ms:
                        # use a separate name so the year page URL in `suburl`
                        # stays intact for the production-height step below
                        seasonal_url = urljoin(p.url, _e.attrs["href"])
                        get_all_data(seasonal_url, year, overwrite, "seasonal fluxes")

            if include_production_height:
                p = requests.get(urljoin(page.url, suburl))
                s = BeautifulSoup(p.content, "html.parser")
                links = s.find_all("a")
                for _e in links:
                    ms = re.search(r"index-height.html", _e.attrs["href"])
                    if ms:
                        height_url = urljoin(p.url, _e.attrs["href"])
                        get_all_data(
                            height_url, year, overwrite, "production height tables"
                        )
def main():
    args = docopt(__doc__, version=km3flux.version)

    get_honda(
        include_seasonal=args["-s"],
        include_production_height=args["-p"],
        overwrite=args["-x"],
    )


if __name__ == "__main__":
    main()
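
# A minimal usage sketch, assuming the optional dependencies from
# `pip install 'km3flux[all]'` are installed; the flags follow the docopt
# usage string at the top of this module:
#
#     km3flux update           # fetch the yearly Honda flux tables
#     km3flux -s -p update     # also grab seasonal fluxes and production heights
#     km3flux -x update        # re-download and overwrite already archived files
#
# The same update can also be triggered from Python (hypothetical snippet,
# calling the function defined above):
#
#     from km3flux.utils.km3flux import get_honda
#     get_honda(include_seasonal=True, include_production_height=True)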