Coverage for src/km3flux/utils/km3flux.py: 0%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

66 statements  

1#!/usr/bin/env python3 

2""" 

3Updates the files in the data folder by scraping the publications. 

4Existing data files are not re-downloaded. 

5 

6Usage: 

7 km3flux [-spx] update 

8 km3flux (-h | --help) 

9 km3flux --version 

10 

11Options: 

12 -x Overwrite existing files when updating. 

13 -s Include seasonal flux data from Honda. 

14 -p Include production height tables from Honda. 

15 -h Show this screen. 

16 -v Show the version. 

17 

18Currently only the Honda fluxes are downloaded from 

19https://www.icrr.u-tokyo.ac.jp/~mhonda/ 

20""" 

21import os 

22import re 

23from urllib.parse import urljoin 

24 

25try: 

26 import requests 

27 from bs4 import BeautifulSoup 

28 from docopt import docopt 

29 from tqdm import tqdm 

30except ModuleNotFoundError: 

31 print( 

32 "Install the optional dependencies to be able to manage the archive:\n\n" 

33 " pip install 'km3flux[all]'\n" 

34 ) 

35 exit(1) 

36 

37import km3flux 

38from km3flux.data import basepath 

39 

# Landing page of M. Honda's atmospheric neutrino flux table archive;
# all yearly dataset links are scraped relative to this URL.
URL = "https://www.icrr.u-tokyo.ac.jp/~mhonda/"

# Package-wide logger (configured by km3flux.logger).
log = km3flux.logger.get_logger("km3flux")

43 

44 

def get_honda(include_seasonal=False, include_production_height=False, overwrite=False):
    """Grab all the Honda fluxes.

    Scrapes the Honda flux archive index page and downloads every yearly
    dataset (files ending in ``.d.gz``) into the local data folder under
    ``honda/<year>/``. Files already present are skipped unless ``overwrite``.

    Parameters
    ----------
    include_seasonal : bool
        Also download the seasonal flux tables (``index-YYYY.html`` pages)
        linked from each yearly index page.
    include_production_height : bool
        Also download the production height tables (``index-height.html``
        pages) linked from each yearly index page.
    overwrite : bool
        Re-download and overwrite files that already exist in the archive.
    """

    def archive_data(url, year, overwrite=False):
        """Archives the file at `url` under ``honda/<year>/``.

        Currently, only Honda files are downloaded so there is no logic in
        place to manage multiple download target locations; "honda/" is
        therefore hard-coded.
        """
        target_path = basepath / "honda" / year / os.path.basename(url)
        if not overwrite and target_path.exists():
            return
        target_path.parent.mkdir(parents=True, exist_ok=True)
        r = requests.get(url)
        if not r.ok:
            # Log and continue: one broken link should not abort the update.
            log.error(
                "Unable to retrieve '%s', reason: '%s' (status code %d)",
                url,
                r.reason,
                r.status_code,
            )
        else:
            with open(target_path, "wb") as fobj:
                fobj.write(r.content)

    def get_all_data(url, year, overwrite=False, label=""):
        """Downloads all the datafiles (``*.d.gz`` links) from a given `url`."""
        p = requests.get(url)
        s = BeautifulSoup(p.content, "html.parser")
        hrefs = [a["href"] for a in s.find_all("a") if a["href"].endswith(".d.gz")]
        for href in tqdm(hrefs, label):
            archive_data(urljoin(p.url, href), year, overwrite)

    print("Updating Honda fluxes...")
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")

    for e in soup.find_all("a"):
        # Yearly datasets are linked as nflxYYYY/index.html.
        m = re.search(r"(nflx(\d{4})/index.html)", e.attrs["href"])
        if not m:
            continue
        year_index, year = m.groups()
        # BUGFIX: resolve the yearly index URL once and keep it in its own
        # variable. Previously the seasonal loop below reassigned `suburl`,
        # so with both options enabled the production-height lookup was
        # performed against a seasonal page instead of the yearly index.
        year_url = urljoin(page.url, year_index)
        print(f"-> year {year}")
        get_all_data(year_url, year, overwrite, "flux tables")

        if include_seasonal:
            p = requests.get(year_url)
            s = BeautifulSoup(p.content, "html.parser")
            for _e in s.find_all("a"):
                # Seasonal tables are linked as index-YYYY.html.
                if re.search(r"index-\d{4}.html", _e.attrs["href"]):
                    seasonal_url = urljoin(p.url, _e.attrs["href"])
                    get_all_data(seasonal_url, year, overwrite, "seasonal fluxes")

        if include_production_height:
            p = requests.get(year_url)
            s = BeautifulSoup(p.content, "html.parser")
            for _e in s.find_all("a"):
                # Production height tables are linked as index-height.html.
                if re.search(r"index-height.html", _e.attrs["href"]):
                    height_url = urljoin(p.url, _e.attrs["href"])
                    get_all_data(
                        height_url, year, overwrite, "production height tables"
                    )

113 

114 

def main():
    """Entry point: parse CLI options and run the archive update.

    BUGFIX: ``include_seasonal`` previously read ``args["-x"]`` (the
    overwrite flag), so ``-s`` was ignored and ``-x`` silently enabled
    seasonal downloads. It now reads ``args["-s"]`` as documented.
    """
    args = docopt(__doc__, version=km3flux.version)

    get_honda(
        include_seasonal=args["-s"],
        include_production_height=args["-p"],
        overwrite=args["-x"],
    )

123 

124 

# Allow running this module directly as a script (`python km3flux.py ...`).
if __name__ == "__main__":
    main()