Coverage for src/km3flux/utils/km3flux.py: 0%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

66 statements  

1#!/usr/bin/env python3 

2""" 

3Updates the files in the data folder by scraping the publications. 

4Existing data files are not re-downloaded. 

5 

6Usage: 

7 km3flux [-spx] update 

8 km3flux (-h | --help) 

9 km3flux --version 

10 

11Options: 

12 -x Overwrite existing files when updating. 

13 -s Include seasonal flux data from Honda. 

14 -p Include production height tables from Honda. 

15 -h Show this screen. 

16 -v Show the version. 

17 

18Currently only the Honda fluxes are downloaded from 

19https://www.icrr.u-tokyo.ac.jp/~mhonda/ 

20""" 

21import os 

22import re 

23from urllib.parse import urljoin 

24 

25try: 

26 import requests 

27 from bs4 import BeautifulSoup 

28 from docopt import docopt 

29 from tqdm import tqdm 

30except ModuleNotFoundError: 

31 print( 

32 "Install the optional dependencies to be able to manage the archive:\n\n" 

33 " pip install 'km3flux[all]'\n" 

34 ) 

35 exit(1) 

36 

37import km3flux 

38from km3flux.data import basepath 

39 

# Landing page of M. Honda's atmospheric neutrino flux table archive;
# all yearly dataset links are scraped relative to this URL.
URL = "https://www.icrr.u-tokyo.ac.jp/~mhonda/"

# Package-wide logger (configured by km3flux.logger).
log = km3flux.logger.get_logger("km3flux")

43 

44 

def get_honda(include_seasonal=False, include_production_height=False, overwrite=False):
    """Grab all the Honda fluxes.

    Scrapes the Honda flux archive index page and downloads every yearly
    dataset (files ending in ``.d.gz``) into the local data folder under
    ``honda/<year>/``. Files already present are skipped unless ``overwrite``.

    Parameters
    ----------
    include_seasonal : bool
        Also download the seasonal flux tables (``index-YYYY.html`` pages)
        linked from each yearly index page.
    include_production_height : bool
        Also download the production height tables (``index-height.html``
        pages) linked from each yearly index page.
    overwrite : bool
        Re-download and overwrite files that already exist in the archive.
    """

    def archive_data(url, year, overwrite=False):
        """Archives the file at `url` under ``honda/<year>/``.

        Currently, only Honda files are downloaded so there is no logic in
        place to manage multiple download target locations; "honda/" is
        therefore hard-coded.
        """
        target_path = basepath / "honda" / year / os.path.basename(url)
        if not overwrite and target_path.exists():
            return
        target_path.parent.mkdir(parents=True, exist_ok=True)
        r = requests.get(url)
        if not r.ok:
            # Log and continue: one broken link should not abort the update.
            log.error(
                "Unable to retrieve '%s', reason: '%s' (status code %d)",
                url,
                r.reason,
                r.status_code,
            )
        else:
            with open(target_path, "wb") as fobj:
                fobj.write(r.content)

    def get_all_data(url, year, overwrite=False, label=""):
        """Downloads all the datafiles (``*.d.gz`` links) from a given `url`."""
        p = requests.get(url)
        s = BeautifulSoup(p.content, "html.parser")
        hrefs = [a["href"] for a in s.find_all("a") if a["href"].endswith(".d.gz")]
        for href in tqdm(hrefs, label):
            archive_data(urljoin(p.url, href), year, overwrite)

    print("Updating Honda fluxes...")
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")

    for e in soup.find_all("a"):
        # Yearly datasets are linked as nflxYYYY/index.html.
        m = re.search(r"(nflx(\d{4})/index.html)", e.attrs["href"])
        if not m:
            continue
        year_index, year = m.groups()
        # BUGFIX: resolve the yearly index URL once and keep it in its own
        # variable. Previously the seasonal loop below reassigned `suburl`,
        # so with both options enabled the production-height lookup was
        # performed against a seasonal page instead of the yearly index.
        year_url = urljoin(page.url, year_index)
        print(f"-> year {year}")
        get_all_data(year_url, year, overwrite, "flux tables")

        if include_seasonal:
            p = requests.get(year_url)
            s = BeautifulSoup(p.content, "html.parser")
            for _e in s.find_all("a"):
                # Seasonal tables are linked as index-YYYY.html.
                if re.search(r"index-\d{4}.html", _e.attrs["href"]):
                    seasonal_url = urljoin(p.url, _e.attrs["href"])
                    get_all_data(seasonal_url, year, overwrite, "seasonal fluxes")

        if include_production_height:
            p = requests.get(year_url)
            s = BeautifulSoup(p.content, "html.parser")
            for _e in s.find_all("a"):
                # Production height tables are linked as index-height.html.
                if re.search(r"index-height.html", _e.attrs["href"]):
                    height_url = urljoin(p.url, _e.attrs["href"])
                    get_all_data(
                        height_url, year, overwrite, "production height tables"
                    )

113 

114 

def main():
    """Entry point: parse CLI options and run the archive update.

    BUGFIX: ``include_seasonal`` previously read ``args["-x"]`` (the
    overwrite flag), so ``-s`` was ignored and ``-x`` silently enabled
    seasonal downloads. It now reads ``args["-s"]`` as documented.
    """
    args = docopt(__doc__, version=km3flux.version)

    get_honda(
        include_seasonal=args["-s"],
        include_production_height=args["-p"],
        overwrite=args["-x"],
    )

123 

124 

# Allow running this module directly as a script (`python km3flux.py ...`).
if __name__ == "__main__":
    main()