Coverage for /home/runner/work/nr-catalog-tools/nr-catalog-tools/nrcatalogtools/rit.py: 13%
406 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-01 05:18 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-01 05:18 +0000
1import collections
2import functools
3import glob
4import os
5import subprocess
7import pandas as pd
8import requests
9from tqdm import tqdm
11from nrcatalogtools import catalog, utils
class RITCatalog(catalog.CatalogBase):
    """Catalog of RIT simulations backed by a :class:`RITCatalogHelper`.

    The catalog dictionary maps simulation names to their metadata rows; the
    helper object knows how to locate, parse and download the metadata and
    waveform files those rows refer to.
    """

    def __init__(self, catalog=None, helper=None, verbosity=0, **kwargs) -> None:
        """Wrap an existing catalog dictionary, or build one via :meth:`load`.

        Parameters
        ----------
        catalog : dict, optional
            Catalog dictionary handed to the base class. When ``None`` the
            catalog is obtained from ``type(self).load(...)``.
        helper : RITCatalogHelper, optional
            Helper to attach. Ignored when ``catalog`` is ``None``; the helper
            created by :meth:`load` is used instead.
        verbosity : int
            Verbosity level; higher values print more diagnostics.
        **kwargs
            Forwarded to :meth:`load` when ``catalog`` is ``None``.
        """
        if catalog is not None:
            super().__init__(catalog)
        else:
            obj = type(self).load(verbosity=verbosity, **kwargs)
            super().__init__(obj._dict)
            helper = obj._helper
        self._helper = helper
        self._verbosity = verbosity
        self._dict["catalog_file_description"] = "scraped from website"
        self._dict["modified"] = {}
        self._dict["records"] = {}

    @classmethod
    @functools.lru_cache()
    def load(
        cls,
        download=None,
        num_sims_to_crawl=2000,
        acceptable_scraping_fraction=0.7,
        verbosity=0,
    ):
        """Build a catalog from the metadata table cached on disk.

        The cached table is read first. If it is empty or holds fewer than
        ``acceptable_scraping_fraction * num_sims_to_crawl`` rows, it is
        rebuilt from the per-simulation metadata files already on disk. If it
        is still too short, the metadata is fetched for the whole catalog, but
        only when ``download`` is truthy.

        Because of ``lru_cache`` all arguments must be hashable, and repeated
        calls with the same arguments return the same object.

        Raises
        ------
        ValueError
            If the cached catalog is too small and ``download`` is falsy.
        """
        helper = RITCatalogHelper(use_cache=True, verbosity=verbosity)
        min_num_sims = acceptable_scraping_fraction * num_sims_to_crawl

        if verbosity > 2:
            print("..Going to read catalog file from cache.")
        catalog_df = helper.read_metadata_df_from_disk()

        if len(catalog_df) == 0 or len(catalog_df) < min_num_sims:
            if verbosity > 2:
                if len(catalog_df) == 0:
                    print(
                        "..Catalog file not found on disk. "
                        "Going to refresh from cache."
                    )
                else:
                    print(
                        "..Catalog file on disk is likely incomplete with "
                        "only {} sims.\n...Going to refresh from cache.".format(
                            len(catalog_df)
                        )
                    )
            catalog_df = helper.refresh_metadata_df_on_disk(
                num_sims_to_crawl=num_sims_to_crawl
            )

        if len(catalog_df) < min_num_sims:
            if verbosity > 2:
                print(
                    "Refreshing catalog file from cache did not work.",
                    "...Falling back to downloading metadata for the full",
                    "...catalog. This will take some time.",
                )
            if not download:
                raise ValueError(
                    "Catalog not found in {}. Please set `download=True`".format(
                        helper.metadata_dir
                    )
                )
            catalog_df = helper.fetch_metadata_for_catalog(
                num_sims_to_crawl=num_sims_to_crawl
            )

        # One entry per table row, keyed by the simulation name.
        simulations = {
            row["simulation_name"]: row.to_dict() for _, row in catalog_df.iterrows()
        }
        return cls(
            catalog={"simulations": simulations}, helper=helper, verbosity=verbosity
        )

    @property
    @functools.lru_cache()
    def simulations_dataframe(self):
        """Metadata table indexed by simulation name.

        Every column whose label contains ``"Unnamed"`` is removed (the helper
        writes the DataFrame index to CSV, so each write/read round-trip can
        add one), and the cleaned table is stored back on the helper. The
        returned frame keeps a ``simulation_name`` column next to the index.
        """
        df = self._helper.metadata
        # Drop *all* stray index columns, not just the first one found.
        stray_columns = [col for col in df.columns if "Unnamed" in str(col)]
        if stray_columns:
            df = df.drop(columns=stray_columns)
        self._helper.metadata = df
        df = df.set_index("simulation_name")
        df.index.names = [None]
        df["simulation_name"] = df.index.to_list()
        return df

    @property
    @functools.lru_cache()
    def files(self):
        """Map of all file names to the corresponding file info"""
        file_infos = {}
        for _, row in self.simulations_dataframe.iterrows():
            location = row["waveform_data_location"]
            file_name = os.path.basename(location)
            try:
                # A single call avoids the exists()/getsize() race.
                file_size = os.path.getsize(location)
            except OSError:
                file_size = 0  # not downloaded (yet)
            file_infos[file_name] = {
                "checksum": None,
                "filename": file_name,
                "filesize": file_size,
                "download": row["waveform_data_link"],
            }

        # Files can only be recognised as copies of one another through a
        # checksum. Without one, every missing file (size 0) would collapse
        # onto the same "truepath", so such files simply point at themselves.
        copies = collections.defaultdict(list)
        for name, info in file_infos.items():
            if info["checksum"] is not None:
                copies[f"{info['checksum']}{info['filesize']}"].append(name)
        original_paths = {key: min(names) for key, names in copies.items()}

        for name, info in file_infos.items():
            if info["checksum"] is None:
                info["truepath"] = name
            else:
                info["truepath"] = original_paths[
                    f"{info['checksum']}{info['filesize']}"
                ]
        return file_infos

    def waveform_filename_from_simname(self, sim_name):
        """Return the waveform file name for ``sim_name`` (via the helper)."""
        return self._helper.waveform_filename_from_simname(sim_name)

    def waveform_filepath_from_simname(self, sim_name):
        """Return the recorded local waveform path for ``sim_name`` as a string.

        The path is returned even if nothing exists there; in that case a
        warning is printed when verbosity is above 2.
        """
        file_path = self.get_metadata(sim_name)["waveform_data_location"]
        if not os.path.exists(file_path) and self._verbosity > 2:
            print(
                f"WARNING: Could not resolve path for {sim_name}"
                f"..best calculated path = {file_path}"
            )
        return str(file_path)

    def waveform_url_from_simname(self, sim_name):
        """Return the URL built from the helper's waveform base URL."""
        return (
            self._helper.waveform_data_url
            + "/"
            + self.waveform_filename_from_simname(sim_name)
        )

    def metadata_filename_from_simname(self, sim_name):
        """Return the metadata file name for ``sim_name`` (via the helper)."""
        return self._helper.metadata_filename_from_simname(sim_name)

    def metadata_filepath_from_simname(self, sim_name):
        """Return the recorded local metadata path for ``sim_name`` as a string.

        Raises
        ------
        RuntimeError
            If nothing exists at the recorded path.
        """
        file_path = self.get_metadata(sim_name)["metadata_location"]
        if not os.path.exists(file_path):
            raise RuntimeError(
                f"Could not resolve path for {sim_name}"
                f"..best calculated path = {file_path}"
            )
        return str(file_path)

    def metadata_url_from_simname(self, sim_name):
        """Return the URL built from the helper's metadata base URL."""
        return (
            self._helper.metadata_url
            + "/"
            + self.metadata_filename_from_simname(sim_name)
        )

    def download_waveform_data(self, sim_name, use_cache=None):
        """Download the waveform file for ``sim_name`` (via the helper)."""
        return self._helper.download_waveform_data(sim_name, use_cache=use_cache)
class RITCatalogHelper:
    """Locate, parse, cache and download RIT metadata and waveform files.

    All locations, URLs and file-name templates are read from
    ``utils.rit_catalog_info``. Simulation names are expected to look like
    ``RIT:BBH:1109-n100-id1`` or ``RIT:eBBH:1109-n100-ecc``.
    """

    # Number of attempts made by ``metadata_from_link`` before giving up.
    _MAX_FETCH_ATTEMPTS = 100

    def __init__(self, catalog=None, use_cache=True, verbosity=0) -> None:
        """Read the configuration and create the local directories.

        ``catalog`` is accepted for interface compatibility and is not used.
        """
        self.verbosity = verbosity
        self.catalog_url = utils.rit_catalog_info["url"]
        self.use_cache = use_cache
        self.cache_dir = utils.rit_catalog_info["cache_dir"]

        self.num_of_sims = 0

        self.metadata = pd.DataFrame.from_dict({})
        self.metadata_url = utils.rit_catalog_info["metadata_url"]
        self.metadata_file_fmts = utils.rit_catalog_info["metadata_file_fmts"]
        self.metadata_dir = utils.rit_catalog_info["metadata_dir"]

        self.waveform_data = {}
        self.waveform_data_url = utils.rit_catalog_info["data_url"]
        self.waveform_file_fmts = utils.rit_catalog_info["waveform_file_fmts"]
        self.data_dir = utils.rit_catalog_info["data_dir"]
        self.waveform_data_dir = utils.rit_catalog_info["data_dir"]

        self.possible_res = utils.rit_catalog_info["possible_resolutions"]
        self.max_id_val = utils.rit_catalog_info["max_id_val"]

        for directory in (self.cache_dir, self.metadata_dir, self.waveform_data_dir):
            directory.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_simname(sim_name):
        """Split a simulation name into ``(idx, res, id_val)``.

        ``id_val`` is the full integer after ``id`` in the third dash-separated
        field (so ``id12`` gives 12). If that field is absent or not numeric,
        the last character of the name is tried as a single digit, and
        ``id_val`` is ``None`` when that fails too (e.g. ``...-ecc``).
        """
        txt = sim_name.split(":")[-1]
        fields = txt.split("-")
        idx = int(txt[:4])
        res = int(fields[1][1:])
        try:
            id_val = int(fields[2].split("_")[0][2:])
        except (IndexError, ValueError):
            try:
                id_val = int(txt[-1])
            except ValueError:
                id_val = None
        return idx, res, id_val

    def _first_cached_match(self, sim_tag):
        """Return the first cached metadata path starting with ``sim_tag``.

        Matches are sorted so the choice does not depend on directory order.
        Returns ``None`` when nothing matches.
        """
        prefix = str(self.metadata_dir / sim_tag)
        matches = sorted(glob.glob(glob.escape(prefix) + "*"))
        if not matches:
            if self.verbosity > 4:
                print("...found no files matching {}".format(prefix + "*"))
            return None
        return matches[0]

    def _single_row_frame(self, metadata_dict, file_name, file_location):
        """Build the one-row table describing a single simulation.

        Adds the simulation name plus the web links and local locations of
        the metadata and waveform files to a copy of ``metadata_dict``.
        """
        sim_name = self.simname_from_metadata_filename(file_name)
        wf_file_name = self.waveform_filename_from_simname(sim_name)
        row = dict(metadata_dict)
        row["simulation_name"] = sim_name
        row["metadata_link"] = self.metadata_url + "/" + file_name
        row["metadata_location"] = str(file_location)
        row["waveform_data_link"] = self.waveform_data_url + "/" + wf_file_name
        row["waveform_data_location"] = str(self.waveform_data_dir / wf_file_name)
        return pd.DataFrame([row])

    def _search_all_resolutions(self, idx, possible_res, max_id_in_name):
        """Try every resolution / ID combination for ``idx``.

        Returns the first non-empty table from ``fetch_metadata`` or ``None``.
        """
        for res in possible_res:
            for id_val in range(max_id_in_name):
                sim_data = self.fetch_metadata(idx, res, id_val)
                if len(sim_data) > 0:
                    if self.verbosity > 3:
                        print(
                            "...metadata txt file found for {}, {}, {}".format(
                                idx, res, id_val
                            )
                        )
                    return sim_data
                if self.verbosity > 3:
                    print(
                        "...metadata not found for {}, {}, {}".format(
                            idx, res, id_val
                        )
                    )
        return None

    # ------------------------------------------------------------------
    # Name handling
    # ------------------------------------------------------------------
    def sim_info_from_metadata_filename(self, file_name):
        """
        Input:
        ------
        file_name: name (not path) of metadata file as hosted on the web

        Output:
        -------
        - simulation number
        - resolution as indicated with an integer
        - ID value (only for non-eccentric simulations), -1 otherwise
        """
        fields = file_name.split("-")
        sim_number = int(fields[0][-4:])
        res_number = int(fields[1][1:])
        try:
            id_val = int(fields[2].split("_")[0][2:])
        except (IndexError, ValueError):
            id_val = -1
        return (sim_number, res_number, id_val)

    def simname_from_metadata_filename(self, filename):
        """
        Input:
        ------
        - filename: name (not path) of metadata file as hosted on the web

        Output:
        -------
        - Simulation Name Tag (Class uses this tag for internal indexing)
        """
        return filename.split("_Meta")[0]

    def metadata_filename_from_simname(self, sim_name):
        """
        We assume the sim names are either of the format:
        (1) RIT:eBBH:1109-n100-ecc
        (2) RIT:BBH:1109-n100-id1

        Raises ValueError if a name without "eBBH" carries no numeric ID.
        """
        idx, res, id_val = self._parse_simname(sim_name)
        if "eBBH" in sim_name:
            return self.metadata_file_fmts[1].format(idx, res)
        if id_val is None:
            raise ValueError(
                "Could not read an ID value from simulation name {}".format(sim_name)
            )
        return self.metadata_file_fmts[0].format(idx, res, id_val)

    def metadata_filename_from_cache(self, idx):
        """Return the full path of a cached metadata file for simulation ``idx``.

        Despite the name this is the path as produced by ``glob`` and not a
        bare file name. Returns ``None`` when no file is cached.
        """
        for sim_tag in self.simtags(idx):
            file_path = self._first_cached_match(sim_tag)
            if file_path is not None:
                return file_path
        return None

    def waveform_filename_from_simname(self, sim_name):
        """
        ExtrapStrain_RIT-BBH-0005-n100.h5 -->
        ExtrapStrain_RIT-eBBH-1843-n100.h5
        RIT:eBBH:1843-n100-ecc_Metadata.txt

        A name from which an ID value can be read uses the first metadata
        template; any other name uses the second one.
        """
        idx, res, id_val = self._parse_simname(sim_name)
        if id_val is not None:
            mf = self.metadata_file_fmts[0].format(idx, res, id_val)
        else:
            mf = self.metadata_file_fmts[1].format(idx, res)
        parts = mf.split(":")
        number, resolution = parts[2].split("_")[0].split("-")[:2]
        return "ExtrapStrain_{}-{}-{}-{}.h5".format(
            parts[0], parts[1], number, resolution
        )

    def waveform_filename_from_cache(self, idx):
        """Return the waveform file name matching the cached metadata of ``idx``.

        Raises FileNotFoundError when no metadata file is cached for ``idx``.
        """
        mf = self.metadata_filename_from_cache(idx)
        if mf is None:
            raise FileNotFoundError(
                "No cached metadata file found for simulation index {}".format(idx)
            )
        sim_name = self.simname_from_metadata_filename(os.path.basename(mf))
        return self.waveform_filename_from_simname(sim_name)

    def metadata_filenames(self, idx, res, id_val):
        """Return the candidate metadata file names, one per template."""
        return [
            self.metadata_file_fmts[0].format(idx, res, id_val),
            self.metadata_file_fmts[1].format(idx, res),
        ]

    def simname_from_cache(self, idx):
        """Return the simulation name of the cached metadata file for ``idx``.

        Returns an empty string when no file is cached.
        """
        file_path = self.metadata_filename_from_cache(idx)
        if file_path is None:
            return ""
        return self.simname_from_metadata_filename(os.path.basename(file_path))

    def simnames(self, idx, res, id_val):
        """Return the candidate simulation names, one per template."""
        return [
            self.simname_from_metadata_filename(mf)
            for mf in self.metadata_filenames(idx, res, id_val)
        ]

    def simtags(self, idx):
        """Return the name prefixes (text before the first dash) for ``idx``."""
        return [fmt.split("-")[0].format(idx) for fmt in self.metadata_file_fmts[:2]]

    # ------------------------------------------------------------------
    # Metadata parsing and retrieval
    # ------------------------------------------------------------------
    def parse_metadata_txt(self, raw):
        """Parse ``key = value`` lines of a metadata file.

        Only non-empty lines starting with a letter are considered. Lines
        without ``=`` are skipped when building the dictionary, and only the
        first ``=`` separates key from value. Values that parse as ``float``
        are stored as floats, everything else as stripped strings.

        Returns
        -------
        tuple
            ``(lines, opts)``: the considered lines and the parsed dictionary.
        """
        lines = [s for s in raw if len(s) > 0 and s[0].isalpha()]
        opts = {}
        for line in lines:
            key, sep, value = line.partition("=")
            if not sep:
                continue
            key, value = key.strip(), value.strip()
            try:
                opts[key] = float(value)
            except ValueError:
                opts[key] = value
        return lines, opts

    def metadata_from_link(self, link, save_to=None):
        """Fetch and parse the metadata file at ``link``.

        With ``save_to`` the file is downloaded there and parsed from disk.
        Otherwise it is requested directly, retrying on request errors.

        Raises
        ------
        RuntimeError
            If every attempt to request ``link`` failed.
        """
        if save_to is not None:
            utils.download_file(link, save_to, progress=True)
            return self.metadata_from_file(save_to)

        requests.packages.urllib3.disable_warnings()
        last_error = None
        for _ in range(self._MAX_FETCH_ATTEMPTS):
            try:
                # NOTE(review): certificate verification is disabled here, as
                # it is for the wget call in download_waveform_data.
                response = requests.get(link, verify=False)
            except requests.exceptions.RequestException as err:
                last_error = err
                continue
            return self.parse_metadata_txt(response.content.decode().split("\n"))
        raise RuntimeError(
            "Could not fetch {} after {} attempts".format(
                link, self._MAX_FETCH_ATTEMPTS
            )
        ) from last_error

    def metadata_from_file(self, file_path):
        """Parse the metadata file at ``file_path``."""
        with open(file_path, "r") as f:
            lines = f.readlines()
        return self.parse_metadata_txt(lines)

    def metadata_from_cache(self, idx):
        """Return a one-row table for ``idx`` built from cached metadata.

        Returns an empty DataFrame when no cached file yields any metadata.
        The waveform link and location are both derived from the name of the
        metadata file that was actually parsed.
        """
        for sim_tag in self.simtags(idx):
            file_path = self._first_cached_match(sim_tag)
            if file_path is None:
                continue
            _, metadata_dict = self.metadata_from_file(file_path)
            if metadata_dict:
                return self._single_row_frame(
                    metadata_dict, os.path.basename(file_path), file_path
                )
        return pd.DataFrame({})

    def fetch_metadata(self, idx, res, id_val=-1):
        """Return a one-row table for one simulation / resolution / ID.

        Each candidate file name is looked up in the local cache first (when
        ``use_cache`` is set) and on the web otherwise. The table has the same
        columns as the one from ``metadata_from_cache``. An empty DataFrame is
        returned when nothing is found.
        """
        for file_name in self.metadata_filenames(idx, res, id_val):
            if self.verbosity > 2:
                print("...beginning search for {}".format(file_name))
            file_path_web = self.metadata_url + "/" + file_name
            mf = self.metadata_dir / file_name
            metadata_dict = {}

            if self.use_cache and os.path.exists(mf) and os.path.getsize(mf) > 0:
                if self.verbosity > 2:
                    print("...reading from cache: {}".format(str(mf)))
                _, metadata_dict = self.metadata_from_file(mf)

            if not metadata_dict:
                if utils.url_exists(file_path_web):
                    if self.verbosity > 2:
                        print("...found {}".format(file_path_web))
                    _, metadata_dict = self.metadata_from_link(
                        file_path_web, save_to=mf
                    )
                elif self.verbosity > 3:
                    print("...tried and failed to find {}".format(file_path_web))

            if metadata_dict:
                return self._single_row_frame(metadata_dict, file_name, mf)

        return pd.DataFrame.from_dict({})

    def fetch_metadata_for_catalog(
        self, num_sims_to_crawl=2000, possible_res=None, max_id_in_name=-1
    ):
        """
        We crawl the webdirectory where RIT metadata usually lives,
        and try to read metadata for as many simulations as we can

        For every index the per-simulation file on disk is tried first, then
        the table loaded from ``metadata.csv``, then the web. Rows are
        de-duplicated by simulation name, so a simulation already present in
        the loaded table is not added a second time.
        """
        if possible_res is None or len(possible_res) == 0:
            possible_res = self.possible_res
        if max_id_in_name <= 0:
            max_id_in_name = self.max_id_val

        existing = pd.DataFrame({})
        if self.use_cache:
            metadata_df_fpath = self.metadata_dir / "metadata.csv"
            if (
                os.path.exists(metadata_df_fpath)
                and os.path.getsize(metadata_df_fpath) > 0
            ):
                if self.verbosity > 2:
                    print("Opening file {}".format(metadata_df_fpath))
                self.metadata = pd.read_csv(metadata_df_fpath)
                if len(self.metadata) >= (num_sims_to_crawl - 1):
                    # NOTE(review): this yields num_sims_to_crawl - 1 rows;
                    # kept as found - confirm the "- 1" is intended.
                    return self.metadata.iloc[: num_sims_to_crawl - 1]
                existing = self.metadata
                if self.verbosity > 2:
                    print("Found metadata for {} sims".format(len(existing)))

        known_names = []
        if "simulation_name" in existing.columns:
            known_names = [str(name) for name in existing["simulation_name"]]

        # Collect the pieces and concatenate once at the end.
        frames = [existing] if len(existing) > 0 else []

        for idx in tqdm(range(1, 1 + num_sims_to_crawl)):
            possible_sim_tags = self.simtags(idx)
            sim_data = None
            already_known = False

            if self.verbosity > 3:
                print("\nHunting for sim with idx: {}".format(idx))

            # First, check if metadata present as file on disk
            if self.use_cache:
                if self.verbosity > 3:
                    print("checking for metadata file on disk")
                cached = self.metadata_from_cache(idx)
                if len(cached) > 0:
                    sim_data = cached
                    if self.verbosity > 3:
                        print("...metadata found on disk for {}".format(idx))

            # Second, check if metadata present already in DataFrame
            if sim_data is None and known_names:
                if self.verbosity > 1:
                    print("Checking existing dataframe")
                match = next(
                    (
                        name
                        for name in known_names
                        if any(tag in name for tag in possible_sim_tags)
                    ),
                    None,
                )
                if match is not None:
                    already_known = True
                    f_idx, res, id_val = self.sim_info_from_metadata_filename(match)
                    assert f_idx == idx, (
                        "Index found for sim from metadata is not"
                        " the same as we were searching for ({} vs {}).".format(
                            f_idx, idx
                        )
                    )
                    if self.verbosity > 3:
                        print(
                            "...metadata found in DF for {}, {}, {}".format(
                                idx, res, id_val
                            )
                        )

            # If not already present, fetch metadata the hard way
            if sim_data is None and not already_known:
                sim_data = self._search_all_resolutions(
                    idx, possible_res, max_id_in_name
                )

            if sim_data is not None:
                frames.append(sim_data)
            elif not already_known and self.verbosity > 3:
                print("...metadata for {} NOT FOUND.".format(possible_sim_tags))

        sims = pd.concat(frames) if frames else pd.DataFrame({})
        if "simulation_name" in sims.columns:
            # A simulation may be both in the loaded table and on disk.
            sims = sims.drop_duplicates(subset="simulation_name", keep="last")
        sims = sims.reset_index(drop=True)

        self.metadata = sims
        if self.use_cache:
            self.write_metadata_df_to_disk()

        self.num_of_sims = len(sims)
        return self.metadata

    def write_metadata_df_to_disk(self):
        """Write ``self.metadata`` to ``metadata.csv`` in the metadata directory.

        If writing fails, the index is reset and the write is retried from
        the start of the file.
        """
        metadata_df_fpath = self.metadata_dir / "metadata.csv"
        with open(metadata_df_fpath, "w+") as f:
            try:
                self.metadata.to_csv(f)
            except Exception:
                # Discard whatever was written before the failure.
                f.seek(0)
                f.truncate()
                self.metadata.reset_index(drop=True, inplace=True)
                self.metadata.to_csv(f)

    def refresh_metadata_df_on_disk(self, num_sims_to_crawl=2000):
        """Rebuild ``metadata.csv`` from the cached per-simulation files.

        Indices 1..``num_sims_to_crawl`` are looked up on disk only. The
        resulting table is written to disk (even when empty), stored in
        ``self.metadata`` and returned.
        """
        frames = []
        for idx in tqdm(range(1, 1 + num_sims_to_crawl)):
            sim_data = self.metadata_from_cache(idx)
            if len(sim_data) > 0:
                frames.append(sim_data)
        if frames:
            sims = pd.concat(frames, ignore_index=True)
        else:
            sims = pd.DataFrame({})
        metadata_df_fpath = self.metadata_dir / "metadata.csv"
        with open(metadata_df_fpath, "w") as f:
            sims.to_csv(f)
        self.metadata = sims  # set this member
        return self.metadata

    def read_metadata_df_from_disk(self):
        """Load ``metadata.csv`` into ``self.metadata`` and return it.

        An empty DataFrame is used when the file is missing or empty.
        """
        metadata_df_fpath = self.metadata_dir / "metadata.csv"
        if os.path.exists(metadata_df_fpath) and os.path.getsize(metadata_df_fpath) > 0:
            self.metadata = pd.read_csv(metadata_df_fpath)
        else:
            self.metadata = pd.DataFrame([])
        return self.metadata

    # ------------------------------------------------------------------
    # Waveform data
    # ------------------------------------------------------------------
    def download_waveform_data(self, sim_name, use_cache=None):
        """
        Possible file formats:
        (1) https://ccrgpages.rit.edu/~RITCatalog/Data/ExtrapStrain_RIT-BBH-0193-n100.h5
        (2) https://ccrgpages.rit.edu/~RITCatalog/Data/ExtrapStrain_RIT-eBBH-1911-n100.h5

        A non-empty local file is never downloaded again; ``use_cache`` only
        controls whether that is reported. A download that ends with a
        non-zero wget exit status has its partial file removed, so it is not
        mistaken for a complete one later.
        """
        if use_cache is None:
            use_cache = self.use_cache
        file_name = self.waveform_filename_from_simname(sim_name)
        file_path_web = self.waveform_data_url + "/" + file_name
        local_file_path = self.waveform_data_dir / file_name

        if os.path.exists(local_file_path) and os.path.getsize(local_file_path) > 0:
            if use_cache and self.verbosity > 2:
                print("...can read from cache: {}".format(str(local_file_path)))
            return

        if self.verbosity > 2:
            print("...writing to cache: {}".format(str(local_file_path)))
        if not utils.url_exists(file_path_web):
            if self.verbosity > 2:
                print("... ... but couldnt find link: {}".format(str(file_path_web)))
            return

        if self.verbosity > 2:
            print("...downloading {}".format(file_path_web))
        status = subprocess.call(
            [
                "wget",
                "--no-check-certificate",
                str(file_path_web),
                "-O",
                str(local_file_path),
            ]
        )
        if status != 0:
            try:
                os.remove(local_file_path)
            except OSError:
                pass  # nothing was written
            if self.verbosity > 0:
                print(
                    "...download of {} failed (wget exit status {})".format(
                        file_path_web, status
                    )
                )

    def fetch_waveform_data_from_cache(self, idx):
        """Not implemented."""
        raise NotImplementedError()

    def download_waveform_data_for_catalog(
        self, num_sims_to_crawl=100, possible_res=None, max_id_in_name=-1, use_cache=None
    ):
        """
        We crawl the webdirectory where RIT waveform data usually lives,
        and try to read waveform data for as many simulations as we can

        ``possible_res`` and ``max_id_in_name`` are accepted for interface
        compatibility and do not influence the result. Returns a dictionary
        mapping each processed simulation name to its local waveform path.
        """
        if use_cache is None:
            use_cache = self.use_cache

        # Compare the number of metadata text files with the number of lines
        # in the table on disk to decide whether the table is stale.
        metadata_dir = str(self.metadata_dir)
        num_metadata_txt_files = len(
            glob.glob(os.path.join(glob.escape(metadata_dir), "*.txt"))
        )
        try:
            with open(os.path.join(metadata_dir, "metadata.csv"), "r") as f:
                num_metadata_df = sum(1 for _ in f)
        except OSError:
            num_metadata_df = 0  # forces the refresh below

        # The table has one header line in addition to its rows.
        if num_metadata_df - 1 < num_metadata_txt_files:
            metadata = self.refresh_metadata_df_on_disk()
        else:
            metadata = self.read_metadata_df_from_disk()

        sims = {}
        if "simulation_name" not in metadata.columns:
            return sims

        for idx, sim_name in tqdm(enumerate(metadata["simulation_name"])):
            if idx + 1 > num_sims_to_crawl:
                break
            file_name = self.waveform_filename_from_simname(sim_name)
            self.download_waveform_data(sim_name, use_cache=use_cache)
            sims[sim_name] = self.waveform_data_dir / file_name

        return sims