Coverage for /home/runner/work/nr-catalog-tools/nr-catalog-tools/nrcatalogtools/rit.py: 13%

406 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-01 05:18 +0000

1import collections 

2import functools 

3import glob 

4import os 

5import subprocess 

6 

7import pandas as pd 

8import requests 

9from tqdm import tqdm 

10 

11from nrcatalogtools import catalog, utils 

12 

13 

class RITCatalog(catalog.CatalogBase):
    """Catalog of RIT simulations backed by a ``RITCatalogHelper``.

    The catalog dictionary is either supplied directly or assembled by
    :meth:`load` from the metadata table that the helper keeps on disk.
    """

    def __init__(self, catalog=None, helper=None, verbosity=0, **kwargs) -> None:
        """Create the catalog.

        Parameters
        ----------
        catalog : dict or None
            Catalog dictionary handed to the base class. When ``None`` the
            catalog is obtained from :meth:`load` (``kwargs`` are forwarded)
            and the ``helper`` argument is replaced by the loaded helper.
        helper : RITCatalogHelper or None
            Helper used for file-name / URL lookups and downloads.
        verbosity : int
            Higher values print more diagnostics.
        """
        if catalog is not None:
            super().__init__(catalog)
        else:
            loaded = type(self).load(verbosity=verbosity, **kwargs)
            super().__init__(loaded._dict)
            helper = loaded._helper
        self._helper = helper
        self._verbosity = verbosity
        self._dict["catalog_file_description"] = "scraped from website"
        self._dict["modified"] = {}
        self._dict["records"] = {}

    @classmethod
    @functools.lru_cache()
    def load(
        cls,
        download=None,
        num_sims_to_crawl=2000,
        acceptable_scraping_fraction=0.7,
        verbosity=0,
    ):
        """Build a catalog from the helper's metadata table.

        The table is read from disk; if it is empty or holds fewer than
        ``acceptable_scraping_fraction * num_sims_to_crawl`` rows it is
        rebuilt via ``refresh_metadata_df_on_disk``. If it is still too
        small, the helper's ``fetch_metadata_for_catalog`` is used, but
        only when ``download`` is truthy.

        Raises
        ------
        ValueError
            If the table is still too small and ``download`` is falsy.

        Results are memoised per argument combination by ``lru_cache``.
        """
        helper = RITCatalogHelper(use_cache=True, verbosity=verbosity)
        min_acceptable = acceptable_scraping_fraction * num_sims_to_crawl

        if verbosity > 2:
            print("..Going to read catalog file from cache.")
        catalog_df = helper.read_metadata_df_from_disk()

        num_cached = len(catalog_df)
        if num_cached == 0 or num_cached < min_acceptable:
            if verbosity > 2:
                if num_cached == 0:
                    print(
                        "..Catalog file not found on disk. Going to refresh from cache."
                    )
                else:
                    # Written as joined literals so no source indentation
                    # leaks into the printed message.
                    print(
                        "..Catalog file on disk is likely incomplete with only {} sims.\n"
                        "...Going to refresh from cache.".format(num_cached)
                    )
            catalog_df = helper.refresh_metadata_df_on_disk(
                num_sims_to_crawl=num_sims_to_crawl
            )

        if len(catalog_df) < min_acceptable:
            if not download:
                raise ValueError(
                    "Catalog not found in {}. Please set `download=True`".format(
                        helper.metadata_dir
                    )
                )
            # Only announce the download when it is actually going to happen.
            if verbosity > 2:
                print(
                    "Refreshing catalog file from cache did not work.",
                    "...Falling back to downloading metadata for the full",
                    "...catalog. This will take some time.",
                )
            catalog_df = helper.fetch_metadata_for_catalog(
                num_sims_to_crawl=num_sims_to_crawl
            )

        # One entry per table row, keyed by simulation name.
        simulations = {
            row["simulation_name"]: row.to_dict() for _, row in catalog_df.iterrows()
        }
        catalog_dict = {"simulations": simulations}
        return cls(catalog=catalog_dict, helper=helper, verbosity=verbosity)

    @property
    @functools.lru_cache()
    def simulations_dataframe(self):
        """Helper metadata as a DataFrame indexed by simulation name.

        The first column whose name contains ``"Unnamed"`` is dropped, and
        the trimmed frame is stored back on the helper. The returned frame
        keeps ``simulation_name`` both as its (unnamed) index and as a
        regular column.
        """
        df = self._helper.metadata
        for col_name in list(df.columns):
            if "Unnamed" in col_name:
                df = df.drop(columns=[col_name])
                break
        self._helper.metadata = df
        df = df.set_index("simulation_name")
        df.index.names = [None]
        df["simulation_name"] = df.index.to_list()
        return df

    @property
    @functools.lru_cache()
    def files(self):
        """Map of all file names to the corresponding file info"""
        file_infos = {}
        for _, row in self.simulations_dataframe.iterrows():
            location = row["waveform_data_location"]
            file_name = os.path.basename(location)
            # Files that are not on disk are reported with size 0.
            file_size = os.path.getsize(location) if os.path.exists(location) else 0
            file_infos[file_name] = {
                "checksum": None,
                "filename": file_name,
                "filesize": file_size,
                "download": row["waveform_data_link"],
            }

        def identity(path_str, info):
            # Without a checksum, size alone cannot establish that two files
            # are the same (every missing file has size 0), so such a file is
            # only ever identical to itself.
            if info["checksum"] is None:
                return path_str
            return f"{info['checksum']}{info['filesize']}"

        unique_files = collections.defaultdict(list)
        for path_str, info in file_infos.items():
            unique_files[identity(path_str, info)].append(path_str)

        original_paths = {key: min(paths) for key, paths in unique_files.items()}

        for path_str, info in file_infos.items():
            info["truepath"] = original_paths[identity(path_str, info)]

        return file_infos

    def waveform_filename_from_simname(self, sim_name):
        """Return the waveform file name for ``sim_name`` (delegates to the helper)."""
        return self._helper.waveform_filename_from_simname(sim_name)

    def waveform_filepath_from_simname(self, sim_name):
        """Return the ``waveform_data_location`` recorded for ``sim_name``.

        A missing file is not an error; it only prints a warning when
        verbosity is above 2.
        """
        file_path = self.get_metadata(sim_name)["waveform_data_location"]
        if not os.path.exists(file_path):
            if self._verbosity > 2:
                print(
                    f"WARNING: Could not resolve path for {sim_name}"
                    f"..best calculated path = {file_path}"
                )
        return str(file_path)

    def waveform_url_from_simname(self, sim_name):
        """Return the helper's waveform base URL joined with the waveform file name."""
        return (
            self._helper.waveform_data_url
            + "/"
            + self.waveform_filename_from_simname(sim_name)
        )

    def metadata_filename_from_simname(self, sim_name):
        """Return the metadata file name for ``sim_name`` (delegates to the helper)."""
        return self._helper.metadata_filename_from_simname(sim_name)

    def metadata_filepath_from_simname(self, sim_name):
        """Return the ``metadata_location`` recorded for ``sim_name``.

        Raises
        ------
        RuntimeError
            If that path does not exist on disk.
        """
        file_path = self.get_metadata(sim_name)["metadata_location"]
        if not os.path.exists(file_path):
            raise RuntimeError(
                f"Could not resolve path for {sim_name}"
                f"..best calculated path = {file_path}"
            )
        return str(file_path)

    def metadata_url_from_simname(self, sim_name):
        """Return the helper's metadata base URL joined with the metadata file name."""
        return (
            self._helper.metadata_url
            + "/"
            + self.metadata_filename_from_simname(sim_name)
        )

    def download_waveform_data(self, sim_name, use_cache=None):
        """Download the waveform file for ``sim_name`` (delegates to the helper)."""
        return self._helper.download_waveform_data(sim_name, use_cache=use_cache)

173 

174 

class RITCatalogHelper(object):
    """File-name, metadata and download utilities for the RIT catalog.

    All locations (URLs, cache directories, file-name formats) are read
    from ``utils.rit_catalog_info``. The simulation metadata table is kept
    in ``self.metadata`` and mirrored to ``metadata.csv`` inside
    ``self.metadata_dir``.
    """

    def __init__(self, catalog=None, use_cache=True, verbosity=0) -> None:
        """Read settings from ``utils.rit_catalog_info`` and create the cache dirs.

        ``catalog`` is accepted but not used here.
        """
        self.verbosity = verbosity
        self.catalog_url = utils.rit_catalog_info["url"]
        self.use_cache = use_cache
        self.cache_dir = utils.rit_catalog_info["cache_dir"]

        self.num_of_sims = 0

        self.metadata = pd.DataFrame.from_dict({})
        self.metadata_url = utils.rit_catalog_info["metadata_url"]
        self.metadata_file_fmts = utils.rit_catalog_info["metadata_file_fmts"]
        self.metadata_dir = utils.rit_catalog_info["metadata_dir"]

        self.waveform_data = {}
        self.waveform_data_url = utils.rit_catalog_info["data_url"]
        self.waveform_file_fmts = utils.rit_catalog_info["waveform_file_fmts"]
        self.data_dir = utils.rit_catalog_info["data_dir"]
        self.waveform_data_dir = utils.rit_catalog_info["data_dir"]

        self.possible_res = utils.rit_catalog_info["possible_resolutions"]
        self.max_id_val = utils.rit_catalog_info["max_id_val"]

        internal_dirs = [self.cache_dir, self.metadata_dir, self.waveform_data_dir]
        for d in internal_dirs:
            d.mkdir(parents=True, exist_ok=True)

    def sim_info_from_metadata_filename(self, file_name):
        """
        Input:
        ------
        file_name: name (not path) of metadata file as hosted on the web

        Output:
        -------
        - simulation number
        - resolution as indicated with an integer
        - ID value (only for non-eccentric simulations), -1 when the
          third dash-separated field is missing or not of the form idN
        """
        fields = file_name.split("-")
        sim_number = int(fields[0][-4:])
        res_number = int(fields[1][1:])
        try:
            id_val = int(fields[2].split("_")[0][2:])
        except (IndexError, ValueError):
            id_val = -1
        return (sim_number, res_number, id_val)

    def simname_from_metadata_filename(self, filename):
        """
        Input:
        ------
        - filename: name (not path) of metadata file as hosted on the web

        Output:
        -------
        - Simulation Name Tag (Class uses this tag for internal indexing)
        """
        return filename.split("_Meta")[0]

    def metadata_filename_from_simname(self, sim_name):
        """
        We assume the sim names are either of the format:
        (1) RIT:eBBH:1109-n100-ecc
        (2) RIT:BBH:1109-n100-id1
        """
        txt = sim_name.split(":")[-1]
        idx = int(txt[:4])
        fields = txt.split("-")
        res = int(fields[1][1:])
        if "eBBH" in sim_name:
            return self.metadata_file_fmts[1].format(idx, res)
        # Quasicircular sim: read the whole number after "id" so that
        # multi-digit IDs survive; fall back to the last character for
        # names that do not carry an "idN" field.
        if len(fields) > 2 and fields[2].startswith("id"):
            id_val = int(fields[2][2:])
        else:
            id_val = int(txt[-1])
        return self.metadata_file_fmts[0].format(idx, res, id_val)

    def _cached_metadata_paths(self, idx):
        """Yield, per sim tag of ``idx``, the first glob match in ``metadata_dir``."""
        for sim_tag in self.simtags(idx):
            pattern = str(self.metadata_dir / sim_tag) + "*"
            poss_files = glob.glob(pattern)
            if len(poss_files) == 0:
                if self.verbosity > 4:
                    print("...found no files matching {}".format(pattern))
                continue
            yield poss_files[0]  # glob gives full paths

    def metadata_filename_from_cache(self, idx):
        """Return the path of a cached metadata file for ``idx``, or ``None``.

        Note that the value is what ``glob`` returned, i.e. a path rather
        than a bare file name.
        """
        return next(self._cached_metadata_paths(idx), None)

    def waveform_filename_from_simname(self, sim_name):
        """
        ExtrapStrain_RIT-BBH-0005-n100.h5 -->
        ExtrapStrain_RIT-eBBH-1843-n100.h5
        RIT:eBBH:1843-n100-ecc_Metadata.txt
        """
        txt = sim_name.split(":")[-1]
        idx = int(txt[:4])
        res = int(txt.split("-")[1][1:])
        try:
            # If this works, its a quasicircular sim
            id_val = int(txt[-1])
            mf = self.metadata_file_fmts[0].format(idx, res, id_val)
        except (IndexError, ValueError):
            mf = self.metadata_file_fmts[1].format(idx, res)
        parts = mf.split(":")
        tag_fields = parts[2].split("_")[0].split("-")
        return (
            "ExtrapStrain_"
            + "-".join([parts[0], parts[1], tag_fields[0], tag_fields[1]])
            + ".h5"
        )

    def waveform_filename_from_cache(self, idx):
        """Return the waveform file name matching the cached metadata of ``idx``.

        Raises
        ------
        FileNotFoundError
            If no metadata file for ``idx`` is present in the cache.
        """
        mf = self.metadata_filename_from_cache(idx)
        if mf is None:
            raise FileNotFoundError(
                "No cached metadata file found for simulation index {}".format(idx)
            )
        sim_name = self.simname_from_metadata_filename(mf)
        return self.waveform_filename_from_simname(sim_name)

    def metadata_filenames(self, idx, res, id_val):
        """Return both candidate metadata file names (formats 0 and 1)."""
        return [
            self.metadata_file_fmts[0].format(idx, res, id_val),
            self.metadata_file_fmts[1].format(idx, res),
        ]

    def simname_from_cache(self, idx):
        """Return the simulation name of the cached metadata for ``idx``, or ``""``."""
        file_path = next(self._cached_metadata_paths(idx), None)
        if file_path is None:
            return ""
        return self.simname_from_metadata_filename(os.path.basename(file_path))

    def simnames(self, idx, res, id_val):
        """Return both candidate simulation names for ``idx``/``res``/``id_val``."""
        return [
            self.simname_from_metadata_filename(mf)
            for mf in self.metadata_filenames(idx, res, id_val)
        ]

    def simtags(self, idx):
        """Return the name prefixes (text before the first ``-``) for ``idx``."""
        return [
            self.metadata_file_fmts[0].split("-")[0].format(idx),
            self.metadata_file_fmts[1].split("-")[0].format(idx),
        ]

    def parse_metadata_txt(self, raw):
        """Parse ``key = value`` lines of a metadata file.

        Only lines starting with a letter are considered. Values that parse
        as ``float`` are stored as such, everything else as a stripped
        string. Lines without ``=`` are kept in the returned line list but
        contribute no entry.

        Returns
        -------
        (list of str, dict)
            The considered lines and the parsed key/value mapping.
        """
        lines = [s for s in raw if len(s) > 0 and s[0].isalpha()]
        opts = {}
        for line in lines:
            kv = line.split("=")
            if len(kv) < 2:
                # Not a key/value line; nothing to record.
                continue
            key, value = kv[0].strip(), kv[1].strip()
            try:
                opts[key] = float(value)
            except ValueError:
                opts[key] = str(value)
        return lines, opts

    def metadata_from_link(self, link, save_to=None):
        """Fetch and parse the metadata file at ``link``.

        With ``save_to`` the file is downloaded there via
        ``utils.download_file`` and parsed from disk; otherwise it is
        fetched in memory, retrying up to 100 times.

        Raises
        ------
        RuntimeError
            If every in-memory attempt fails.
        """
        if save_to is not None:
            utils.download_file(link, save_to, progress=True)
            return self.metadata_from_file(save_to)

        # NOTE(review): certificate verification is disabled here and the
        # resulting warnings are silenced.
        requests.packages.urllib3.disable_warnings()
        last_error = None
        for _ in range(100):
            try:
                response = requests.get(link, verify=False)
                break
            except requests.exceptions.RequestException as err:
                last_error = err
        else:
            raise RuntimeError(
                "Could not fetch {} after 100 attempts".format(link)
            ) from last_error
        return self.parse_metadata_txt(response.content.decode().split("\n"))

    def metadata_from_file(self, file_path):
        """Read ``file_path`` and parse it with :meth:`parse_metadata_txt`."""
        with open(file_path, "r") as f:
            lines = f.readlines()
        return self.parse_metadata_txt(lines)

    def metadata_from_cache(self, idx):
        """Return a one-row DataFrame for ``idx`` built from cached metadata.

        Besides the parsed fields the row carries ``simulation_name``,
        ``metadata_link``, ``metadata_location``, ``waveform_data_link``
        and ``waveform_data_location``. An empty DataFrame is returned when
        nothing usable is cached.
        """
        for file_path in self._cached_metadata_paths(idx):
            _, metadata_dict = self.metadata_from_file(file_path)
            if len(metadata_dict) == 0:
                continue
            file_name = os.path.basename(file_path)
            sim_name = self.simname_from_metadata_filename(file_name)
            wf_file_name = self.waveform_filename_from_simname(sim_name)
            metadata_dict["simulation_name"] = [sim_name]
            metadata_dict["metadata_link"] = [self.metadata_url + "/" + file_name]
            metadata_dict["metadata_location"] = [file_path]
            metadata_dict["waveform_data_link"] = [
                self.waveform_data_url + "/" + wf_file_name
            ]
            metadata_dict["waveform_data_location"] = [
                str(self.waveform_data_dir / wf_file_name)
            ]
            return pd.DataFrame.from_dict(metadata_dict)
        return pd.DataFrame({})

    def fetch_metadata(self, idx, res, id_val=-1):
        """Return a one-row DataFrame for one simulation, or an empty one.

        Both candidate file names are tried in turn: first the local copy
        (when ``use_cache`` is set and the file is non-empty), then the web
        (the file is saved into ``metadata_dir``). The row carries the same
        bookkeeping columns as :meth:`metadata_from_cache`.
        """
        metadata_dict = {}

        for file_name in self.metadata_filenames(idx, res, id_val):
            if self.verbosity > 2:
                print("...beginning search for {}".format(file_name))
            file_path_web = self.metadata_url + "/" + file_name
            mf = self.metadata_dir / file_name

            if self.use_cache:
                if os.path.exists(mf) and os.path.getsize(mf) > 0:
                    if self.verbosity > 2:
                        print("...reading from cache: {}".format(str(mf)))
                    _, metadata_dict = self.metadata_from_file(mf)

            if len(metadata_dict) == 0:
                if utils.url_exists(file_path_web):
                    if self.verbosity > 2:
                        print("...found {}".format(file_path_web))
                    _, metadata_dict = self.metadata_from_link(
                        file_path_web, save_to=mf
                    )
                else:
                    if self.verbosity > 3:
                        print("...tried and failed to find {}".format(file_path_web))

            if len(metadata_dict) > 0:
                sim_name = self.simname_from_metadata_filename(file_name)
                wf_file_name = self.waveform_filename_from_simname(sim_name)
                metadata_dict["simulation_name"] = [sim_name]
                metadata_dict["metadata_link"] = [file_path_web]
                # Stored as str, matching metadata_from_cache.
                metadata_dict["metadata_location"] = [str(mf)]
                metadata_dict["waveform_data_link"] = [
                    self.waveform_data_url + "/" + wf_file_name
                ]
                metadata_dict["waveform_data_location"] = [
                    str(self.waveform_data_dir / wf_file_name)
                ]
                break

        return pd.DataFrame.from_dict(metadata_dict)

    def _metadata_from_dataframe(self, known, idx, possible_sim_tags):
        """Return the row of ``known`` matching ``idx`` as a one-row frame, or None."""
        for _, row in known.iterrows():
            name = row["simulation_name"]
            if not any(sim_tag in name for sim_tag in possible_sim_tags):
                continue
            f_idx, res, id_val = self.sim_info_from_metadata_filename(name)
            assert f_idx == idx, (
                "Index found for sim from metadata is not"
                " the same as we were searching for ({} vs {}).".format(f_idx, idx)
            )
            if self.verbosity > 3:
                print("...metadata found in DF for {}, {}, {}".format(idx, res, id_val))
            return pd.DataFrame(row.to_dict(), index=[0])
        return None

    def _crawl_metadata(self, idx, possible_res, max_id_in_name):
        """Try every resolution / ID combination for ``idx``; return a frame or None."""
        for res in possible_res:
            for id_val in range(max_id_in_name):
                sim_data = self.fetch_metadata(idx, res, id_val)
                if len(sim_data) > 0:
                    if self.verbosity > 3:
                        print(
                            "...metadata txt file found for {}, {}, {}".format(
                                idx, res, id_val
                            )
                        )
                    # just need to find one resolution
                    return sim_data
                if self.verbosity > 3:
                    print(
                        "...metadata not found for {}, {}, {}".format(idx, res, id_val)
                    )
        return None

    def fetch_metadata_for_catalog(
        self, num_sims_to_crawl=2000, possible_res=None, max_id_in_name=-1
    ):
        """
        We crawl the webdirectory where RIT metadata usually lives,
        and try to read metadata for as many simulations as we can

        For each index in ``1..num_sims_to_crawl`` the metadata is looked up
        in the on-disk txt cache, then in the previously saved table, and
        finally fetched via :meth:`fetch_metadata`. The resulting table is
        stored in ``self.metadata`` (and written to disk when ``use_cache``
        is set) and returned. An empty ``possible_res`` and a non-positive
        ``max_id_in_name`` fall back to the configured defaults.
        """
        if not possible_res:
            possible_res = self.possible_res
        if max_id_in_name <= 0:
            max_id_in_name = self.max_id_val

        # Previously saved table; used for lookups only so that rows found
        # again below are not duplicated in the result.
        known = pd.DataFrame({})

        if self.use_cache:
            metadata_df_fpath = self.metadata_dir / "metadata.csv"
            if (
                os.path.exists(metadata_df_fpath)
                and os.path.getsize(metadata_df_fpath) > 0
            ):
                if self.verbosity > 2:
                    print("Opening file {}".format(metadata_df_fpath))
                self.metadata = pd.read_csv(metadata_df_fpath)
                if len(self.metadata) >= (num_sims_to_crawl - 1):
                    return self.metadata.iloc[: num_sims_to_crawl - 1]
                known = self.metadata
                if self.verbosity > 2:
                    print("Found metadata for {} sims".format(len(known)))

        frames = []
        for idx in tqdm(range(1, 1 + num_sims_to_crawl)):
            possible_sim_tags = self.simtags(idx)
            sim_data = None

            if self.verbosity > 3:
                print("\nHunting for sim with idx: {}".format(idx))

            # First, check if metadata present as file on disk
            if self.use_cache:
                if self.verbosity > 3:
                    print("checking for metadata file on disk")
                cached = self.metadata_from_cache(idx)
                if len(cached) > 0:
                    sim_data = cached
                    if self.verbosity > 3:
                        print("...metadata found on disk for {}".format(idx))

            # Second, check if metadata present already in DataFrame
            if sim_data is None and len(known) > 0:
                if self.verbosity > 1:
                    print("Checking existing dataframe")
                sim_data = self._metadata_from_dataframe(known, idx, possible_sim_tags)

            # If not already present, fetch metadata the hard way
            if sim_data is None:
                sim_data = self._crawl_metadata(idx, possible_res, max_id_in_name)

            if sim_data is not None:
                frames.append(sim_data)
            elif self.verbosity > 3:
                print("...metadata for {} NOT FOUND.".format(possible_sim_tags))

        # Single concatenation instead of one per simulation.
        sims = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame({})

        self.metadata = sims
        if self.use_cache:
            self.write_metadata_df_to_disk()

        self.num_of_sims = len(sims)
        return self.metadata

    def write_metadata_df_to_disk(self):
        """Write ``self.metadata`` to ``metadata.csv`` in ``metadata_dir``.

        If the first write fails, the index is reset and the write retried.
        """
        metadata_df_fpath = self.metadata_dir / "metadata.csv"
        with open(metadata_df_fpath, "w+") as f:
            try:
                self.metadata.to_csv(f)
            except Exception:
                # Discard any partial output before retrying.
                f.seek(0)
                f.truncate()
                self.metadata.reset_index(drop=True, inplace=True)
                self.metadata.to_csv(f)

    def refresh_metadata_df_on_disk(self, num_sims_to_crawl=2000):
        """Rebuild ``metadata.csv`` from the cached per-simulation txt files.

        The rebuilt table is stored in ``self.metadata`` and returned.
        """
        frames = []
        for idx in tqdm(range(1, 1 + num_sims_to_crawl)):
            sim_data = self.metadata_from_cache(idx)
            if len(sim_data) > 0:
                frames.append(sim_data)
        sims = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame({})
        metadata_df_fpath = self.metadata_dir / "metadata.csv"
        with open(metadata_df_fpath, "w") as f:
            sims.to_csv(f)
        self.metadata = sims  # set this member
        return self.metadata

    def read_metadata_df_from_disk(self):
        """Load ``metadata.csv`` into ``self.metadata`` (empty frame if absent/empty)."""
        metadata_df_fpath = self.metadata_dir / "metadata.csv"
        if os.path.exists(metadata_df_fpath) and os.path.getsize(metadata_df_fpath) > 0:
            self.metadata = pd.read_csv(metadata_df_fpath)
        else:
            self.metadata = pd.DataFrame([])
        return self.metadata

    def download_waveform_data(self, sim_name, use_cache=None):
        """
        Possible file formats:
        (1) https://ccrgpages.rit.edu/~RITCatalog/Data/ExtrapStrain_RIT-BBH-0193-n100.h5
        (2) https://ccrgpages.rit.edu/~RITCatalog/Data/ExtrapStrain_RIT-eBBH-1911-n100.h5

        A non-empty local copy is never downloaded again, whatever
        ``use_cache`` says; ``use_cache`` (default: ``self.use_cache``) only
        controls whether the cache hit is reported.
        """
        if use_cache is None:
            use_cache = self.use_cache
        file_name = self.waveform_filename_from_simname(sim_name)
        file_path_web = self.waveform_data_url + "/" + file_name
        local_file_path = self.waveform_data_dir / file_name

        if os.path.exists(local_file_path) and os.path.getsize(local_file_path) > 0:
            if use_cache and self.verbosity > 2:
                print("...can read from cache: {}".format(str(local_file_path)))
            return

        if self.verbosity > 2:
            print("...writing to cache: {}".format(str(local_file_path)))
        if utils.url_exists(file_path_web):
            if self.verbosity > 2:
                print("...downloading {}".format(file_path_web))
            # NOTE(review): certificate checking is disabled for this download.
            subprocess.call(
                [
                    "wget",
                    "--no-check-certificate",
                    str(file_path_web),
                    "-O",
                    str(local_file_path),
                ]
            )
        else:
            if self.verbosity > 2:
                print("... ... but couldnt find link: {}".format(str(file_path_web)))

    def fetch_waveform_data_from_cache(self, idx):
        """Not implemented."""
        raise NotImplementedError()

    def download_waveform_data_for_catalog(
        self, num_sims_to_crawl=100, possible_res=None, max_id_in_name=-1, use_cache=None
    ):
        """
        We crawl the webdirectory where RIT waveform data usually lives,
        and try to read waveform data for as many simulations as we can

        Returns a dict mapping each processed simulation name to its local
        waveform path. ``possible_res`` and ``max_id_in_name`` are
        normalised but not otherwise used by this method.
        """
        if not possible_res:
            possible_res = self.possible_res
        if max_id_in_name <= 0:
            max_id_in_name = self.max_id_val
        if use_cache is None:
            use_cache = self.use_cache

        # Count cached txt files and table lines directly instead of
        # piping a formatted path through a shell.
        num_metadata_txt_files = len(glob.glob(str(self.metadata_dir / "*.txt")))
        try:
            with open(self.metadata_dir / "metadata.csv", "rb") as f:
                num_metadata_df = sum(1 for _ in f)
        except OSError:
            # No table on disk: forces the refresh below.
            num_metadata_df = 0

        # The table has one header line; refresh when it lists fewer
        # simulations than there are txt files.
        if num_metadata_df - 1 < num_metadata_txt_files:
            metadata = self.refresh_metadata_df_on_disk()
        else:
            metadata = self.read_metadata_df_from_disk()
        sims = {}

        for idx, sim_name in tqdm(enumerate(metadata["simulation_name"])):
            if idx + 1 > num_sims_to_crawl:
                break
            file_name = self.waveform_filename_from_simname(sim_name)
            local_file_path = self.waveform_data_dir / file_name
            self.download_waveform_data(sim_name, use_cache=use_cache)
            sims[sim_name] = local_file_path

        return sims