diff --git a/tools/import_validation/Validations.md b/tools/import_validation/Validations.md index 4efebb3a55..d46ece74fc 100644 --- a/tools/import_validation/Validations.md +++ b/tools/import_validation/Validations.md @@ -72,6 +72,8 @@ To generate goldens for the summary_report.csv to verify that all the expected StatVars are generated with the corresponding number of places and dates, run the following: +This will compare the golden files using summary_report.csv as the default input: + ```shell python3 validator_goldens.py \ --validate_goldens_input=summary_report.csv \ diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index f1364518e6..964ed98285 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -41,6 +41,8 @@ class ValidationRunner: def __init__(self, validation_config_path: str, differ_output: str, stats_summary: str, lint_report: str, validation_output: str): + self.validation_config_path = validation_config_path + self.stats_summary = stats_summary self.config = ValidationConfig(validation_config_path) self.validation_output = validation_output self.validator = Validator() @@ -212,6 +214,47 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: if output_dir: rule_params.setdefault('output_path', output_dir) + # Resolve paths relative to the directory of the validation config. + if validator_name == 'GOLDENS_CHECK': + config_dir = os.path.dirname( + os.path.abspath(self.validation_config_path)) + # We walk up to find where the golden_data folder is situated. + curr = config_dir + while curr and curr != os.path.dirname(curr): + if os.path.exists(os.path.join(curr, 'golden_data')): + config_dir = curr + break + curr = os.path.dirname(curr) + + print( + f"DEBUG: Found GOLDENS_CHECK rule: '{rule.get('rule_id')}'" + ) + print( + f"DEBUG: Config directory resolved to: '{config_dir}'") + for path_key in list(rule_params.keys()): + # Check any key in rule_params that equals 'golden_files' or 'input_files' or ends with '_file' or '_files' + if path_key in ( + 'golden_files', + 'input_files') or path_key.endswith( + '_file') or path_key.endswith('_files'): + val = rule_params[path_key] + print( + f"DEBUG: Before resolve '{path_key}': '{val}'") + if isinstance(val, str): + if val and not os.path.isabs(val): + rule_params[path_key] = os.path.join( + config_dir, val) + elif isinstance(val, list): + rule_params[path_key] = [ + os.path.join(config_dir, item) + if isinstance(item, str) and item and + not os.path.isabs(item) else item + for item in val + ] + print( + f"DEBUG: After resolve '{path_key}': '{rule_params[path_key]}'" + ) + if validator_name == 'SQL_VALIDATOR': result = validation_func(self.data_sources['stats'], self.data_sources['differ'], diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 7b19b783fe..82c1781ad9 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -298,7 +298,13 @@ def load_nodes_from_file(files: str) -> dict: file_nodes = file_util.file_load_csv_dict(input_file, key_index=True) for node in file_nodes.values(): - nodes[len(nodes)] = node + # Clean up "dcid:" prefixes from values (column headers are kept as is) + clean_node = {} + for k, v in node.items(): + clean_val = v.removeprefix("dcid:") if isinstance( + v, str) else v + clean_node[k] = clean_val + nodes[len(nodes)] = clean_node else: # For MCF or JSON, we assume nodes are already keyed by DCID. file_nodes = mcf_file_util.load_mcf_nodes(input_file) diff --git a/util/file_util.py b/util/file_util.py index c255bc365a..0371a67adc 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -619,7 +619,7 @@ def file_write_csv_dict(py_dict: dict, filename: str, columns: list = None, key_column_name: str = 'key') -> list: - """Returns the filename after writing py_dict with a csv row per item. + """Returns the list of columns after writing py_dict with a csv row per item. Each dictionary items is written as a row in the CSV file. @@ -674,9 +674,11 @@ def file_write_csv_dict(py_dict: dict, if col not in columns: columns.append(col) if len(columns) == 1: - # Value is not a dict. Write it as a column name value. - value_column_name = 'value' - columns.append(value_column_name) + # Check if values are dicts. If they are, it's not a primitive value. + if not any(isinstance(value, dict) for value in py_dict.values()): + # Value is not a dict. Write it as a column name value. + value_column_name = 'value' + columns.append(value_column_name) # Use the first column for the key. if key_column_name == '': key_column_name = columns[0]