From 81a632b74de84db2cb05de6ce75f4b43432a520c Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 8 Jun 2026 10:48:30 +0000 Subject: [PATCH 01/24] improve validation path resolution and sanitize CSV parsing --- tools/import_validation/runner.py | 71 +++++++++++ tools/import_validation/runner_test.py | 116 ++++++++++++++++++ tools/import_validation/validator_goldens.py | 39 +++++- .../validator_goldens_test.py | 19 +++ 4 files changed, 242 insertions(+), 3 deletions(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index f1364518e6..aca77b0233 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -14,6 +14,7 @@ """Module for the ValidationRunner class.""" import os +import logging from absl import app from absl import flags from absl import logging @@ -34,6 +35,30 @@ _FLAGS = flags.FLAGS +def _is_relative_local(path_val: str) -> bool: + """Checks if a path is a relative, local file path.""" + if not isinstance(path_val, str) or not path_val: + return False + return (not os.path.isabs(path_val) and not path_val.startswith('gs://') and + not path_val.startswith('http://') and + not path_val.startswith('https://')) + + +def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: + """Helper to find a base directory containing target_sub_path by walking up.""" + if not start_path: + return None + curr = os.path.abspath(start_path) + for _ in range(10): # limit to 10 levels up + if os.path.exists(os.path.join(curr, target_sub_path)): + return curr + parent = os.path.dirname(curr) + if parent == curr: + break + curr = parent + return None + + class ValidationRunner: """ Orchestrates the validation process based on the new schema. @@ -41,6 +66,8 @@ class ValidationRunner: def __init__(self, validation_config_path: str, differ_output: str, stats_summary: str, lint_report: str, validation_output: str): + self.validation_config_path = validation_config_path + self.stats_summary = stats_summary self.config = ValidationConfig(validation_config_path) self.validation_output = validation_output self.validator = Validator() @@ -212,6 +239,50 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: if output_dir: rule_params.setdefault('output_path', output_dir) + # Resolve paths relative to the directory of the validation config. + if validator_name == 'GOLDENS_CHECK': + config_dir = None + # Walk up from validation_config_path, self.stats_summary, or CWD to find where 'golden_data' lives + for start in [ + self.validation_config_path, self.stats_summary, + os.getcwd() + ]: + config_dir = _find_base_dir(start, 'golden_data') + if config_dir: + break + + if not config_dir: + config_dir = os.path.dirname( + os.path.abspath(self.validation_config_path)) + + print( + f"DEBUG: Found GOLDENS_CHECK rule: '{rule.get('rule_id')}'" + ) + print( + f"DEBUG: Config directory resolved to: '{config_dir}'") + for path_key in list(rule_params.keys()): + # Check any key in rule_params that equals 'golden_files' or 'input_files' or ends with '_file' or '_files' + if path_key in ( + 'golden_files', + 'input_files') or path_key.endswith( + '_file') or path_key.endswith('_files'): + val = rule_params[path_key] + print( + f"DEBUG: Before resolve '{path_key}': '{val}'") + if isinstance(val, str): + if _is_relative_local(val): + rule_params[path_key] = os.path.join( + config_dir, val) + elif isinstance(val, list): + rule_params[path_key] = [ + os.path.join(config_dir, item) + if _is_relative_local(item) else item + for item in val + ] + print( + f"DEBUG: After resolve '{path_key}': '{rule_params[path_key]}'" + ) + if validator_name == 'SQL_VALIDATOR': result = validation_func(self.data_sources['stats'], self.data_sources['differ'], diff --git a/tools/import_validation/runner_test.py b/tools/import_validation/runner_test.py index 76c5bc1d0b..049f5a9f60 100644 --- a/tools/import_validation/runner_test.py +++ b/tools/import_validation/runner_test.py @@ -350,3 +350,119 @@ def test_init_raises_error_if_required_file_is_missing(self): lint_report=self.report_path, validation_output=self.output_path) self.assertIn("'stats' data source", str(context.exception)) + + @patch('tools.import_validation.runner.Validator') + def test_runner_resolves_goldens_check_paths(self, MockValidator): + """Tests that GOLDENS_CHECK validator resolves relative paths correctly.""" + # 1. Setup the mock + mock_validator_instance = MockValidator.return_value + mock_validator_instance.validate_goldens.return_value = ValidationResult( + ValidationStatus.PASSED, 'GOLDENS_CHECK') + + # 2. Create the config file with relative paths + with open(self.config_path, 'w') as f: + json.dump( + { + 'rules': [{ + 'rule_id': 'check_goldens_output_csv', + 'validator': 'GOLDENS_CHECK', + 'scope': {}, + 'params': { + 'golden_files': 'golden_data/golden_file.csv', + 'input_files': 'output/input_file.csv' + } + }] + }, f) + pd.DataFrame({'StatVar': ['sv1']}).to_csv(self.stats_path, index=False) + + # 3. Setup a dummy golden_data folder inside the temp dir to trigger base dir resolution + golden_dir = os.path.join(self.test_dir.name, 'golden_data') + os.makedirs(golden_dir, exist_ok=True) + + runner = ValidationRunner(validation_config_path=self.config_path, + stats_summary=self.stats_path, + differ_output=self.differ_path, + lint_report=self.report_path, + validation_output=self.output_path) + runner.run_validations() + + # 4. Assert that validate_goldens was called with resolved paths + mock_validator_instance.validate_goldens.assert_called_once() + args, kwargs = mock_validator_instance.validate_goldens.call_args + rule_params = args[1] + expected_golden = os.path.join(self.test_dir.name, + 'golden_data/golden_file.csv') + expected_input = os.path.join(self.test_dir.name, + 'output/input_file.csv') + self.assertEqual(rule_params['golden_files'], expected_golden) + self.assertEqual(rule_params['input_files'], expected_input) + + @patch('tools.import_validation.runner.Validator') + def test_path_resolution_helpers_and_robustness(self, MockValidator): + """Tests that relative local path helpers work and robust path resolution handles ends-with keys.""" + from tools.import_validation.runner import _is_relative_local, _find_base_dir + + # Test helper functions + self.assertTrue(_is_relative_local("some/relative/path.csv")) + self.assertFalse(_is_relative_local("/absolute/path.csv")) + self.assertFalse(_is_relative_local("gs://bucket/path.csv")) + self.assertFalse(_is_relative_local("http://example.com/path.csv")) + self.assertFalse(_is_relative_local("https://example.com/path.csv")) + self.assertFalse(_is_relative_local(None)) + self.assertFalse(_is_relative_local(123)) + + # Setup mock validator + mock_validator_instance = MockValidator.return_value + mock_validator_instance.validate_goldens.return_value = ValidationResult( + ValidationStatus.PASSED, 'GOLDENS_CHECK') + + # Create config file with various parameter keys (like output_file) that contain paths + with open(self.config_path, 'w') as f: + json.dump( + { + 'rules': [{ + 'rule_id': 'check_goldens_output_csv', + 'validator': 'GOLDENS_CHECK', + 'scope': {}, + 'params': { + 'golden_files': + 'golden_data/golden_file.csv', + 'input_files': + 'output/input_file.csv', + 'output_file': + 'output/output_file.csv', + 'custom_files': + ['output/custom_file.csv', 'gs://skip/this.csv'] + } + }] + }, f) + pd.DataFrame({'StatVar': ['sv1']}).to_csv(self.stats_path, index=False) + + # Setup dummy golden_data folder + golden_dir = os.path.join(self.test_dir.name, 'golden_data') + os.makedirs(golden_dir, exist_ok=True) + + runner = ValidationRunner(validation_config_path=self.config_path, + stats_summary=self.stats_path, + differ_output=self.differ_path, + lint_report=self.report_path, + validation_output=self.output_path) + runner.run_validations() + + # Check path resolution for the parameters + args, kwargs = mock_validator_instance.validate_goldens.call_args + rule_params = args[1] + + self.assertEqual( + rule_params['golden_files'], + os.path.join(self.test_dir.name, + 'golden_data/golden_file.csv')) + self.assertEqual( + rule_params['input_files'], + os.path.join(self.test_dir.name, 'output/input_file.csv')) + self.assertEqual(rule_params['output_file'], + os.path.join(self.test_dir.name, 'output/output_file.csv')) + self.assertEqual(rule_params['custom_files'], [ + os.path.join(self.test_dir.name, 'output/custom_file.csv'), + 'gs://skip/this.csv' + ]) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 7b19b783fe..c6f3efe599 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -68,6 +68,7 @@ --generate_goldens=goldens_data/generated_goldens.csv """ +import csv import os import sys import tempfile @@ -297,15 +298,47 @@ def load_nodes_from_file(files: str) -> dict: # Nodes are keyed by their index in the combined loaded set. file_nodes = file_util.file_load_csv_dict(input_file, key_index=True) + # Check if any loaded node has None as a key, indicating an incorrect delimiter + # was auto-detected (e.g. splitting 'dcid:Earth' on colon ':') + has_delimiter_error = False for node in file_nodes.values(): - nodes[len(nodes)] = node + if None in node: + has_delimiter_error = True + break + + if has_delimiter_error: + import csv + with file_util.FileIO(input_file) as csvfile: + rawdata = csvfile.read() + if isinstance(rawdata, bytes): + encoding = file_util.file_get_encoding(input_file) + data_str = rawdata.decode(encoding) + else: + data_str = rawdata + reader = csv.DictReader(data_str.splitlines(), + delimiter=',') + file_nodes = {} + for row in reader: + file_nodes[len(file_nodes)] = dict(row) + + for node in file_nodes.values(): + # Clean up None/empty keys and strip whitespace from headers/keys and values to ensure robust parsing + cleaned_node = { + k.strip(): (v.strip() if isinstance(v, str) else v) + for k, v in node.items() + if k is not None and isinstance(k, str) and k.strip() != '' + } + nodes[len(nodes)] = cleaned_node else: # For MCF or JSON, we assume nodes are already keyed by DCID. file_nodes = mcf_file_util.load_mcf_nodes(input_file) for dcid, node in file_nodes.items(): - # Ensure the dcid is present in the node dictionary itself. + # Ensure the dcid is present in the node dictionary itself with a null-safe check. if 'dcid' not in node: - node['dcid'] = mcf_file_util.strip_namespace(dcid) + if dcid and isinstance(dcid, str): + node['dcid'] = mcf_file_util.strip_namespace(dcid) + else: + node['dcid'] = '' mcf_file_util.add_mcf_node(node, nodes) logging.info(f'Loaded {len(nodes)} nodes from {input_files}') diff --git a/tools/import_validation/validator_goldens_test.py b/tools/import_validation/validator_goldens_test.py index e486a8fe91..6fa4ab09ac 100644 --- a/tools/import_validation/validator_goldens_test.py +++ b/tools/import_validation/validator_goldens_test.py @@ -190,6 +190,25 @@ def test_validate_goldens(self, mock_file, mock_compare, mock_load): self.assertEqual(missing, []) mock_compare.assert_called_once() + def test_load_nodes_from_file_with_colon_values(self): + import tempfile + # Create a temporary CSV file with values containing colons and no commas + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', + delete=False) as f: + f.write("header1\n") + f.write("value:1\n") + f.write("value:2\n") + temp_name = f.name + + try: + nodes = validator_goldens.load_nodes_from_file(temp_name) + # The nodes should be parsed using comma as delimiter, which correctly preserves values with colons + self.assertEqual(len(nodes), 2) + self.assertEqual(nodes[0], {'header1': 'value:1'}) + self.assertEqual(nodes[1], {'header1': 'value:2'}) + finally: + os.remove(temp_name) + if __name__ == '__main__': unittest.main() From 903e0314c0eed3f9f3942ada3052dec8f554cae2 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 8 Jun 2026 11:05:28 +0000 Subject: [PATCH 02/24] fixed tests --- tools/import_validation/runner_test.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/import_validation/runner_test.py b/tools/import_validation/runner_test.py index 049f5a9f60..80e7508020 100644 --- a/tools/import_validation/runner_test.py +++ b/tools/import_validation/runner_test.py @@ -431,8 +431,9 @@ def test_path_resolution_helpers_and_robustness(self, MockValidator): 'output/input_file.csv', 'output_file': 'output/output_file.csv', - 'custom_files': - ['output/custom_file.csv', 'gs://skip/this.csv'] + 'custom_files': [ + 'output/custom_file.csv', 'gs://skip/this.csv' + ] } }] }, f) @@ -455,13 +456,13 @@ def test_path_resolution_helpers_and_robustness(self, MockValidator): self.assertEqual( rule_params['golden_files'], - os.path.join(self.test_dir.name, - 'golden_data/golden_file.csv')) + os.path.join(self.test_dir.name, 'golden_data/golden_file.csv')) self.assertEqual( rule_params['input_files'], os.path.join(self.test_dir.name, 'output/input_file.csv')) - self.assertEqual(rule_params['output_file'], - os.path.join(self.test_dir.name, 'output/output_file.csv')) + self.assertEqual( + rule_params['output_file'], + os.path.join(self.test_dir.name, 'output/output_file.csv')) self.assertEqual(rule_params['custom_files'], [ os.path.join(self.test_dir.name, 'output/custom_file.csv'), 'gs://skip/this.csv' From 20e1292b928ab911104a18ba0b256443d13c2097 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 8 Jun 2026 11:32:14 +0000 Subject: [PATCH 03/24] docs: add detailed docstrings for path resolution helpers in runner --- tools/import_validation/runner.py | 32 +++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index aca77b0233..0b33f7abb0 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -36,7 +36,19 @@ def _is_relative_local(path_val: str) -> bool: - """Checks if a path is a relative, local file path.""" + """Checks if a path is a relative, local file path. + + This function identifies path strings that represent local relative files + (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths or remote/cloud + URIs. It filters out non-strings, empty strings, absolute local paths, + Google Cloud Storage URIs (gs://), and web URLs (http://, https://). + + Args: + path_val: The file path string to evaluate. + + Returns: + True if the path represents a relative, local file path; False otherwise. + """ if not isinstance(path_val, str) or not path_val: return False return (not os.path.isabs(path_val) and not path_val.startswith('gs://') and @@ -45,7 +57,23 @@ def _is_relative_local(path_val: str) -> bool: def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: - """Helper to find a base directory containing target_sub_path by walking up.""" + """Helper to find a base directory containing a target sub-path by walking up. + + Starting from the absolute directory of `start_path`, this function recursively + checks if `target_sub_path` exists in the current folder. If not, it walks up the + parent directory tree up to 10 levels. This is crucial for resolving paths relative + to import-specific golden directories when tests/validation are run from + different working directories (such as the repository root in CI/CD). + + Args: + start_path: The file or directory path to start the upward search from. + target_sub_path: The name of the subdirectory or file (e.g., 'golden_data') + to search for within the parent tree. + + Returns: + The absolute path of the directory containing `target_sub_path` if found, + or None if the root was reached or the 10-level limit was exceeded. + """ if not start_path: return None curr = os.path.abspath(start_path) From 14409c496c517d8ddac67a37a64b8225f172cfcb Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 8 Jun 2026 13:41:18 +0000 Subject: [PATCH 04/24] modified validation --- tools/import_validation/Validations.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/import_validation/Validations.md b/tools/import_validation/Validations.md index 4efebb3a55..d46ece74fc 100644 --- a/tools/import_validation/Validations.md +++ b/tools/import_validation/Validations.md @@ -72,6 +72,8 @@ To generate goldens for the summary_report.csv to verify that all the expected StatVars are generated with the corresponding number of places and dates, run the following: +This will compare the golden files using summary_report.csv as the default input: + ```shell python3 validator_goldens.py \ --validate_goldens_input=summary_report.csv \ From 194a3bc311a19c4e617c96a7234ac74d0771a1a2 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Tue, 9 Jun 2026 16:10:32 +0000 Subject: [PATCH 05/24] testing --- tools/import_validation/runner.py | 9 +-- tools/import_validation/runner_test.py | 7 +-- tools/import_validation/validator_goldens.py | 59 +++++++------------ .../validator_goldens_test.py | 19 ------ 4 files changed, 25 insertions(+), 69 deletions(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index 0b33f7abb0..37ce44a5a6 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -39,9 +39,8 @@ def _is_relative_local(path_val: str) -> bool: """Checks if a path is a relative, local file path. This function identifies path strings that represent local relative files - (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths or remote/cloud - URIs. It filters out non-strings, empty strings, absolute local paths, - Google Cloud Storage URIs (gs://), and web URLs (http://, https://). + (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths. It filters + out non-strings, empty strings, and absolute local paths. Args: path_val: The file path string to evaluate. @@ -51,9 +50,7 @@ def _is_relative_local(path_val: str) -> bool: """ if not isinstance(path_val, str) or not path_val: return False - return (not os.path.isabs(path_val) and not path_val.startswith('gs://') and - not path_val.startswith('http://') and - not path_val.startswith('https://')) + return not os.path.isabs(path_val) def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: diff --git a/tools/import_validation/runner_test.py b/tools/import_validation/runner_test.py index 80e7508020..e1201eca69 100644 --- a/tools/import_validation/runner_test.py +++ b/tools/import_validation/runner_test.py @@ -405,9 +405,6 @@ def test_path_resolution_helpers_and_robustness(self, MockValidator): # Test helper functions self.assertTrue(_is_relative_local("some/relative/path.csv")) self.assertFalse(_is_relative_local("/absolute/path.csv")) - self.assertFalse(_is_relative_local("gs://bucket/path.csv")) - self.assertFalse(_is_relative_local("http://example.com/path.csv")) - self.assertFalse(_is_relative_local("https://example.com/path.csv")) self.assertFalse(_is_relative_local(None)) self.assertFalse(_is_relative_local(123)) @@ -432,7 +429,7 @@ def test_path_resolution_helpers_and_robustness(self, MockValidator): 'output_file': 'output/output_file.csv', 'custom_files': [ - 'output/custom_file.csv', 'gs://skip/this.csv' + 'output/custom_file.csv', '/skip/this.csv' ] } }] @@ -465,5 +462,5 @@ def test_path_resolution_helpers_and_robustness(self, MockValidator): os.path.join(self.test_dir.name, 'output/output_file.csv')) self.assertEqual(rule_params['custom_files'], [ os.path.join(self.test_dir.name, 'output/custom_file.csv'), - 'gs://skip/this.csv' + '/skip/this.csv' ]) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index c6f3efe599..4f8d8b8b32 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -298,53 +298,20 @@ def load_nodes_from_file(files: str) -> dict: # Nodes are keyed by their index in the combined loaded set. file_nodes = file_util.file_load_csv_dict(input_file, key_index=True) - # Check if any loaded node has None as a key, indicating an incorrect delimiter - # was auto-detected (e.g. splitting 'dcid:Earth' on colon ':') - has_delimiter_error = False for node in file_nodes.values(): - if None in node: - has_delimiter_error = True - break - - if has_delimiter_error: - import csv - with file_util.FileIO(input_file) as csvfile: - rawdata = csvfile.read() - if isinstance(rawdata, bytes): - encoding = file_util.file_get_encoding(input_file) - data_str = rawdata.decode(encoding) - else: - data_str = rawdata - reader = csv.DictReader(data_str.splitlines(), - delimiter=',') - file_nodes = {} - for row in reader: - file_nodes[len(file_nodes)] = dict(row) - - for node in file_nodes.values(): - # Clean up None/empty keys and strip whitespace from headers/keys and values to ensure robust parsing - cleaned_node = { - k.strip(): (v.strip() if isinstance(v, str) else v) - for k, v in node.items() - if k is not None and isinstance(k, str) and k.strip() != '' - } - nodes[len(nodes)] = cleaned_node + nodes[len(nodes)] = node else: # For MCF or JSON, we assume nodes are already keyed by DCID. file_nodes = mcf_file_util.load_mcf_nodes(input_file) for dcid, node in file_nodes.items(): - # Ensure the dcid is present in the node dictionary itself with a null-safe check. + # Ensure the dcid is present in the node dictionary itself. if 'dcid' not in node: - if dcid and isinstance(dcid, str): - node['dcid'] = mcf_file_util.strip_namespace(dcid) - else: - node['dcid'] = '' + node['dcid'] = mcf_file_util.strip_namespace(dcid) mcf_file_util.add_mcf_node(node, nodes) logging.info(f'Loaded {len(nodes)} nodes from {input_files}') return nodes - def generate_goldens(input_files: str, property_sets: list, output_file: str = None, @@ -473,9 +440,23 @@ def generate_goldens(input_files: str, if golden_nodes and output_file: logging.info(f'Writing {len(golden_nodes)} goldens to {output_file}') if file_util.file_is_csv(output_file): - file_util.file_write_csv_dict(golden_nodes, - output_file, - key_column_name=None) + headers = [] + for node in golden_nodes.values(): + for prop in node.keys(): + if prop not in headers: + headers.append(prop) + with file_util.FileIO(output_file, mode='w') as csvfile: + writer = csv.DictWriter( + csvfile, + fieldnames=headers, + escapechar='\\', + extrasaction='ignore', + quotechar='"', + quoting=csv.QUOTE_NONNUMERIC, + ) + writer.writeheader() + for node in golden_nodes.values(): + writer.writerow(node) else: mcf_file_util.write_mcf_nodes([golden_nodes], output_file) diff --git a/tools/import_validation/validator_goldens_test.py b/tools/import_validation/validator_goldens_test.py index 6fa4ab09ac..e486a8fe91 100644 --- a/tools/import_validation/validator_goldens_test.py +++ b/tools/import_validation/validator_goldens_test.py @@ -190,25 +190,6 @@ def test_validate_goldens(self, mock_file, mock_compare, mock_load): self.assertEqual(missing, []) mock_compare.assert_called_once() - def test_load_nodes_from_file_with_colon_values(self): - import tempfile - # Create a temporary CSV file with values containing colons and no commas - with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', - delete=False) as f: - f.write("header1\n") - f.write("value:1\n") - f.write("value:2\n") - temp_name = f.name - - try: - nodes = validator_goldens.load_nodes_from_file(temp_name) - # The nodes should be parsed using comma as delimiter, which correctly preserves values with colons - self.assertEqual(len(nodes), 2) - self.assertEqual(nodes[0], {'header1': 'value:1'}) - self.assertEqual(nodes[1], {'header1': 'value:2'}) - finally: - os.remove(temp_name) - if __name__ == '__main__': unittest.main() From aabebba716833239a6518b440260884ee24fcc33 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Tue, 9 Jun 2026 20:45:51 +0000 Subject: [PATCH 06/24] fix: strip dcid: namespace prefix from CSV values in golden loading --- tools/import_validation/validator_goldens.py | 7 +++++- .../validator_goldens_test.py | 22 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 4f8d8b8b32..54653367a8 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -299,7 +299,12 @@ def load_nodes_from_file(files: str) -> dict: file_nodes = file_util.file_load_csv_dict(input_file, key_index=True) for node in file_nodes.values(): - nodes[len(nodes)] = node + # Clean up "dcid:" prefixes from values (column headers are kept as is) + clean_node = {} + for k, v in node.items(): + clean_val = v[5:] if (isinstance(v, str) and v.startswith("dcid:")) else v + clean_node[k] = clean_val + nodes[len(nodes)] = clean_node else: # For MCF or JSON, we assume nodes are already keyed by DCID. file_nodes = mcf_file_util.load_mcf_nodes(input_file) diff --git a/tools/import_validation/validator_goldens_test.py b/tools/import_validation/validator_goldens_test.py index e486a8fe91..c2fa1b9ada 100644 --- a/tools/import_validation/validator_goldens_test.py +++ b/tools/import_validation/validator_goldens_test.py @@ -190,6 +190,28 @@ def test_validate_goldens(self, mock_file, mock_compare, mock_load): self.assertEqual(missing, []) mock_compare.assert_called_once() + @patch('validator_goldens.file_util') + def test_load_nodes_from_file_with_dcid_prefix_values(self, mock_file): + mock_file.file_get_matching.return_value = ['dummy.csv'] + mock_file.file_is_csv.return_value = True + mock_file.file_load_csv_dict.return_value = { + 0: { + 'observationAbout': 'dcid:Earth', + 'variableMeasured': 'dcid:Count_Person', + 'value': '10' + } + } + + nodes = validator_goldens.load_nodes_from_file('dummy.csv') + + self.assertEqual(len(nodes), 1) + # Verify that prefixes 'dcid:' have been stripped from values only + self.assertIn('observationAbout', nodes[0]) + self.assertEqual(nodes[0]['observationAbout'], 'Earth') + self.assertIn('variableMeasured', nodes[0]) + self.assertEqual(nodes[0]['variableMeasured'], 'Count_Person') + self.assertEqual(nodes[0]['value'], '10') + if __name__ == '__main__': unittest.main() From 6e36dcb848bfd630e2ba53384b5455a4bdb638ce Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Wed, 10 Jun 2026 05:24:37 +0000 Subject: [PATCH 07/24] docs: add descriptive comments to CSV writer configuration --- tools/import_validation/validator_goldens.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 54653367a8..62030d489b 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -445,12 +445,19 @@ def generate_goldens(input_files: str, if golden_nodes and output_file: logging.info(f'Writing {len(golden_nodes)} goldens to {output_file}') if file_util.file_is_csv(output_file): + # Gather all unique column headers across all generated golden nodes. + # This ensures that even if some nodes have differing schema properties, + # every single column is fully represented in the CSV header. headers = [] for node in golden_nodes.values(): for prop in node.keys(): if prop not in headers: headers.append(prop) with file_util.FileIO(output_file, mode='w') as csvfile: + # Use standard-compliant csv.DictWriter with QUOTE_NONNUMERIC. + # This automatically wraps all string fields (e.g. text containing commas) + # in double quotes to prevent column misalignment/corruption, while leaving + # numbers (like float and int values) unquoted. writer = csv.DictWriter( csvfile, fieldnames=headers, From 1161f179694283ac1d1698267b92c92a3fdd737d Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Wed, 10 Jun 2026 05:36:27 +0000 Subject: [PATCH 08/24] testing --- tools/import_validation/runner_test.py | 5 ++--- tools/import_validation/validator_goldens.py | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/import_validation/runner_test.py b/tools/import_validation/runner_test.py index e1201eca69..ace339a81e 100644 --- a/tools/import_validation/runner_test.py +++ b/tools/import_validation/runner_test.py @@ -428,9 +428,8 @@ def test_path_resolution_helpers_and_robustness(self, MockValidator): 'output/input_file.csv', 'output_file': 'output/output_file.csv', - 'custom_files': [ - 'output/custom_file.csv', '/skip/this.csv' - ] + 'custom_files': + ['output/custom_file.csv', '/skip/this.csv'] } }] }, f) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 62030d489b..0e78a334de 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -302,7 +302,8 @@ def load_nodes_from_file(files: str) -> dict: # Clean up "dcid:" prefixes from values (column headers are kept as is) clean_node = {} for k, v in node.items(): - clean_val = v[5:] if (isinstance(v, str) and v.startswith("dcid:")) else v + clean_val = v[5:] if (isinstance(v, str) and + v.startswith("dcid:")) else v clean_node[k] = clean_val nodes[len(nodes)] = clean_node else: @@ -317,6 +318,7 @@ def load_nodes_from_file(files: str) -> dict: logging.info(f'Loaded {len(nodes)} nodes from {input_files}') return nodes + def generate_goldens(input_files: str, property_sets: list, output_file: str = None, From ddf793b4b485aaad8a65b1b56b75246934437e24 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 11:36:04 +0000 Subject: [PATCH 09/24] testing --- tools/import_validation/validator_goldens.py | 32 +++----------------- util/file_util.py | 11 ++++--- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 0e78a334de..478f3a7853 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -68,7 +68,7 @@ --generate_goldens=goldens_data/generated_goldens.csv """ -import csv + import os import sys import tempfile @@ -302,8 +302,7 @@ def load_nodes_from_file(files: str) -> dict: # Clean up "dcid:" prefixes from values (column headers are kept as is) clean_node = {} for k, v in node.items(): - clean_val = v[5:] if (isinstance(v, str) and - v.startswith("dcid:")) else v + clean_val = v[5:] if (isinstance(v, str) and v.startswith("dcid:")) else v clean_node[k] = clean_val nodes[len(nodes)] = clean_node else: @@ -447,30 +446,9 @@ def generate_goldens(input_files: str, if golden_nodes and output_file: logging.info(f'Writing {len(golden_nodes)} goldens to {output_file}') if file_util.file_is_csv(output_file): - # Gather all unique column headers across all generated golden nodes. - # This ensures that even if some nodes have differing schema properties, - # every single column is fully represented in the CSV header. - headers = [] - for node in golden_nodes.values(): - for prop in node.keys(): - if prop not in headers: - headers.append(prop) - with file_util.FileIO(output_file, mode='w') as csvfile: - # Use standard-compliant csv.DictWriter with QUOTE_NONNUMERIC. - # This automatically wraps all string fields (e.g. text containing commas) - # in double quotes to prevent column misalignment/corruption, while leaving - # numbers (like float and int values) unquoted. - writer = csv.DictWriter( - csvfile, - fieldnames=headers, - escapechar='\\', - extrasaction='ignore', - quotechar='"', - quoting=csv.QUOTE_NONNUMERIC, - ) - writer.writeheader() - for node in golden_nodes.values(): - writer.writerow(node) + file_util.file_write_csv_dict(golden_nodes, + output_file, + key_column_name=None) else: mcf_file_util.write_mcf_nodes([golden_nodes], output_file) diff --git a/util/file_util.py b/util/file_util.py index c255bc365a..7ff1c9c2f3 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -674,9 +674,12 @@ def file_write_csv_dict(py_dict: dict, if col not in columns: columns.append(col) if len(columns) == 1: - # Value is not a dict. Write it as a column name value. - value_column_name = 'value' - columns.append(value_column_name) + # Check if values are dicts. If they are, it's not a primitive value. + is_value_dict = any(isinstance(value, dict) for value in py_dict.values()) + if not is_value_dict: + # Value is not a dict. Write it as a column name value. + value_column_name = 'value' + columns.append(value_column_name) # Use the first column for the key. if key_column_name == '': key_column_name = columns[0] @@ -1052,7 +1055,7 @@ def file_get_csv_reader_options( default_options: dict = {}, data: str = None, encoding: str = None, - delim_chars: list = [',', ' ', ';', '|', ':']) -> dict: + delim_chars: list = [',', ' ', ';', '|', ':']) -> dict: """Returns a dictionary with options for the CSV file reader. Args: From 5315183f2f6fec2a6075aa958e1a9c1e31ec497b Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 11:40:49 +0000 Subject: [PATCH 10/24] revert: undo changes to validator_goldens_test.py --- .../validator_goldens_test.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/tools/import_validation/validator_goldens_test.py b/tools/import_validation/validator_goldens_test.py index c2fa1b9ada..e486a8fe91 100644 --- a/tools/import_validation/validator_goldens_test.py +++ b/tools/import_validation/validator_goldens_test.py @@ -190,28 +190,6 @@ def test_validate_goldens(self, mock_file, mock_compare, mock_load): self.assertEqual(missing, []) mock_compare.assert_called_once() - @patch('validator_goldens.file_util') - def test_load_nodes_from_file_with_dcid_prefix_values(self, mock_file): - mock_file.file_get_matching.return_value = ['dummy.csv'] - mock_file.file_is_csv.return_value = True - mock_file.file_load_csv_dict.return_value = { - 0: { - 'observationAbout': 'dcid:Earth', - 'variableMeasured': 'dcid:Count_Person', - 'value': '10' - } - } - - nodes = validator_goldens.load_nodes_from_file('dummy.csv') - - self.assertEqual(len(nodes), 1) - # Verify that prefixes 'dcid:' have been stripped from values only - self.assertIn('observationAbout', nodes[0]) - self.assertEqual(nodes[0]['observationAbout'], 'Earth') - self.assertIn('variableMeasured', nodes[0]) - self.assertEqual(nodes[0]['variableMeasured'], 'Count_Person') - self.assertEqual(nodes[0]['value'], '10') - if __name__ == '__main__': unittest.main() From 8d80c46037580678fae7f18cefaa9f5980aa1795 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 11:50:35 +0000 Subject: [PATCH 11/24] style: clean up whitespace in validator_goldens.py --- tools/import_validation/validator_goldens.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 478f3a7853..c8ba12f44e 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -68,7 +68,6 @@ --generate_goldens=goldens_data/generated_goldens.csv """ - import os import sys import tempfile From c3eff23b6ff5d6e2c10bcb4002655ace53a66384 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 11:51:14 +0000 Subject: [PATCH 12/24] refactor: reduce base directory search depth limit to 8 in runner.py --- tools/import_validation/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index 37ce44a5a6..a6c588faf9 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -74,7 +74,7 @@ def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: if not start_path: return None curr = os.path.abspath(start_path) - for _ in range(10): # limit to 10 levels up + for _ in range(8): # limit to 8 levels up if os.path.exists(os.path.join(curr, target_sub_path)): return curr parent = os.path.dirname(curr) From cab980d97d9113848c7bd26e504afb23899590d6 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 11:57:41 +0000 Subject: [PATCH 13/24] revert: restore tab character in delim_chars in file_util.py --- util/file_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/file_util.py b/util/file_util.py index 7ff1c9c2f3..9c0f0d744f 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -1055,7 +1055,7 @@ def file_get_csv_reader_options( default_options: dict = {}, data: str = None, encoding: str = None, - delim_chars: list = [',', ' ', ';', '|', ':']) -> dict: + delim_chars: list = [',', '\t', ';', '|', ':']) -> dict: """Returns a dictionary with options for the CSV file reader. Args: From 2701c54e0ede24141c8965039355d4f610a199e6 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 12:00:59 +0000 Subject: [PATCH 14/24] style: restore literal tab character in delim_chars in file_util.py --- util/file_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/file_util.py b/util/file_util.py index 9c0f0d744f..1e57a4c7ef 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -1055,7 +1055,7 @@ def file_get_csv_reader_options( default_options: dict = {}, data: str = None, encoding: str = None, - delim_chars: list = [',', '\t', ';', '|', ':']) -> dict: + delim_chars: list = [',', ' ', ';', '|', ':']) -> dict: """Returns a dictionary with options for the CSV file reader. Args: From b3d2053453359d21304341eb16e1f8b2a54b1f31 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 17:47:20 +0000 Subject: [PATCH 15/24] testing --- tools/import_validation/validator_goldens.py | 3 ++- util/file_util.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index c8ba12f44e..b8f0b7f53b 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -301,7 +301,8 @@ def load_nodes_from_file(files: str) -> dict: # Clean up "dcid:" prefixes from values (column headers are kept as is) clean_node = {} for k, v in node.items(): - clean_val = v[5:] if (isinstance(v, str) and v.startswith("dcid:")) else v + clean_val = v[5:] if (isinstance(v, str) and + v.startswith("dcid:")) else v clean_node[k] = clean_val nodes[len(nodes)] = clean_node else: diff --git a/util/file_util.py b/util/file_util.py index 1e57a4c7ef..fae21b6c72 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -675,7 +675,8 @@ def file_write_csv_dict(py_dict: dict, columns.append(col) if len(columns) == 1: # Check if values are dicts. If they are, it's not a primitive value. - is_value_dict = any(isinstance(value, dict) for value in py_dict.values()) + is_value_dict = any( + isinstance(value, dict) for value in py_dict.values()) if not is_value_dict: # Value is not a dict. Write it as a column name value. value_column_name = 'value' From 8f7e9e71953159d2a68c9d2e728d22e1bda9e574 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Sun, 14 Jun 2026 18:18:45 +0000 Subject: [PATCH 16/24] testing --- tools/import_validation/runner.py | 5 ++--- tools/import_validation/validator_goldens.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index a6c588faf9..6e6a61d089 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -14,7 +14,6 @@ """Module for the ValidationRunner class.""" import os -import logging from absl import app from absl import flags from absl import logging @@ -58,7 +57,7 @@ def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: Starting from the absolute directory of `start_path`, this function recursively checks if `target_sub_path` exists in the current folder. If not, it walks up the - parent directory tree up to 10 levels. This is crucial for resolving paths relative + parent directory tree up to 8 levels. This is crucial for resolving paths relative to import-specific golden directories when tests/validation are run from different working directories (such as the repository root in CI/CD). @@ -69,7 +68,7 @@ def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: Returns: The absolute path of the directory containing `target_sub_path` if found, - or None if the root was reached or the 10-level limit was exceeded. + or None if the root was reached or the 8-level limit was exceeded. """ if not start_path: return None diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index b8f0b7f53b..48e2937968 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -301,8 +301,7 @@ def load_nodes_from_file(files: str) -> dict: # Clean up "dcid:" prefixes from values (column headers are kept as is) clean_node = {} for k, v in node.items(): - clean_val = v[5:] if (isinstance(v, str) and - v.startswith("dcid:")) else v + clean_val = v.removeprefix("dcid:") if isinstance(v, str) else v clean_node[k] = clean_val nodes[len(nodes)] = clean_node else: From 298cc57f3bee7311621cd5edae1dce41ec3ca971 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 07:01:34 +0000 Subject: [PATCH 17/24] testing --- tools/import_validation/runner.py | 51 +------------------------- tools/import_validation/runner_test.py | 2 +- tools/import_validation/util.py | 50 +++++++++++++++++++++++++ util/file_util.py | 6 +-- 4 files changed, 54 insertions(+), 55 deletions(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index 6e6a61d089..8517753f2a 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -29,60 +29,11 @@ from report_generator import ReportGenerator from validator import Validator from result import ValidationResult, ValidationStatus -from util import filter_dataframe +from util import filter_dataframe, _is_relative_local, _find_base_dir _FLAGS = flags.FLAGS -def _is_relative_local(path_val: str) -> bool: - """Checks if a path is a relative, local file path. - - This function identifies path strings that represent local relative files - (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths. It filters - out non-strings, empty strings, and absolute local paths. - - Args: - path_val: The file path string to evaluate. - - Returns: - True if the path represents a relative, local file path; False otherwise. - """ - if not isinstance(path_val, str) or not path_val: - return False - return not os.path.isabs(path_val) - - -def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: - """Helper to find a base directory containing a target sub-path by walking up. - - Starting from the absolute directory of `start_path`, this function recursively - checks if `target_sub_path` exists in the current folder. If not, it walks up the - parent directory tree up to 8 levels. This is crucial for resolving paths relative - to import-specific golden directories when tests/validation are run from - different working directories (such as the repository root in CI/CD). - - Args: - start_path: The file or directory path to start the upward search from. - target_sub_path: The name of the subdirectory or file (e.g., 'golden_data') - to search for within the parent tree. - - Returns: - The absolute path of the directory containing `target_sub_path` if found, - or None if the root was reached or the 8-level limit was exceeded. - """ - if not start_path: - return None - curr = os.path.abspath(start_path) - for _ in range(8): # limit to 8 levels up - if os.path.exists(os.path.join(curr, target_sub_path)): - return curr - parent = os.path.dirname(curr) - if parent == curr: - break - curr = parent - return None - - class ValidationRunner: """ Orchestrates the validation process based on the new schema. diff --git a/tools/import_validation/runner_test.py b/tools/import_validation/runner_test.py index ace339a81e..690e6dd9ef 100644 --- a/tools/import_validation/runner_test.py +++ b/tools/import_validation/runner_test.py @@ -400,7 +400,7 @@ def test_runner_resolves_goldens_check_paths(self, MockValidator): @patch('tools.import_validation.runner.Validator') def test_path_resolution_helpers_and_robustness(self, MockValidator): """Tests that relative local path helpers work and robust path resolution handles ends-with keys.""" - from tools.import_validation.runner import _is_relative_local, _find_base_dir + from tools.import_validation.util import _is_relative_local, _find_base_dir # Test helper functions self.assertTrue(_is_relative_local("some/relative/path.csv")) diff --git a/tools/import_validation/util.py b/tools/import_validation/util.py index c26675eb8e..ca2f8d9ddb 100644 --- a/tools/import_validation/util.py +++ b/tools/import_validation/util.py @@ -13,6 +13,7 @@ # limitations under the License. """Utility functions for the import validation tool.""" +import os import pandas as pd import re @@ -108,3 +109,52 @@ def filter_dataframe(df: pd.DataFrame, matching_indices.update(df[combined_condition].index) return df.loc[sorted(list(matching_indices))] + + +def _is_relative_local(path_val: str) -> bool: + """Checks if a path is a relative, local file path. + + This function identifies path strings that represent local relative files + (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths. It filters + out non-strings, empty strings, and absolute local paths. + + Args: + path_val: The file path string to evaluate. + + Returns: + True if the path represents a relative, local file path; False otherwise. + """ + if not isinstance(path_val, str) or not path_val: + return False + return not os.path.isabs(path_val) + + +def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: + """Helper to find a base directory containing a target sub-path by walking up. + + Starting from the absolute directory of `start_path`, this function recursively + checks if `target_sub_path` exists in the current folder. If not, it walks up the + parent directory tree up to 8 levels. This is crucial for resolving paths relative + to import-specific golden directories when tests/validation are run from + different working directories (such as the repository root in CI/CD). + + Args: + start_path: The file or directory path to start the upward search from. + target_sub_path: The name of the subdirectory or file (e.g., 'golden_data') + to search for within the parent tree. + + Returns: + The absolute path of the directory containing `target_sub_path` if found, + or None if the root was reached or the 8-level limit was exceeded. + """ + if not start_path: + return None + curr = os.path.abspath(start_path) + for _ in range(8): # limit to 8 levels up + if os.path.exists(os.path.join(curr, target_sub_path)): + return curr + parent = os.path.dirname(curr) + if parent == curr: + break + curr = parent + return None diff --git a/util/file_util.py b/util/file_util.py index fae21b6c72..0371a67adc 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -619,7 +619,7 @@ def file_write_csv_dict(py_dict: dict, filename: str, columns: list = None, key_column_name: str = 'key') -> list: - """Returns the filename after writing py_dict with a csv row per item. + """Returns the list of columns after writing py_dict with a csv row per item. Each dictionary items is written as a row in the CSV file. @@ -675,9 +675,7 @@ def file_write_csv_dict(py_dict: dict, columns.append(col) if len(columns) == 1: # Check if values are dicts. If they are, it's not a primitive value. - is_value_dict = any( - isinstance(value, dict) for value in py_dict.values()) - if not is_value_dict: + if not any(isinstance(value, dict) for value in py_dict.values()): # Value is not a dict. Write it as a column name value. value_column_name = 'value' columns.append(value_column_name) From a889f0e57dd27291c5aed70a1c17a2e50d92bc9a Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 07:10:27 +0000 Subject: [PATCH 18/24] revert: restore original runner_test.py to remove it from PR --- tools/import_validation/runner_test.py | 113 ------------------------- 1 file changed, 113 deletions(-) diff --git a/tools/import_validation/runner_test.py b/tools/import_validation/runner_test.py index 690e6dd9ef..76c5bc1d0b 100644 --- a/tools/import_validation/runner_test.py +++ b/tools/import_validation/runner_test.py @@ -350,116 +350,3 @@ def test_init_raises_error_if_required_file_is_missing(self): lint_report=self.report_path, validation_output=self.output_path) self.assertIn("'stats' data source", str(context.exception)) - - @patch('tools.import_validation.runner.Validator') - def test_runner_resolves_goldens_check_paths(self, MockValidator): - """Tests that GOLDENS_CHECK validator resolves relative paths correctly.""" - # 1. Setup the mock - mock_validator_instance = MockValidator.return_value - mock_validator_instance.validate_goldens.return_value = ValidationResult( - ValidationStatus.PASSED, 'GOLDENS_CHECK') - - # 2. Create the config file with relative paths - with open(self.config_path, 'w') as f: - json.dump( - { - 'rules': [{ - 'rule_id': 'check_goldens_output_csv', - 'validator': 'GOLDENS_CHECK', - 'scope': {}, - 'params': { - 'golden_files': 'golden_data/golden_file.csv', - 'input_files': 'output/input_file.csv' - } - }] - }, f) - pd.DataFrame({'StatVar': ['sv1']}).to_csv(self.stats_path, index=False) - - # 3. Setup a dummy golden_data folder inside the temp dir to trigger base dir resolution - golden_dir = os.path.join(self.test_dir.name, 'golden_data') - os.makedirs(golden_dir, exist_ok=True) - - runner = ValidationRunner(validation_config_path=self.config_path, - stats_summary=self.stats_path, - differ_output=self.differ_path, - lint_report=self.report_path, - validation_output=self.output_path) - runner.run_validations() - - # 4. Assert that validate_goldens was called with resolved paths - mock_validator_instance.validate_goldens.assert_called_once() - args, kwargs = mock_validator_instance.validate_goldens.call_args - rule_params = args[1] - expected_golden = os.path.join(self.test_dir.name, - 'golden_data/golden_file.csv') - expected_input = os.path.join(self.test_dir.name, - 'output/input_file.csv') - self.assertEqual(rule_params['golden_files'], expected_golden) - self.assertEqual(rule_params['input_files'], expected_input) - - @patch('tools.import_validation.runner.Validator') - def test_path_resolution_helpers_and_robustness(self, MockValidator): - """Tests that relative local path helpers work and robust path resolution handles ends-with keys.""" - from tools.import_validation.util import _is_relative_local, _find_base_dir - - # Test helper functions - self.assertTrue(_is_relative_local("some/relative/path.csv")) - self.assertFalse(_is_relative_local("/absolute/path.csv")) - self.assertFalse(_is_relative_local(None)) - self.assertFalse(_is_relative_local(123)) - - # Setup mock validator - mock_validator_instance = MockValidator.return_value - mock_validator_instance.validate_goldens.return_value = ValidationResult( - ValidationStatus.PASSED, 'GOLDENS_CHECK') - - # Create config file with various parameter keys (like output_file) that contain paths - with open(self.config_path, 'w') as f: - json.dump( - { - 'rules': [{ - 'rule_id': 'check_goldens_output_csv', - 'validator': 'GOLDENS_CHECK', - 'scope': {}, - 'params': { - 'golden_files': - 'golden_data/golden_file.csv', - 'input_files': - 'output/input_file.csv', - 'output_file': - 'output/output_file.csv', - 'custom_files': - ['output/custom_file.csv', '/skip/this.csv'] - } - }] - }, f) - pd.DataFrame({'StatVar': ['sv1']}).to_csv(self.stats_path, index=False) - - # Setup dummy golden_data folder - golden_dir = os.path.join(self.test_dir.name, 'golden_data') - os.makedirs(golden_dir, exist_ok=True) - - runner = ValidationRunner(validation_config_path=self.config_path, - stats_summary=self.stats_path, - differ_output=self.differ_path, - lint_report=self.report_path, - validation_output=self.output_path) - runner.run_validations() - - # Check path resolution for the parameters - args, kwargs = mock_validator_instance.validate_goldens.call_args - rule_params = args[1] - - self.assertEqual( - rule_params['golden_files'], - os.path.join(self.test_dir.name, 'golden_data/golden_file.csv')) - self.assertEqual( - rule_params['input_files'], - os.path.join(self.test_dir.name, 'output/input_file.csv')) - self.assertEqual( - rule_params['output_file'], - os.path.join(self.test_dir.name, 'output/output_file.csv')) - self.assertEqual(rule_params['custom_files'], [ - os.path.join(self.test_dir.name, 'output/custom_file.csv'), - '/skip/this.csv' - ]) From 44bdc78251c2d8b854d1b43c91a2f4b5ab3d16a5 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 10:01:20 +0000 Subject: [PATCH 19/24] refactor: inline path resolution in runner.py and remove util.py from PR --- tools/import_validation/runner.py | 26 +++++++--------- tools/import_validation/util.py | 49 ------------------------------- 2 files changed, 11 insertions(+), 64 deletions(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index 8517753f2a..a852ff812d 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -29,7 +29,7 @@ from report_generator import ReportGenerator from validator import Validator from result import ValidationResult, ValidationStatus -from util import filter_dataframe, _is_relative_local, _find_base_dir +from util import filter_dataframe _FLAGS = flags.FLAGS @@ -216,19 +216,15 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: # Resolve paths relative to the directory of the validation config. if validator_name == 'GOLDENS_CHECK': - config_dir = None - # Walk up from validation_config_path, self.stats_summary, or CWD to find where 'golden_data' lives - for start in [ - self.validation_config_path, self.stats_summary, - os.getcwd() - ]: - config_dir = _find_base_dir(start, 'golden_data') - if config_dir: + config_dir = os.path.dirname( + os.path.abspath(self.validation_config_path)) + # We walk up to find where the golden_data folder is situated. + curr = config_dir + while curr and curr != os.path.dirname(curr): + if os.path.exists(os.path.join(curr, 'golden_data')): + config_dir = curr break - - if not config_dir: - config_dir = os.path.dirname( - os.path.abspath(self.validation_config_path)) + curr = os.path.dirname(curr) print( f"DEBUG: Found GOLDENS_CHECK rule: '{rule.get('rule_id')}'" @@ -245,13 +241,13 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: print( f"DEBUG: Before resolve '{path_key}': '{val}'") if isinstance(val, str): - if _is_relative_local(val): + if val and not os.path.isabs(val): rule_params[path_key] = os.path.join( config_dir, val) elif isinstance(val, list): rule_params[path_key] = [ os.path.join(config_dir, item) - if _is_relative_local(item) else item + if isinstance(item, str) and item and not os.path.isabs(item) else item for item in val ] print( diff --git a/tools/import_validation/util.py b/tools/import_validation/util.py index ca2f8d9ddb..7cd5ecc6da 100644 --- a/tools/import_validation/util.py +++ b/tools/import_validation/util.py @@ -109,52 +109,3 @@ def filter_dataframe(df: pd.DataFrame, matching_indices.update(df[combined_condition].index) return df.loc[sorted(list(matching_indices))] - - -def _is_relative_local(path_val: str) -> bool: - """Checks if a path is a relative, local file path. - - This function identifies path strings that represent local relative files - (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths. It filters - out non-strings, empty strings, and absolute local paths. - - Args: - path_val: The file path string to evaluate. - - Returns: - True if the path represents a relative, local file path; False otherwise. - """ - if not isinstance(path_val, str) or not path_val: - return False - return not os.path.isabs(path_val) - - -def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: - """Helper to find a base directory containing a target sub-path by walking up. - - Starting from the absolute directory of `start_path`, this function recursively - checks if `target_sub_path` exists in the current folder. If not, it walks up the - parent directory tree up to 8 levels. This is crucial for resolving paths relative - to import-specific golden directories when tests/validation are run from - different working directories (such as the repository root in CI/CD). - - Args: - start_path: The file or directory path to start the upward search from. - target_sub_path: The name of the subdirectory or file (e.g., 'golden_data') - to search for within the parent tree. - - Returns: - The absolute path of the directory containing `target_sub_path` if found, - or None if the root was reached or the 8-level limit was exceeded. - """ - if not start_path: - return None - curr = os.path.abspath(start_path) - for _ in range(8): # limit to 8 levels up - if os.path.exists(os.path.join(curr, target_sub_path)): - return curr - parent = os.path.dirname(curr) - if parent == curr: - break - curr = parent - return None From 800087ebde257dd48b7a96618b5ed46dc14d7b10 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 10:05:07 +0000 Subject: [PATCH 20/24] feat: add path-resolution helper functions to util.py --- tools/import_validation/util.py | 49 +++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tools/import_validation/util.py b/tools/import_validation/util.py index 7cd5ecc6da..ca2f8d9ddb 100644 --- a/tools/import_validation/util.py +++ b/tools/import_validation/util.py @@ -109,3 +109,52 @@ def filter_dataframe(df: pd.DataFrame, matching_indices.update(df[combined_condition].index) return df.loc[sorted(list(matching_indices))] + + +def _is_relative_local(path_val: str) -> bool: + """Checks if a path is a relative, local file path. + + This function identifies path strings that represent local relative files + (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths. It filters + out non-strings, empty strings, and absolute local paths. + + Args: + path_val: The file path string to evaluate. + + Returns: + True if the path represents a relative, local file path; False otherwise. + """ + if not isinstance(path_val, str) or not path_val: + return False + return not os.path.isabs(path_val) + + +def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: + """Helper to find a base directory containing a target sub-path by walking up. + + Starting from the absolute directory of `start_path`, this function recursively + checks if `target_sub_path` exists in the current folder. If not, it walks up the + parent directory tree up to 8 levels. This is crucial for resolving paths relative + to import-specific golden directories when tests/validation are run from + different working directories (such as the repository root in CI/CD). + + Args: + start_path: The file or directory path to start the upward search from. + target_sub_path: The name of the subdirectory or file (e.g., 'golden_data') + to search for within the parent tree. + + Returns: + The absolute path of the directory containing `target_sub_path` if found, + or None if the root was reached or the 8-level limit was exceeded. + """ + if not start_path: + return None + curr = os.path.abspath(start_path) + for _ in range(8): # limit to 8 levels up + if os.path.exists(os.path.join(curr, target_sub_path)): + return curr + parent = os.path.dirname(curr) + if parent == curr: + break + curr = parent + return None From 41ee2bdabf05be68520005d5923cbd87440a8ed0 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 10:10:47 +0000 Subject: [PATCH 21/24] testing --- tools/import_validation/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/import_validation/util.py b/tools/import_validation/util.py index ca2f8d9ddb..74d970c6a5 100644 --- a/tools/import_validation/util.py +++ b/tools/import_validation/util.py @@ -13,7 +13,6 @@ # limitations under the License. """Utility functions for the import validation tool.""" -import os import pandas as pd import re From 8958e7e1e6949aeb566247c164c64637b4a8654e Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 10:12:19 +0000 Subject: [PATCH 22/24] testing --- tools/import_validation/util.py | 51 +-------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/tools/import_validation/util.py b/tools/import_validation/util.py index 74d970c6a5..5d1a1703ae 100644 --- a/tools/import_validation/util.py +++ b/tools/import_validation/util.py @@ -107,53 +107,4 @@ def filter_dataframe(df: pd.DataFrame, case=False) matching_indices.update(df[combined_condition].index) - return df.loc[sorted(list(matching_indices))] - - -def _is_relative_local(path_val: str) -> bool: - """Checks if a path is a relative, local file path. - - This function identifies path strings that represent local relative files - (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths. It filters - out non-strings, empty strings, and absolute local paths. - - Args: - path_val: The file path string to evaluate. - - Returns: - True if the path represents a relative, local file path; False otherwise. - """ - if not isinstance(path_val, str) or not path_val: - return False - return not os.path.isabs(path_val) - - -def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: - """Helper to find a base directory containing a target sub-path by walking up. - - Starting from the absolute directory of `start_path`, this function recursively - checks if `target_sub_path` exists in the current folder. If not, it walks up the - parent directory tree up to 8 levels. This is crucial for resolving paths relative - to import-specific golden directories when tests/validation are run from - different working directories (such as the repository root in CI/CD). - - Args: - start_path: The file or directory path to start the upward search from. - target_sub_path: The name of the subdirectory or file (e.g., 'golden_data') - to search for within the parent tree. - - Returns: - The absolute path of the directory containing `target_sub_path` if found, - or None if the root was reached or the 8-level limit was exceeded. - """ - if not start_path: - return None - curr = os.path.abspath(start_path) - for _ in range(8): # limit to 8 levels up - if os.path.exists(os.path.join(curr, target_sub_path)): - return curr - parent = os.path.dirname(curr) - if parent == curr: - break - curr = parent - return None + return df.loc[sorted(list(matching_indices))] \ No newline at end of file From 14b56309995a5c25a2a9a125883f1e8d321de899 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 10:13:54 +0000 Subject: [PATCH 23/24] testing --- tools/import_validation/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/import_validation/util.py b/tools/import_validation/util.py index 5d1a1703ae..c26675eb8e 100644 --- a/tools/import_validation/util.py +++ b/tools/import_validation/util.py @@ -107,4 +107,4 @@ def filter_dataframe(df: pd.DataFrame, case=False) matching_indices.update(df[combined_condition].index) - return df.loc[sorted(list(matching_indices))] \ No newline at end of file + return df.loc[sorted(list(matching_indices))] From d0fdae5d4db7f3391ce84348eedc802ed26824b1 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Mon, 15 Jun 2026 11:48:32 +0000 Subject: [PATCH 24/24] testing --- tools/import_validation/runner.py | 3 ++- tools/import_validation/validator_goldens.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index a852ff812d..964ed98285 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -247,7 +247,8 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: elif isinstance(val, list): rule_params[path_key] = [ os.path.join(config_dir, item) - if isinstance(item, str) and item and not os.path.isabs(item) else item + if isinstance(item, str) and item and + not os.path.isabs(item) else item for item in val ] print( diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 48e2937968..82c1781ad9 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -301,7 +301,8 @@ def load_nodes_from_file(files: str) -> dict: # Clean up "dcid:" prefixes from values (column headers are kept as is) clean_node = {} for k, v in node.items(): - clean_val = v.removeprefix("dcid:") if isinstance(v, str) else v + clean_val = v.removeprefix("dcid:") if isinstance( + v, str) else v clean_node[k] = clean_val nodes[len(nodes)] = clean_node else: