diff --git a/scripts/world_bank/wdi/README.md b/scripts/world_bank/wdi/README.md index ef1f0f5dc6..239d0f7277 100644 --- a/scripts/world_bank/wdi/README.md +++ b/scripts/world_bank/wdi/README.md @@ -146,5 +146,24 @@ If you want to perform "only download", run the below command: python3 worldbank.py --mode=download ``` +### Golden files and validation thresholds in `validation_config.json` + +The `GOLDENS_CHECK` validator confirms that the import includes a specific set of expected records. This is useful for verifying that critical StatVars, Places, or specific metadata combinations are consistently present in the output. + +The validator compares the input data (usually from the stats data source) against one or more "golden" files (MCF or CSV). + +If any combination of values in a golden file row is missing from the input, the validation fails. The missing golden rows are then listed in the validation report JSON. + +To generate the golden files, run the commands below: +```bash +# Generate goldens from the output CSV +python3 validator_goldens.py --validate_goldens_input=../../scripts/world_bank/wdi/output/WorldBank.csv --generate_goldens=golden_data/golden_observations.csv --goldens_must_include="ISO3166Alpha3:gs://unresolved_mcf/import_validation/top_100k_places.csv" --generate_goldens_property_sets="ISO3166Alpha3" +``` + +```bash +# Generate goldens from the summary report +python3 validator_goldens.py --validate_goldens_input="summary_report.csv" --generate_goldens=golden_data/golden_summary_report.csv --generate_goldens_property_sets="StatVar|Units|MinDate|MeasurementMethods|observationPeriod" +``` + We highly recommend the use of the import validation tool for this import which you can find in https://github.com/datacommonsorg/tools/tree/master/import-validation-helper. 
diff --git a/scripts/world_bank/wdi/golden_data/golden_summary_report.csv b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv new file mode 100644 index 0000000000..bf2d3335a1 --- /dev/null +++ b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv @@ -0,0 +1,71 @@ +"NumPlaces","StatVar","ScalingFactors","MeasurementMethods","Units","observationPeriods","MinDate" +"186","Count_Death_IntentionalSelfHarm_Male_AsFractionOf_Count_Person_Male","[]","[]","[Per100000Males]","[P1Y]","2000" +"203","Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity","[]","[]","[InternationalDollar]","[P1Y]","1990" +"165","Count_Person_Upto4Years_Wasting_AsFractionOf_Count_Person_Upto4Years","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1983" +"144","Count_Person_25OrMoreYears_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1994" +"204","Amount_Emissions_CarbonDioxide_PerCapita","[]","[]","[MetricTon]","[P1Y]","1970" +"184","Count_Person_25OrMoreYears_Male_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1970" +"218","LifeExpectancy_Person_Female","[]","[]","[Year]","[P1Y]","1960" +"139","Count_Person_25OrMoreYears_Male_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1994" +"197","Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person","[]","[]","[Per100000Persons]","[P1Y]","1990" +"194","Amount_EconomicActivity_ExpenditureActivity_HealthcareExpenditure_AsFractionOf_Count_Person","[]","[]","[InternationalDollar, USDollar]","[P1Y]","2000" +"202","Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_Government","[100]","[]","[Percent]","[P1Y]","1980" +"188","Count_Person_25OrMoreYears_Male_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1970" 
+"218","FertilityRate_Person_Female","[]","[]","[]","[]","1960" +"218","Count_Person_Rural","[]","[WorldBankEstimate]","[]","[P1Y]","1960" +"183","Count_Person_25OrMoreYears_Female_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1970" +"218","Count_Person_Urban","[]","[WorldBankEstimate]","[]","[P1Y]","1960" +"165","Count_Person_Upto4Years_Overweight_AsFractionOf_Count_Person_Upto4Years","[]","[]","[]","[P1Y]","1983" +"218","LifeExpectancy_Person_Male","[]","[]","[Year]","[P1Y]","1960" +"218","Count_BirthEvent_LiveBirth_AsFractionOf_Count_Person","[]","[]","[Per1000Persons]","[P1Y]","1960" +"197","MortalityRate_Person_Upto4Years_AsFractionOf_Count_BirthEvent_LiveBirth","[]","[]","[Per1000LiveBirths]","[P1Y]","1960" +"218","Count_Person","[]","[]","[]","[P1Y]","1960" +"160","Count_Person_Upto4Years_Male_Wasting_AsFractionOf_Count_Person_Upto4Years_Male","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"204","Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[100]","[]","[Percent]","[P1Y]","1970" +"188","Count_Person_25OrMoreYears_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1970" +"165","Count_Person_15OrMoreYears_Female_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Female","[]","[AgeAdjustedPrevalence]","[]","[P1Y]","2000" +"165","Count_Person_15OrMoreYears_Smoking_AsFractionOf_Count_Person_15OrMoreYears","[]","[AgeAdjustedPrevalence]","[]","[P1Y]","2000" +"203","Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity_PerCapita","[]","[]","[InternationalDollar]","[P1Y]","1990" +"160","Count_Person_Upto4Years_Male_Overweight_AsFractionOf_Count_Person_Upto4Years_Male","[]","[]","[]","[P1Y]","1986" 
+"195","Amount_EconomicActivity_ExpenditureActivity_TertiaryEducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government","[]","[]","[]","[P1Y]","1970" +"159","Count_Person_Upto4Years_Male_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Male","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"151","Amount_Consumption_Electricity_PerCapita","[]","[]","[KilowattHour]","[P1Y]","1990" +"180","Amount_Consumption_Energy_PerCapita","[]","[]","[KilogramOfOilEquivalent]","[P1Y]","1990" +"186","Count_Death_IntentionalSelfHarm_Female_AsFractionOf_Count_Person_Female","[]","[]","[Per100000Females]","[P1Y]","2000" +"165","Count_Person_15OrMoreYears_Male_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Male","[]","[AgeAdjustedPrevalence]","[]","[P1Y]","2000" +"149","Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male","[]","[]","[Per100000Males]","[P1Y]","1990" +"200","Amount_Remittance_InwardRemittance_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[100]","[WorldBankEstimate]","[Percent]","[P1Y]","1970" +"188","Count_Person_15To64Years_InLaborForce_AsFractionOf_Count_Person_15To64Years","[]","[]","[]","[P1Y]","1990" +"171","GiniIndex_EconomicActivity","[]","[WorldBankEstimate]","[]","[P1Y]","1963" +"162","Count_Person_25OrMoreYears_Female_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1990" +"170","Count_Person_25OrMoreYears_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1990" +"152","Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female","[]","[]","[Per100000Females]","[P1Y]","1990" +"188","Count_Person_15To64Years_Female_InLaborForce_AsFractionOf_Count_Person_15To64Years_Female","[]","[]","[]","[P1Y]","1990" 
+"104","Amount_Stock_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[100]","[]","[Percent]","[P1Y]","1975" +"131","Count_Person_25OrMoreYears_Female_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1994" +"215","GrowthRate_Amount_EconomicActivity_GrossDomesticProduction","[]","[]","[]","[P1Y]","1961" +"218","Count_Death_AsAFractionOfCount_Person","[]","[WorldBankWeightedAverage]","[Per1000Persons]","[P1Y]","1960" +"215","Amount_EconomicActivity_GrossDomesticProduction_Nominal","[]","[]","[USDollar]","[P1Y]","1960" +"188","Count_Person_15To64Years_Male_InLaborForce_AsFractionOf_Count_Person_15To64Years_Male","[]","[]","[]","[P1Y]","1990" +"200","Amount_Remittance_InwardRemittance","[]","[WorldBankEstimate]","[USDollar]","[P1Y]","1970" +"161","Count_Person_Upto4Years_SevereWasting_AsFractionOf_Count_Person_Upto4Years","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1983" +"188","Count_Person_25OrMoreYears_Female_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1970" +"167","Count_Person_25OrMoreYears_Male_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1990" +"188","Amount_Consumption_Alcohol_15OrMoreYears_AsFractionOf_Count_Person_15OrMoreYears","[]","[WorldHealthOrganizationEstimates]","[Liter]","[P1Y]","2000" +"188","Count_Person_15OrMoreYears_InLaborForce_Female_AsFractionOf_Count_Person_InLaborForce","[]","[]","[]","[P1Y]","1990" +"215","Count_Product_MobileCellularSubscription_AsFractionOf_Count_Person","[]","[]","[]","[P1Y]","1960" +"188","Count_Person_InLaborForce","[]","[InternationalLaborOrganization]","[]","[P1Y]","1990" +"186","Count_Death_IntentionalSelfHarm_AsFractionOf_Count_Person","[]","[]","[Per100000Persons]","[P1Y]","2000" +"197","Count_Death_0Years_AsFractionOf_Count_BirthEvent_LiveBirth","[]","[UnitedNationsIGMEEstimate]","[Per1000LiveBirths]","[P1Y]","1960" 
+"160","Count_Person_Upto4Years_Female_Wasting_AsFractionOf_Count_Person_Upto4Years_Female","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"203","Amount_Remittance_OutwardRemittance","[]","[WorldBankEstimate]","[USDollar]","[P1Y]","1970" +"160","Count_Person_Upto4Years_Female_Overweight_AsFractionOf_Count_Person_Upto4Years_Female","[]","[]","[]","[P1Y]","1986" +"214","Count_Person_IsInternetUser_PerCapita","[100]","[]","[]","[P1Y]","1990" +"210","Amount_Production_ElectricityFromNuclearSources_AsFractionOf_Amount_Production_Energy","[]","[]","[]","[P1Y]","1990" +"159","Count_Person_Upto4Years_Female_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Female","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"184","Count_Person_25OrMoreYears_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1970" +"210","Amount_Production_ElectricityFromOilGasOrCoalSources_AsFractionOf_Amount_Production_Energy","[]","[]","[]","[P1Y]","1990" +"218","GrowthRate_Count_Person","[]","[]","[]","[P1Y]","1961" +"213","Amount_Consumption_RenewableEnergy_AsFractionOf_Amount_Consumption_Energy","[]","[]","[]","[P1Y]","1990" +"104","Amount_Stock","[]","[]","[USDollar]","[P1Y]","1975" +"218","LifeExpectancy_Person","[]","[]","[Year]","[]","1960" diff --git a/scripts/world_bank/wdi/manifest.json b/scripts/world_bank/wdi/manifest.json index bc3927141e..eb427c0472 100644 --- a/scripts/world_bank/wdi/manifest.json +++ b/scripts/world_bank/wdi/manifest.json @@ -20,7 +20,8 @@ "WorldBankCountries.csv", "schema_csvs/WorldBankIndicators_prod.csv" ], - "cron_schedule": "0 11 * * 2" + "cron_schedule": "0 11 * * 2", + "validation_config_file": "validation_config.json" } ] } \ No newline at end of file diff --git a/scripts/world_bank/wdi/validation_config.json b/scripts/world_bank/wdi/validation_config.json new file mode 100644 index 0000000000..bb6d9edfbf --- /dev/null +++ b/scripts/world_bank/wdi/validation_config.json @@ -0,0 
+1,20 @@ +{ + "schema_version": "1.0", + "rules": [ + { + "rule_id": "check_deleted_records_percent", + "description": "Checks that the percentage of deleted points is within the threshold.", + "validator": "DELETED_RECORDS_PERCENT", + "params": { + "threshold": 0.1 + } + }, + { + "rule_id": "check_goldens_summary_report", + "validator": "GOLDENS_CHECK", + "params": { + "golden_files": "golden_data/golden_summary_report.csv" + } + } + ] +} \ No newline at end of file