// verifyGeneticReferences provides functions to run a check to make sure the genetic resources are valid and have no conflicts package verifyGeneticReferences // We check to make sure: // 1. No identifier collisions exist // 2. No disease/trait name collisions exist // 4. Verifies the minimum and maximum risk weights for each polygenic disease locus // 5. Each identifier is the correct format (3 bytes encoded hex) // Identifiers are 3 bytes/24 bits long, so there is at least a 1 in 16 million chance that two will collide when generating them randomly import "seekia/resources/geneticReferences/locusMetadata" import "seekia/resources/geneticReferences/monogenicDiseases" import "seekia/resources/geneticReferences/polygenicDiseases" import "seekia/resources/geneticReferences/traits" import "seekia/internal/helpers" import "seekia/internal/encoding" import "testing" import "strings" import "slices" func TestGeneticReferences(t *testing.T){ verifyIdentifier := func(inputIdentifier string)bool{ decodedBytes, err := encoding.DecodeHexStringToBytes(inputIdentifier) if (err != nil) { return false } if (len(decodedBytes) != 3){ return false } return true } verifyBase := func(inputBase string)bool{ if (inputBase != "A" && inputBase != "G" && inputBase != "C" && inputBase != "T" && inputBase != "I" && inputBase != "D"){ return false } return true } verifyBasePair := func(inputBasePair string)bool{ baseA, baseB, delimiterFound := strings.Cut(inputBasePair, ";") if (delimiterFound == false){ return false } baseIsValid := verifyBase(baseA) if (baseIsValid == false){ return false } baseIsValid = verifyBase(baseB) if (baseIsValid == false){ return false } return true } verifyReferencesMap := func(inputReferencesMap map[string]string)bool{ if (len(inputReferencesMap) == 0){ return true } for referenceName, referenceLink := range inputReferencesMap{ if (referenceName == ""){ return false } if (referenceLink == ""){ return false } } return true } monogenicDiseases.InitializeMonogenicDiseaseVariables() monogenicDiseasesObjectsList, err := monogenicDiseases.GetMonogenicDiseaseObjectsList() if (err != nil){ t.Fatalf("Failed to get monogenic disease objects list: " + err.Error()) } // We use this map to make sure all RSIDs have metadata in locusMetadata allRSIDsMap := make(map[int64]struct{}) allIdentifiersMap := make(map[string]struct{}) monogenicDiseaseNamesMap := make(map[string]struct{}) for _, diseaseObject := range monogenicDiseasesObjectsList{ diseaseName := diseaseObject.DiseaseName diseaseGeneName := diseaseObject.GeneName dominantOrRecessive := diseaseObject.DominantOrRecessive variantsList := diseaseObject.VariantsList diseaseReferencesMap := diseaseObject.References if (diseaseName == ""){ t.Fatalf("Monogenic Disease name is empty.") } _, exists := monogenicDiseaseNamesMap[diseaseName] if (exists == true){ t.Fatalf("Monogenic Disease name collision found: " + diseaseName) } monogenicDiseaseNamesMap[diseaseName] = struct{}{} // Monogenic disease names cannot contain underscores // This is because when we encode monogenic disease names in user profiles, we replace the whitespace with underscores // We have to be able to reliably undo this containsUnderscore := strings.Contains(diseaseName, "_") if (containsUnderscore == true){ t.Fatalf("Monogenic Disease name contains underscore: " + diseaseName) } if (diseaseGeneName == ""){ t.Fatalf("Monogenic Disease gene name is empty: " + diseaseName) } if (dominantOrRecessive != "Dominant" && dominantOrRecessive != "Recessive"){ t.Fatalf("Monogenic Disease dominantOrRecessive is invalid: " + diseaseName) } referencesAreValid := verifyReferencesMap(diseaseReferencesMap) if (referencesAreValid == false){ t.Fatalf("Monogenic Disease references are invalid: " + diseaseName) } if (len(variantsList) == 0){ t.Fatalf("Monogenic Disease contains no variants: " + diseaseName) } for _, variantObject := range variantsList{ variantIdentifier := variantObject.VariantIdentifier variantRSID := variantObject.VariantRSID variantNamesList := variantObject.VariantNames variantHealthyBase := variantObject.HealthyBase variantDefectiveBase := variantObject.DefectiveBase variantReferences := variantObject.References allRSIDsMap[variantRSID] = struct{}{} identifierIsValid := verifyIdentifier(variantIdentifier) if (identifierIsValid == false){ t.Fatalf(diseaseName + " Invalid variant identifier found: " + variantIdentifier) } _, exists := allIdentifiersMap[variantIdentifier] if (exists == true){ t.Fatalf(diseaseName + " Duplicate variant identifier found: " + variantIdentifier) } allIdentifiersMap[variantIdentifier] = struct{}{} if (len(variantNamesList) == 0){ t.Fatalf("Variant names list is empty: " + variantIdentifier) } for _, variantName := range variantNamesList{ if (variantName == ""){ t.Fatalf("Variant name is empty: " + variantIdentifier) } } healthyBaseIsValid := verifyBase(variantHealthyBase) defectiveBaseIsValid := verifyBase(variantDefectiveBase) if (healthyBaseIsValid == false || defectiveBaseIsValid == false){ t.Fatalf(diseaseName + " Invalid healthy/defective base found: " + variantIdentifier) } if (variantHealthyBase == variantDefectiveBase){ t.Fatalf(diseaseName + " Identical healthy/defective bases found: " + variantIdentifier) } referencesAreValid := verifyReferencesMap(variantReferences) if (referencesAreValid == false){ t.Fatalf("Disease variant references map is invalid: " + variantIdentifier) } } } polygenicDiseases.InitializePolygenicDiseaseVariables() polygenicDiseaseObjectsList, err := polygenicDiseases.GetPolygenicDiseaseObjectsList() if (err != nil) { t.Fatalf("Failed to get polygenicDisease objects list: " + err.Error()) } polygenicDiseaseNamesMap := make(map[string]struct{}) for _, diseaseObject := range polygenicDiseaseObjectsList{ diseaseName := diseaseObject.DiseaseName diseaseDescription := diseaseObject.DiseaseDescription diseaseEffectedSex := diseaseObject.EffectedSex diseaseLociList := diseaseObject.LociList diseaseReferencesMap := diseaseObject.References if (diseaseName == ""){ t.Fatalf("PolygenicDisease name is empty.") } _, exists := polygenicDiseaseNamesMap[diseaseName] if (exists == true){ t.Fatalf("PolygenicDisease name collision found: " + diseaseName) } polygenicDiseaseNamesMap[diseaseName] = struct{}{} if (diseaseDescription == ""){ t.Fatalf("PolygenicDisease description is empty for disease: " + diseaseName) } if (diseaseEffectedSex != "Male" && diseaseEffectedSex != "Female" && diseaseEffectedSex != "Both"){ t.Fatalf("PolygenicDisease effected sex is invalid: " + diseaseEffectedSex) } referencesAreValid := verifyReferencesMap(diseaseReferencesMap) if (referencesAreValid == false){ t.Fatalf("PolygenicDisease references map is invalid for disease: " + diseaseName) } // We use this map to make sure each disease locus references a unique rsid allPolygenicDiseaseRSIDsMap := make(map[int64]struct{}) for _, locusObject := range diseaseLociList{ locusIdentifier := locusObject.LocusIdentifier locusRSID := locusObject.LocusRSID riskWeightsMap := locusObject.RiskWeightsMap oddsRatiosMap := locusObject.OddsRatiosMap minimumWeight := locusObject.MinimumRiskWeight maximumWeight := locusObject.MaximumRiskWeight allRSIDsMap[locusRSID] = struct{}{} identifierIsValid := verifyIdentifier(locusIdentifier) if (identifierIsValid == false){ t.Fatalf(diseaseName + " Invalid locus identifier found: " + locusIdentifier) } _, exists := allIdentifiersMap[locusIdentifier] if (exists == true){ t.Fatalf(diseaseName + " Duplicate locus identifier found: " + locusIdentifier) } allIdentifiersMap[locusIdentifier] = struct{}{} _, exists = allPolygenicDiseaseRSIDsMap[locusRSID] if (exists == true){ rsidString := helpers.ConvertInt64ToString(locusRSID) t.Fatalf(diseaseName + " RSID Collision found: " + rsidString) } allPolygenicDiseaseRSIDsMap[locusRSID] = struct{}{} if (len(riskWeightsMap) == 0){ t.Fatalf("Empty base weights map found: " + locusIdentifier) } trueMinimumWeight := 100000 trueMaximumWeight := -100000 for basePair, basePairWeight := range riskWeightsMap{ isValid := verifyBasePair(basePair) if (isValid == false){ t.Fatalf("Base pair weights map contains invalid base pair: " + locusIdentifier) } if (basePairWeight < trueMinimumWeight){ trueMinimumWeight = basePairWeight } if (basePairWeight > trueMaximumWeight){ trueMaximumWeight = basePairWeight } } if (trueMinimumWeight != minimumWeight){ t.Fatalf(diseaseName + ": Invalid minimum base pair weight found: " + locusIdentifier) } if (trueMaximumWeight != maximumWeight){ t.Fatalf(diseaseName + ": Invalid maximum base pair weight found: " + locusIdentifier) } for basePair, _ := range oddsRatiosMap{ isValid := verifyBasePair(basePair) if (isValid == false){ t.Fatalf("Odds ratio weights map contains invalid base pair: " + locusIdentifier) } } //TODO: Make sure that duplicate base pairs have same weight, odds ratios and probabilities } } traits.InitializeTraitVariables() traitObjectsList, err := traits.GetTraitObjectsList() if (err != nil){ t.Fatalf("Failed to get trait objects list: " + err.Error()) } traitNamesMap := make(map[string]struct{}) for _, traitObject := range traitObjectsList{ traitName := traitObject.TraitName traitDescription := traitObject.TraitDescription traitLociList := traitObject.LociList traitRulesList := traitObject.RulesList traitOutcomesList := traitObject.OutcomesList traitReferencesMap := traitObject.References if (traitName == ""){ t.Fatalf("Empty trait name exists.") } _, exists := traitNamesMap[traitName] if (exists == true){ t.Fatalf("Duplicate trait name exists: " + traitName) } traitNamesMap[traitName] = struct{}{} if (traitDescription == ""){ t.Fatalf("Empty trait description exists for trait: " + traitName) } if (len(traitOutcomesList) != 0){ if (len(traitOutcomesList) < 2){ t.Fatalf("Not enough trait outcomes for trait: " + traitName) } for _, traitOutcome := range traitOutcomesList{ if (traitOutcome == ""){ t.Fatalf("Empty trait outcome exists for trait: " + traitName) } } } else { // If there are no outcomes, then no rules can exist if (len(traitRulesList) != 0){ t.Fatalf("Trait outcomes list is empty, trait rules list is not.") } } referencesAreValid := verifyReferencesMap(traitReferencesMap) if (referencesAreValid == false){ t.Fatalf("Invalid references exist for trait: " + traitName) } if (len(traitLociList) == 0){ t.Fatalf("No trait loci exist for trait: " + traitName) } for _, locusRSID := range traitLociList{ allRSIDsMap[locusRSID] = struct{}{} } containsDuplicates, duplicateLocus := helpers.CheckIfListContainsDuplicates(traitLociList) if (containsDuplicates == true){ duplicateLocusString := helpers.ConvertInt64ToString(duplicateLocus) t.Fatalf("traitLociList contains duplicates for trait: " + traitName + ". RSID: " + duplicateLocusString) } if (len(traitRulesList) == 0){ // No rules exist. continue } for _, ruleObject := range traitRulesList{ ruleIdentifier := ruleObject.RuleIdentifier ruleLociList := ruleObject.LociList ruleOutcomePointsMap := ruleObject.OutcomePointsMap ruleReferences := ruleObject.References identifierIsValid := verifyIdentifier(ruleIdentifier) if (identifierIsValid == false){ t.Fatalf("Invalid identifier exists: " + ruleIdentifier) } _, exists := allIdentifiersMap[ruleIdentifier] if (exists == true){ t.Fatalf("Duplicate identifier exists: " + ruleIdentifier) } allIdentifiersMap[ruleIdentifier] = struct{}{} if (len(ruleOutcomePointsMap) == 0){ t.Fatalf("Rule contains empty rule outcome points map: " + ruleIdentifier) } for outcomeName, _ := range ruleOutcomePointsMap{ isValid := slices.Contains(traitOutcomesList, outcomeName) if (isValid == false){ t.Fatalf("Rule outcome points map contains invalid outcome: " + outcomeName) } } if (len(ruleLociList) == 0){ t.Fatalf("Rule contains empty rule loci list: " + ruleIdentifier) } for _, locusObject := range ruleLociList{ locusIdentifier := locusObject.LocusIdentifier locusRSID := locusObject.LocusRSID locusBasePairsList := locusObject.BasePairsList allRSIDsMap[locusRSID] = struct{}{} isValid := verifyIdentifier(locusIdentifier) if (isValid == false){ t.Fatalf("Trait rule Locus identifier is invalid: " + locusIdentifier) } listContainsItem := slices.Contains(traitLociList, locusRSID) if (listContainsItem == false){ t.Fatalf("Rule locus contains rsid which is not contained within traitLociList.") } if (len(locusBasePairsList) == 0){ t.Fatalf("Trait rule locus base pairs list is empty: " + locusIdentifier) } for _, locusBasePair := range locusBasePairsList{ basePairIsValid := verifyBasePair(locusBasePair) if (basePairIsValid == false){ t.Fatalf("Rule Locus base pairs list contains invalid base pair: " + locusBasePair) } } } referencesAreValid := verifyReferencesMap(ruleReferences) if (referencesAreValid == false){ t.Fatalf("Invalid references map for trait rule locus: " + ruleIdentifier) } } } err = locusMetadata.InitializeLocusMetadataVariables() if (err != nil){ t.Fatalf("Failed to initialize locus metadata variables: " + err.Error()) } locusMetadataObjectsList, err := locusMetadata.GetLocusMetadataObjectsList() if (err != nil){ t.Fatalf("GetLocusMetadataObjectsList failed: " + err.Error()) } // We use the locusPositionsMap to make sure there are no locations that refer to the same position on the same chromosome type locusPositionStruct struct{ chromosome int position int } locusPositionsMap := make(map[locusPositionStruct]struct{}) // We use the companyAliasesMap to make sure there are no company alias collisions. // // We only care about alias collisions within each company. // Multiple companies can refer to the same location with the same alias. // type companyAliasStruct struct{ geneticsCompany locusMetadata.GeneticsCompany locusAlias string } companyAliasesMap := make(map[companyAliasStruct]struct{}) // We use this map to make sure that locus metadata rsIDs do not collide. // We don't want any duplicate rsIDs within any of the loci. locusMetadataRSIDsMap := make(map[int64]struct{}) for _, locusMetadataObject := range locusMetadataObjectsList{ rsidsList := locusMetadataObject.RSIDsList locusChromosome := locusMetadataObject.Chromosome locusPosition := locusMetadataObject.Position geneNamesList := locusMetadataObject.GeneNamesList locusCompanyAliasesMap := locusMetadataObject.CompanyAliases referencesMap := locusMetadataObject.References if (len(rsidsList) == 0){ t.Fatalf("locusMetadataObjectsList contains locus with empty RSIDs list.") } // The primary RSID is the only rsID which should appear in the genetic references // The primary RSID is the first rsID in the locus rsIDs list primaryRSID := rsidsList[0] _, exists := allRSIDsMap[primaryRSID] if (exists == false){ t.Fatalf("locusMetadataObjectsList contains unnecessary locus: No matching rsids exist.") } for index, rsID := range rsidsList{ _, exists := locusMetadataRSIDsMap[rsID] if (exists == true){ RSIDString := helpers.ConvertInt64ToString(rsID) t.Fatalf("locusMetadataObjectsList contains duplicate RSID: " + RSIDString) } locusMetadataRSIDsMap[rsID] = struct{}{} if (index != 0){ // This is not a primary rsID _, exists = allRSIDsMap[rsID] if (exists == true){ rsIDString := helpers.ConvertInt64ToString(rsID) t.Fatalf("allRSIDsMap contains non-primary rsID: " + rsIDString) } } } if (locusChromosome == 0){ // 0 is uninitialized. t.Fatalf("locusMetadataObjectsList contains locus with 0 chromosome.") } if (locusPosition == 0){ // 0 is uninitialized. t.Fatalf("locusMetadataObjectsList contains locus with 0 position.") } locusPositionObject := locusPositionStruct{ chromosome: locusChromosome, position: locusPosition, } _, exists = locusPositionsMap[locusPositionObject] if (exists == true){ t.Fatalf("locusMetadataObjectsList contains locus position collision.") } locusPositionsMap[locusPositionObject] = struct{}{} if (len(geneNamesList) != 0){ for _, geneName := range geneNamesList{ if (geneName == ""){ t.Fatalf("locusMetadataObjectsList contains locus with empty geneName in geneNamesList.") } } } for companyObject, companyAliasesList := range locusCompanyAliasesMap{ for _, locusCompanyAlias := range companyAliasesList{ companyAliasObject := companyAliasStruct{ geneticsCompany: companyObject, locusAlias: locusCompanyAlias, } _, exists := companyAliasesMap[companyAliasObject] if (exists == true){ t.Fatalf("locusMetadataObjectsList contains companyAlias collision: " + locusCompanyAlias) } companyAliasesMap[companyAliasObject] = struct{}{} } } isValid := verifyReferencesMap(referencesMap) if (isValid == false){ t.Fatalf("locusMetadataObjectsList contains invalid references map.") } } missingLociList := make([]int64, 0) for rsID, _ := range allRSIDsMap{ _, exists := locusMetadataRSIDsMap[rsID] if (exists == false){ missingLociList = append(missingLociList, rsID) } } if (len(missingLociList) != 0){ missingLociStringsList := make([]string, 0, len(missingLociList)) for _, rsID := range missingLociList{ rsIDString := helpers.ConvertInt64ToString(rsID) missingLociStringsList = append(missingLociStringsList, rsIDString) } missingLociListFormatted := strings.Join(missingLociStringsList, ", ") t.Fatalf("locusMetadata is missing loci: " + missingLociListFormatted) } } /* // We use this to determine the greatest possible number of variants tested // This needs to be updated in profileFormat whenever a new monogenic disease is added which exceeds this value func TestGetHighestPossibleMonogenicDiseaseVariantCount(t *testing.T){ monogenicDiseases.InitializeMonogenicDiseaseVariables() monogenicDiseasesObjectsList, err := monogenicDiseases.GetMonogenicDiseaseObjectsList() if (err != nil){ t.Fatalf("Failed to get monogenic disease objects list: " + err.Error()) } highestCount := 0 for _, diseaseObject := range monogenicDiseasesObjectsList{ diseaseVariantsList := diseaseObject.VariantsList diseaseNumberOfVariants := len(diseaseVariantsList) if (diseaseNumberOfVariants > highestCount){ highestCount = diseaseNumberOfVariants } } highestVariantCountString := helpers.ConvertIntToString(highestCount) log.Println("Most monogenic disease variants: " + highestVariantCountString) } */