seekia/resources/geneticReferences/geneticReferences_test.go
2024-08-14 03:37:18 +00:00

639 lines
19 KiB
Go

// Package verifyGeneticReferences provides a test which checks that the genetic resources are valid and have no conflicts
package verifyGeneticReferences
// We check to make sure:
// 1. No identifier collisions exist
// 2. No disease/trait name collisions exist
// 3. The minimum and maximum risk weights for each polygenic disease locus are verified
// 4. Each identifier is the correct format (3 bytes encoded hex)
// Identifiers are 3 bytes/24 bits long (2^24 = 16,777,216 possible values), so each randomly generated identifier has at least a 1 in ~16.8 million chance of colliding with an existing identifier
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/resources/geneticReferences/monogenicDiseases"
import "seekia/resources/geneticReferences/polygenicDiseases"
import "seekia/resources/geneticReferences/traits"
import "seekia/internal/helpers"
import "seekia/internal/encoding"
import "testing"
import "strings"
import "slices"
// TestGeneticReferences checks the genetic reference data for internal consistency.
//
// It walks, in order, the monogenic diseases, the polygenic diseases, the traits and
// the locus metadata, and fails on the first problem it finds:
//   - Disease and trait names must be non-empty and unique within their own category.
//   - Monogenic variant identifiers and trait rule identifiers must decode with
//     encoding.DecodeHexStringTo3ByteArray and must be unique across both groups.
//     Trait rule locus identifiers are only checked for format, not for uniqueness.
//   - Bases must be one of A, G, C, T, I, D and base pairs must be two bases joined by ";".
//   - References maps may be empty, but may not contain an empty name or an empty link.
//   - Every rsID collected from the diseases and traits must have locus metadata, and every
//     locus metadata entry must have a primary (first) rsID which is used by a disease or trait.
//
// Failures are reported with t.Fatal rather than t.Fatalf because the messages are built by
// concatenation: passing them as a format string would misprint any "%" they contain and is
// rejected by the go vet printf check.
func TestGeneticReferences(t *testing.T) {

	// verifyIdentifier reports whether the identifier can be decoded by
	// encoding.DecodeHexStringTo3ByteArray.
	verifyIdentifier := func(inputIdentifier string) bool {
		_, err := encoding.DecodeHexStringTo3ByteArray(inputIdentifier)
		return err == nil
	}

	// verifyBase reports whether the input is one of the accepted base strings.
	// NOTE(review): "I" and "D" presumably stand for insertion and deletion - confirm.
	verifyBase := func(inputBase string) bool {
		switch inputBase {
		case "A", "G", "C", "T", "I", "D":
			return true
		}
		return false
	}

	// verifyBasePair reports whether the input is two valid bases separated by the first ";".
	verifyBasePair := func(inputBasePair string) bool {
		baseA, baseB, delimiterFound := strings.Cut(inputBasePair, ";")
		if !delimiterFound {
			return false
		}
		return verifyBase(baseA) && verifyBase(baseB)
	}

	// verifyReferencesMap reports whether every reference has a non-empty name and link.
	// An empty (or nil) map is valid.
	verifyReferencesMap := func(inputReferencesMap map[string]string) bool {
		for referenceName, referenceLink := range inputReferencesMap {
			if referenceName == "" || referenceLink == "" {
				return false
			}
		}
		return true
	}

	//
	// Monogenic diseases
	//

	monogenicDiseases.InitializeMonogenicDiseaseVariables()

	monogenicDiseasesObjectsList, err := monogenicDiseases.GetMonogenicDiseaseObjectsList()
	if err != nil {
		t.Fatal("Failed to get monogenic disease objects list: " + err.Error())
	}

	// We use this map to make sure all RSIDs have metadata in locusMetadata
	allRSIDsMap := make(map[int64]struct{})

	// This map is shared between monogenic disease variants and trait rules,
	// so an identifier collision between the two groups is also detected.
	allIdentifiersMap := make(map[string]struct{})

	monogenicDiseaseNamesMap := make(map[string]struct{})

	for _, diseaseObject := range monogenicDiseasesObjectsList {

		diseaseName := diseaseObject.DiseaseName
		diseaseGeneName := diseaseObject.GeneName
		dominantOrRecessive := diseaseObject.DominantOrRecessive
		variantsList := diseaseObject.VariantsList
		diseaseReferencesMap := diseaseObject.References

		if diseaseName == "" {
			t.Fatal("Monogenic Disease name is empty.")
		}
		if _, exists := monogenicDiseaseNamesMap[diseaseName]; exists {
			t.Fatal("Monogenic Disease name collision found: " + diseaseName)
		}
		monogenicDiseaseNamesMap[diseaseName] = struct{}{}

		// Monogenic disease names cannot contain underscores
		// This is because when we encode monogenic disease names in user profiles, we replace the whitespace with underscores
		// We have to be able to reliably undo this
		if strings.Contains(diseaseName, "_") {
			t.Fatal("Monogenic Disease name contains underscore: " + diseaseName)
		}

		if diseaseGeneName == "" {
			t.Fatal("Monogenic Disease gene name is empty: " + diseaseName)
		}
		if dominantOrRecessive != "Dominant" && dominantOrRecessive != "Recessive" {
			t.Fatal("Monogenic Disease dominantOrRecessive is invalid: " + diseaseName)
		}
		if !verifyReferencesMap(diseaseReferencesMap) {
			t.Fatal("Monogenic Disease references are invalid: " + diseaseName)
		}
		if len(variantsList) == 0 {
			t.Fatal("Monogenic Disease contains no variants: " + diseaseName)
		}

		for _, variantObject := range variantsList {

			variantIdentifier := variantObject.VariantIdentifier
			variantRSID := variantObject.VariantRSID
			variantNamesList := variantObject.VariantNames
			variantHealthyBase := variantObject.HealthyBase
			variantDefectiveBase := variantObject.DefectiveBase
			variantReferences := variantObject.References

			allRSIDsMap[variantRSID] = struct{}{}

			if !verifyIdentifier(variantIdentifier) {
				t.Fatal(diseaseName + " Invalid variant identifier found: " + variantIdentifier)
			}
			if _, exists := allIdentifiersMap[variantIdentifier]; exists {
				t.Fatal(diseaseName + " Duplicate variant identifier found: " + variantIdentifier)
			}
			allIdentifiersMap[variantIdentifier] = struct{}{}

			if len(variantNamesList) == 0 {
				t.Fatal("Variant names list is empty: " + variantIdentifier)
			}
			for _, variantName := range variantNamesList {
				if variantName == "" {
					t.Fatal("Variant name is empty: " + variantIdentifier)
				}
			}

			if !verifyBase(variantHealthyBase) || !verifyBase(variantDefectiveBase) {
				t.Fatal(diseaseName + " Invalid healthy/defective base found: " + variantIdentifier)
			}
			if variantHealthyBase == variantDefectiveBase {
				t.Fatal(diseaseName + " Identical healthy/defective bases found: " + variantIdentifier)
			}

			if !verifyReferencesMap(variantReferences) {
				t.Fatal("Disease variant references map is invalid: " + variantIdentifier)
			}
		}
	}

	//
	// Polygenic diseases
	//

	err = polygenicDiseases.InitializePolygenicDiseaseVariables()
	if err != nil {
		t.Fatal("InitializePolygenicDiseaseVariables failed: " + err.Error())
	}

	polygenicDiseaseObjectsList, err := polygenicDiseases.GetPolygenicDiseaseObjectsList()
	if err != nil {
		t.Fatal("Failed to get polygenicDisease objects list: " + err.Error())
	}

	polygenicDiseaseNamesMap := make(map[string]struct{})

	for _, diseaseObject := range polygenicDiseaseObjectsList {

		diseaseName := diseaseObject.DiseaseName
		diseaseDescription := diseaseObject.DiseaseDescription
		diseaseEffectedSex := diseaseObject.EffectedSex
		diseaseLocusReferencesMap := diseaseObject.LocusReferencesMap
		diseaseLociList := diseaseObject.LociList
		diseaseReferencesMap := diseaseObject.References

		if diseaseName == "" {
			t.Fatal("PolygenicDisease name is empty.")
		}
		if _, exists := polygenicDiseaseNamesMap[diseaseName]; exists {
			t.Fatal("PolygenicDisease name collision found: " + diseaseName)
		}
		polygenicDiseaseNamesMap[diseaseName] = struct{}{}

		if diseaseDescription == "" {
			t.Fatal("PolygenicDisease description is empty for disease: " + diseaseName)
		}
		if diseaseEffectedSex != "Male" && diseaseEffectedSex != "Female" && diseaseEffectedSex != "Both" {
			t.Fatal("PolygenicDisease effected sex is invalid: " + diseaseEffectedSex)
		}

		// Every locus with references must be one of the disease's loci.
		for rsID, referencesMap := range diseaseLocusReferencesMap {

			if !slices.Contains(diseaseLociList, rsID) {
				t.Fatal("Polygenic disease diseaseLocusReferencesMap contains disease locus that is not inside of the disease's loci list.")
			}

			allRSIDsMap[rsID] = struct{}{}

			if !verifyReferencesMap(referencesMap) {
				t.Fatal("PolygenicDisease references map is invalid for disease locus.")
			}
		}

		// Only the boolean result is needed here.
		// NOTE(review): the discarded second return value is presumably the duplicated item - confirm.
		containsDuplicates, _ := helpers.CheckIfListContainsDuplicates(diseaseLociList)
		if containsDuplicates {
			t.Fatal("Polygenic disease object contains diseaseLociList with duplicate rsIDs.")
		}

		if len(diseaseLocusReferencesMap) > len(diseaseLociList) {
			t.Fatal("Polygenic disease contains locus references map that is longer than the diseaseLociList")
		}

		if !verifyReferencesMap(diseaseReferencesMap) {
			t.Fatal("PolygenicDisease references map is invalid for disease: " + diseaseName)
		}
	}

	//
	// Traits
	//

	err = traits.InitializeTraitVariables()
	if err != nil {
		t.Fatal("InitializeTraitVariables failed: " + err.Error())
	}

	traitObjectsList, err := traits.GetTraitObjectsList()
	if err != nil {
		t.Fatal("Failed to get trait objects list: " + err.Error())
	}

	traitNamesMap := make(map[string]struct{})

	for _, traitObject := range traitObjectsList {

		traitName := traitObject.TraitName
		traitDescription := traitObject.TraitDescription
		traitDiscreteOrNumeric := traitObject.DiscreteOrNumeric
		traitLocusReferencesMap := traitObject.LocusReferencesMap
		traitLociList := traitObject.LociList
		traitLociList_Rules := traitObject.LociList_Rules
		traitRulesList := traitObject.RulesList
		traitOutcomesList := traitObject.OutcomesList
		traitReferencesMap := traitObject.ReferencesMap

		if traitName == "" {
			t.Fatal("Empty trait name exists.")
		}
		if _, exists := traitNamesMap[traitName]; exists {
			t.Fatal("Duplicate trait name exists: " + traitName)
		}
		traitNamesMap[traitName] = struct{}{}

		if traitDescription == "" {
			t.Fatal("Empty trait description exists for trait: " + traitName)
		}
		if traitDiscreteOrNumeric != "Discrete" && traitDiscreteOrNumeric != "Numeric" {
			t.Fatal("Invalid DiscreteOrNumeric for trait: " + traitDiscreteOrNumeric)
		}

		if len(traitOutcomesList) != 0 {
			// A trait with outcomes needs at least two of them.
			if len(traitOutcomesList) < 2 {
				t.Fatal("Not enough trait outcomes for trait: " + traitName)
			}
			for _, traitOutcome := range traitOutcomesList {
				if traitOutcome == "" {
					t.Fatal("Empty trait outcome exists for trait: " + traitName)
				}
			}
		} else if len(traitRulesList) != 0 {
			// If there are no outcomes, then no rules can exist
			t.Fatal("Trait outcomes list is empty, trait rules list is not.")
		}

		if !verifyReferencesMap(traitReferencesMap) {
			t.Fatal("Invalid references exist for trait: " + traitName)
		}

		if len(traitLocusReferencesMap) == 0 {
			t.Fatal("No trait locus references exist for trait: " + traitName)
		}

		for locusRSID, locusReferences := range traitLocusReferencesMap {

			allRSIDsMap[locusRSID] = struct{}{}

			// The nil and empty cases are kept separate so that each keeps its own message.
			if locusReferences == nil {
				t.Fatal("A trait locus has no references map: " + traitName)
			}
			if len(locusReferences) == 0 {
				t.Fatal("A trait locus has no references: " + traitName)
			}

			if !slices.Contains(traitLociList, locusRSID) {
				t.Fatal("traitLocusReferencesMap contains rsID which does not exist in traitLociList")
			}
		}

		if len(traitLociList) == 0 {
			t.Fatal("No trait loci exist for trait: " + traitName)
		}
		for _, rsID := range traitLociList {
			allRSIDsMap[rsID] = struct{}{}
		}

		// The rule loci must be a subset of the trait loci.
		for _, rsID := range traitLociList_Rules {
			if !slices.Contains(traitLociList, rsID) {
				t.Fatal("traitLociList_Rules contains locus not present in traitLociList")
			}
		}

		// If no rules exist, this loop does nothing.
		for _, ruleObject := range traitRulesList {

			ruleIdentifier := ruleObject.RuleIdentifier
			ruleLociList := ruleObject.LociList
			ruleOutcomePointsMap := ruleObject.OutcomePointsMap
			ruleReferencesMap := ruleObject.ReferencesMap

			if !verifyIdentifier(ruleIdentifier) {
				t.Fatal("Invalid identifier exists: " + ruleIdentifier)
			}
			if _, exists := allIdentifiersMap[ruleIdentifier]; exists {
				t.Fatal("Duplicate identifier exists: " + ruleIdentifier)
			}
			allIdentifiersMap[ruleIdentifier] = struct{}{}

			if len(ruleOutcomePointsMap) == 0 {
				t.Fatal("Rule contains empty rule outcome points map: " + ruleIdentifier)
			}
			for outcomeName := range ruleOutcomePointsMap {
				if !slices.Contains(traitOutcomesList, outcomeName) {
					t.Fatal("Rule outcome points map contains invalid outcome: " + outcomeName)
				}
			}

			if len(ruleLociList) == 0 {
				t.Fatal("Rule contains empty rule loci list: " + ruleIdentifier)
			}

			for _, locusObject := range ruleLociList {

				locusIdentifier := locusObject.LocusIdentifier
				locusRSID := locusObject.LocusRSID
				locusBasePairsList := locusObject.BasePairsList

				allRSIDsMap[locusRSID] = struct{}{}

				if !verifyIdentifier(locusIdentifier) {
					t.Fatal("Trait rule Locus identifier is invalid: " + locusIdentifier)
				}

				// The rule locus must be known to all three of the trait's locus collections.
				if _, mapContainsItem := traitLocusReferencesMap[locusRSID]; !mapContainsItem {
					t.Fatal("Rule locus contains rsid which is not contained within LocusReferencesMap.")
				}
				if !slices.Contains(traitLociList, locusRSID) {
					t.Fatal("Rule locus contains rsid which is not contained within traitLociList.")
				}
				if !slices.Contains(traitLociList_Rules, locusRSID) {
					t.Fatal("Rule locus contains rsid which is not contained within traitLociList_Rules.")
				}

				if len(locusBasePairsList) == 0 {
					t.Fatal("Trait rule locus base pairs list is empty: " + locusIdentifier)
				}
				for _, locusBasePair := range locusBasePairsList {
					if !verifyBasePair(locusBasePair) {
						t.Fatal("Rule Locus base pairs list contains invalid base pair: " + locusBasePair)
					}
				}
			}

			if !verifyReferencesMap(ruleReferencesMap) {
				t.Fatal("Invalid references map for trait rule locus: " + ruleIdentifier)
			}
		}
	}

	//
	// Locus metadata
	//

	err = locusMetadata.InitializeLocusMetadataVariables()
	if err != nil {
		t.Fatal("Failed to initialize locus metadata variables: " + err.Error())
	}

	locusMetadataObjectsList, err := locusMetadata.GetLocusMetadataObjectsList()
	if err != nil {
		t.Fatal("GetLocusMetadataObjectsList failed: " + err.Error())
	}

	// We use the locusPositionsMap to make sure there are no locations that refer to the same position on the same chromosome
	type locusPositionStruct struct {
		chromosome int
		position   int
	}

	locusPositionsMap := make(map[locusPositionStruct]struct{})

	// We use the companyAliasesMap to make sure there are no company alias collisions.
	//
	// We only care about alias collisions within each company.
	// Multiple companies can refer to the same location with the same alias.
	type companyAliasStruct struct {
		geneticsCompany locusMetadata.GeneticsCompany
		locusAlias      string
	}

	companyAliasesMap := make(map[companyAliasStruct]struct{})

	// We use this map to make sure that locus metadata rsIDs do not collide.
	// We don't want any duplicate rsIDs within any of the loci.
	locusMetadataRSIDsMap := make(map[int64]struct{})

	for _, locusMetadataObject := range locusMetadataObjectsList {

		rsidsList := locusMetadataObject.RSIDsList
		locusChromosome := locusMetadataObject.Chromosome
		locusPosition := locusMetadataObject.Position
		geneInfoIsKnown := locusMetadataObject.GeneInfoIsKnown
		geneExists := locusMetadataObject.GeneExists
		geneNamesList := locusMetadataObject.GeneNamesList
		locusCompanyAliasesMap := locusMetadataObject.CompanyAliases
		referencesMap := locusMetadataObject.References

		if len(rsidsList) == 0 {
			t.Fatal("locusMetadataObjectsList contains locus with empty RSIDs list.")
		}

		// The primary RSID is the only rsID which should appear in the genetic references
		// The primary RSID is the first rsID in the locus rsIDs list
		primaryRSID := rsidsList[0]

		if _, exists := allRSIDsMap[primaryRSID]; !exists {
			t.Fatal("locusMetadataObjectsList contains unnecessary locus: No matching rsids exist.")
		}

		for index, rsID := range rsidsList {

			if _, exists := locusMetadataRSIDsMap[rsID]; exists {
				rsidString := helpers.ConvertInt64ToString(rsID)
				t.Fatal("locusMetadataObjectsList contains duplicate RSID: " + rsidString)
			}
			locusMetadataRSIDsMap[rsID] = struct{}{}

			if index == 0 {
				// This is the primary rsID, which was checked above.
				continue
			}

			// This is not a primary rsID, so the genetic references must not use it.
			if _, exists := allRSIDsMap[rsID]; exists {
				rsIDString := helpers.ConvertInt64ToString(rsID)
				t.Fatal("allRSIDsMap contains non-primary rsID: " + rsIDString)
			}
		}

		if locusChromosome == 0 {
			// 0 is uninitialized.
			t.Fatal("locusMetadataObjectsList contains locus with 0 chromosome.")
		}
		if locusPosition == 0 {
			// 0 is uninitialized.
			t.Fatal("locusMetadataObjectsList contains locus with 0 position.")
		}

		locusPositionObject := locusPositionStruct{
			chromosome: locusChromosome,
			position:   locusPosition,
		}
		if _, exists := locusPositionsMap[locusPositionObject]; exists {
			t.Fatal("locusMetadataObjectsList contains locus position collision.")
		}
		locusPositionsMap[locusPositionObject] = struct{}{}

		// Gene names are only required when the locus is known to lie within a gene.
		if geneInfoIsKnown && geneExists {
			if len(geneNamesList) == 0 {
				t.Fatal("locusMetadataObjectsList contains locus with known gene and empty geneNamesList.")
			}
			for _, geneName := range geneNamesList {
				if geneName == "" {
					t.Fatal("locusMetadataObjectsList contains locus with empty geneName in geneNamesList.")
				}
			}
		}

		for companyObject, companyAliasesList := range locusCompanyAliasesMap {
			for _, locusCompanyAlias := range companyAliasesList {

				companyAliasObject := companyAliasStruct{
					geneticsCompany: companyObject,
					locusAlias:      locusCompanyAlias,
				}
				if _, exists := companyAliasesMap[companyAliasObject]; exists {
					t.Fatal("locusMetadataObjectsList contains companyAlias collision: " + locusCompanyAlias)
				}
				companyAliasesMap[companyAliasObject] = struct{}{}
			}
		}

		if !verifyReferencesMap(referencesMap) {
			t.Fatal("locusMetadataObjectsList contains invalid references map.")
		}
	}

	//TODO: Check to make sure that there are no identical company aliases for different loci
	// NOTE(review): companyAliasesMap above is shared by all loci, so it already appears to
	// catch identical aliases for different loci within one company - confirm whether this TODO is stale.

	// Every rsID used by a disease or trait must have locus metadata.
	missingLociList := make([]int64, 0)
	for rsID := range allRSIDsMap {
		if _, exists := locusMetadataRSIDsMap[rsID]; !exists {
			missingLociList = append(missingLociList, rsID)
		}
	}

	if len(missingLociList) != 0 {

		// Map iteration order is random, so we sort to keep the failure message stable between runs.
		slices.Sort(missingLociList)

		missingLociStringsList := make([]string, 0, len(missingLociList))
		for _, rsID := range missingLociList {
			rsIDString := helpers.ConvertInt64ToString(rsID)
			missingLociStringsList = append(missingLociStringsList, rsIDString)
		}

		missingLociListFormatted := strings.Join(missingLociStringsList, ", ")

		t.Fatal("locusMetadata is missing loci: " + missingLociListFormatted)
	}
}
/*
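// This test is disabled. It uses log, which is not imported by this file, so the import must be added before re-enabling it.
// We use this to determine the greatest number of variants that any single monogenic disease has
// The corresponding value in profileFormat needs to be updated whenever a new monogenic disease is added which exceeds it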
func TestGetHighestPossibleMonogenicDiseaseVariantCount(t *testing.T){
monogenicDiseases.InitializeMonogenicDiseaseVariables()
monogenicDiseasesObjectsList, err := monogenicDiseases.GetMonogenicDiseaseObjectsList()
if (err != nil){
t.Fatalf("Failed to get monogenic disease objects list: " + err.Error())
}
highestCount := 0
for _, diseaseObject := range monogenicDiseasesObjectsList{
diseaseVariantsList := diseaseObject.VariantsList
diseaseNumberOfVariants := len(diseaseVariantsList)
if (diseaseNumberOfVariants > highestCount){
highestCount = diseaseNumberOfVariants
}
}
highestVariantCountString := helpers.ConvertIntToString(highestCount)
log.Println("Most monogenic disease variants: " + highestVariantCountString)
}
*/