// prepareRawGenomes provides a function to read and merge raw genomes into formatted genome maps which are ready to analyze. // Each raw genome comes from a genome file from a company such as 23andMe, AncestryDNA, etc... // Two combined genomes are created: OnlyExcludeConflicts and OnlyIncludeShared package prepareRawGenomes import "seekia/resources/geneticReferences/locusMetadata" import "seekia/internal/genetics/locusValue" import "seekia/internal/genetics/readRawGenomes" import "seekia/internal/helpers" import "errors" import "strings" type RawGenomeWithMetadata struct{ GenomeIdentifier [16]byte GenomeIsPhased bool RawGenomeMap map[int64]readRawGenomes.RawGenomeLocusValue } type GenomeWithMetadata struct{ // "Single"/"OnlyExcludeConflicts"/"OnlyIncludeShared" GenomeType string GenomeIdentifier [16]byte // Map Structure: RSID -> Locus base pair value object GenomeMap map[int64]locusValue.LocusValue } //Outputs: // -bool: Raw genome string is valid // -RawGenomeWithMetadata // -error (returns non-nil if called with invalid genomeIdentifier) func CreateRawGenomeWithMetadataObject(genomeIdentifier [16]byte, rawGenomeString string)(bool, RawGenomeWithMetadata, error){ rawDataReader := strings.NewReader(rawGenomeString) _, _, _, _, genomeIsPhased, rawGenomeMap, err := readRawGenomes.ReadRawGenomeFile(rawDataReader) if (err != nil) { // The raw genome is not valid return false, RawGenomeWithMetadata{}, nil } newRawGenomeWithMetadataObject := RawGenomeWithMetadata{ GenomeIdentifier: genomeIdentifier, GenomeIsPhased: genomeIsPhased, RawGenomeMap: rawGenomeMap, } return true, newRawGenomeWithMetadataObject, nil } // Function takes a map of raw genome strings and converts it into a map list containing a map for each genome // If there exists more than 1 genome, it also creates the two combined genomes: Only Include Shared/Only Exclude Conflicts // Inputs: // -[]RawGenomeWithMetadata // -func(int)error: Update Percentage Complete Function //Outputs: // -[]GenomeWithMetadata: Genomes with metadata list // -[][16]byte: All raw genome identifiers list (not including combined genomes) // -bool: Combined genomes exist // -[16]byte: Only exclude conflicts genome identifier // -[16]byte: Only include shared genome identifier // -error func GetGenomesWithMetadataListFromRawGenomesList(inputGenomesList []RawGenomeWithMetadata, updatePercentageCompleteFunction func(int)error)([]GenomeWithMetadata, [][16]byte, bool, [16]byte, [16]byte, error){ // The reading of genomes will take up the first 20% of the percentage range // The creation of multiple genomes will take up the last 80% of the percentage range // Structure: A list of genomes. // Each map stores a genome from a company or a combined genome. genomesWithMetadataList := make([]GenomeWithMetadata, 0) numberOfGenomesRead := 0 totalNumberOfGenomesToRead := len(inputGenomesList) allRawGenomeIdentifiersList := make([][16]byte, 0) for _, rawGenomeWithMetadataObject := range inputGenomesList{ newPercentageCompletion, err := helpers.ScaleNumberProportionally(true, numberOfGenomesRead, 0, totalNumberOfGenomesToRead, 0, 20) if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err } err = updatePercentageCompleteFunction(newPercentageCompletion) if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err } genomeIdentifier := rawGenomeWithMetadataObject.GenomeIdentifier genomeIsPhased := rawGenomeWithMetadataObject.GenomeIsPhased rawGenomeMap := rawGenomeWithMetadataObject.RawGenomeMap // Now we convert rawGenomeMap to a genomeMap // Map Structure: RSID -> Locus Value genomeMap := make(map[int64]locusValue.LocusValue) // We use this list to check for alias collisions later allRSIDsList := make([]int64, 0) for rsID, locusBasePairValue := range rawGenomeMap{ locusAllele2Exists := locusBasePairValue.Allele2Exists if (locusAllele2Exists == false){ // This SNP contains less than 2 bases // We don't support reading these kinds of SNP values yet continue } locusAllele1 := locusBasePairValue.Allele1 locusAllele2 := locusBasePairValue.Allele2 getLocusIsPhasedBool := func()bool{ if (locusAllele1 == locusAllele2){ // Locus has to be phased, because phase flip does not change value return true } return genomeIsPhased } locusIsPhased := getLocusIsPhasedBool() locusValueObject := locusValue.LocusValue{ LocusIsPhased: locusIsPhased, Base1Value: locusAllele1, Base2Value: locusAllele2, } genomeMap[rsID] = locusValueObject allRSIDsList = append(allRSIDsList, rsID) } // Now we check for rsID aliases // rsID aliases are multiple rsids that refer to the same location // If a single genome file contained two identical rsids whose value conflicted, the company must have made an error in reporting for _, rsID := range allRSIDsList{ rsidLocusValue, exists := genomeMap[rsID] if (exists == false){ // This must have been an alias that was deleted continue } aliasExists, rsidAliasesList, err := locusMetadata.GetRSIDAliases(rsID) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } if (aliasExists == false){ continue } checkIfRSIDCollisionExists := func()bool{ for _, rsidAlias := range rsidAliasesList{ aliasLocusValue, exists := genomeMap[rsidAlias] if (exists == false){ continue } if (aliasLocusValue != rsidLocusValue){ // A collision exists with an alias rsID // The company must be creating invalid results, or our alias list is invalid return true } } return false } rsidCollisionExists := checkIfRSIDCollisionExists() if (rsidCollisionExists == true){ // We will delete this rsID // We cannot trust any of the results delete(genomeMap, rsID) } // We delete all aliases from the genome map // We do this to save space for _, rsidAlias := range rsidAliasesList{ delete(genomeMap, rsidAlias) } } if (len(genomeMap) == 0){ // No valid locations exist continue } genomeWithMetadataObject := GenomeWithMetadata{ GenomeType: "Single", GenomeIdentifier: genomeIdentifier, GenomeMap: genomeMap, } genomesWithMetadataList = append(genomesWithMetadataList, genomeWithMetadataObject) allRawGenomeIdentifiersList = append(allRawGenomeIdentifiersList, genomeIdentifier) numberOfGenomesRead += 1 } containsDuplicates, _ := helpers.CheckIfListContainsDuplicates(allRawGenomeIdentifiersList) if (containsDuplicates == true){ return nil, nil, false, [16]byte{}, [16]byte{}, errors.New("GetGenomesWithMetadataListFromRawGenomesList called with inputGenomesList containing duplicate genomeIdentifiers.") } err := updatePercentageCompleteFunction(20) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } if (len(genomesWithMetadataList) == 1){ // Only 1 genome exists. // No genome combining is needed. err = updatePercentageCompleteFunction(100) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } return genomesWithMetadataList, allRawGenomeIdentifiersList, false, [16]byte{}, [16]byte{}, nil } // Now we create the shared genomes // The "OnlyExcludeConflicts" genome only excludes an SNP when there is disagreement about its value // The "OnlyIncludeShared" genome only includes SNPs that at least 2 genome files share. This will be the most reliable genome. // For both of the genomes, if more than 2 base pairs exist for the same rsid, the most reported SNP value will be used // For instance, if 3 genome files are imported, and 1 disagrees with the other two, the other two's snp value will be used // If there is a tie, the rsid will be omitted // This map stores all RSIDs across all genomes allRSIDsMap := make(map[int64]struct{}) finalIndex := len(genomesWithMetadataList) - 1 for index, genomeWithMetadataObject := range genomesWithMetadataList{ newPercentageCompletion, err := helpers.ScaleNumberProportionally(true, index, 0, finalIndex, 20, 50) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } err = updatePercentageCompleteFunction(newPercentageCompletion) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } genomeMap := genomeWithMetadataObject.GenomeMap for rsID, _ := range genomeMap{ allRSIDsMap[rsID] = struct{}{} } } onlyExcludeConflictsGenomeMap := make(map[int64]locusValue.LocusValue) onlyIncludeSharedGenomeMap := make(map[int64]locusValue.LocusValue) index := 0 finalIndex = len(allRSIDsMap) - 1 // This map stores the rsid aliases that we have already added // We will skip these, because we will have already dealt with them addedRSIDAliasesMap := make(map[int64]struct{}) for rsID, _ := range allRSIDsMap{ newPercentageCompletion, err := helpers.ScaleNumberProportionally(true, index, 0, finalIndex, 50, 100) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } err = updatePercentageCompleteFunction(newPercentageCompletion) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } index += 1 _, exists := addedRSIDAliasesMap[rsID] if (exists == true){ // This rsID is an alias for an rsid we already added continue } anyAliasesExist, rsidAliasesList, err := locusMetadata.GetRSIDAliases(rsID) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } if (anyAliasesExist == true){ for _, rsidAlias := range rsidAliasesList{ addedRSIDAliasesMap[rsidAlias] = struct{}{} } } // We first check to see which base pairs are most commonly reported, without any regard for the order (is phased status) // After doing this, we will see if we have enough information to determine if the phase is known locusValuesList := make([]locusValue.LocusValue, 0) for _, genomeWithMetadataObject := range genomesWithMetadataList{ genomeMap := genomeWithMetadataObject.GenomeMap locusValueObject, exists := genomeMap[rsID] if (exists == true){ locusValuesList = append(locusValuesList, locusValueObject) continue } // We did not find the rsid // We see if its alias exists if (anyAliasesExist == false){ continue } for _, rsidAlias := range rsidAliasesList{ aliasLocusValueObject, exists := genomeMap[rsidAlias] if (exists == true){ locusValuesList = append(locusValuesList, aliasLocusValueObject) // We only want to add 1 alias value to the locusValuesList // We already pruned the genomeMap of any multiple alias values, so searching further is unnecessary break } } } //TODO: Our current methods could be improved here. // Currently, we are sorting base pairs for genomes which report the phase as being known, and ones which do not // In a better system, we should have 3 categories we keep track of for each letter pair: // 1. AB (unknown phase) // 2. BA (known phase) // 3. AB (known phase) // We should develop a way to weigh these against each other to return the most accurate results // The way we do this should depend on how common phase-flips are compared to invalid base recordings // Basically, we want base-pair-order disagreements between genomes who are confident about phase to be accounted for // We use this map to store the sorted base pair counts // We sort the base pairs in unicode order // Map Structure: Sorted RSID Base pair value -> Number of genomes that contain the value sortedLocusBasePairCountsMap := make(map[string]int) for _, locusValueObject := range locusValuesList{ base1Value := locusValueObject.Base1Value base2Value := locusValueObject.Base2Value getSortedBasePairValue := func()string{ if (base1Value < base2Value){ result := base1Value + ";" + base2Value return result } result := base2Value + ";" + base1Value return result } sortedBasePairValue := getSortedBasePairValue() sortedLocusBasePairCountsMap[sortedBasePairValue] += 1 } // Now we figure out which sorted base pair value has the highest count mostRecordedSortedBasePair := "" mostRecordedSortedBasePairCount := 0 tieExists := false for basePair, basePairCount := range sortedLocusBasePairCountsMap{ if (basePairCount > mostRecordedSortedBasePairCount){ mostRecordedSortedBasePair = basePair mostRecordedSortedBasePairCount = basePairCount tieExists = false continue } if (basePairCount == mostRecordedSortedBasePairCount){ // There is a tie between differing basePair values tieExists = true continue } } if (tieExists == true){ // The most recorded sorted base pair value has a tie with a different base pair value // We will not add this locus to either combined map continue } // Now we determine the phase for the most recorded sorted base pair // The phase will be either known/unknown for each combined genome //Outputs: // -string: Locus Base 1 // -string: Locus Base 2 // -bool: Phase is known (Only exclude conflicts) // -bool: Phase is known (Only include shared) // -error getLocusBasePair := func()(string, string, bool, bool, error){ sortedBase1, sortedBase2, delimiterFound := strings.Cut(mostRecordedSortedBasePair, ";") if (delimiterFound == false){ return "", "", false, false, errors.New("mostRecordedSortedBasePair missing ; delimeter") } // We cycle through our locus objects to see how many report that the phase for this base pair is known // basePair1 == sortedBase1, sortedBase2 // basePair2 == sortedBase2, sortedBase1 // We count how many genomes report each base pair order as being the true phased order basePair1Count := 0 basePair2Count := 0 for _, locusObject := range locusValuesList{ locusIsPhased := locusObject.LocusIsPhased if (locusIsPhased == false){ // This genome does not claim to know the phase of this locus base pair. // We skip it. continue } locusBase1 := locusObject.Base1Value locusBase2 := locusObject.Base2Value if (locusBase1 == sortedBase1 && locusBase2 == sortedBase2){ basePair1Count += 1 } else if (locusBase1 == sortedBase2 && locusBase2 == sortedBase1){ basePair2Count += 1 } } if (basePair1Count == 0 && basePair2Count == 0){ // No phase is known for this locus for any of the genomes return sortedBase1, sortedBase2, false, false, nil } if (basePair1Count == basePair2Count){ // There is a tie. Thus, phase is unknown. return sortedBase1, sortedBase2, false, false, nil } if (basePair1Count > basePair2Count){ if (basePair1Count < 2){ // There are not enough phaseIsKnown advocate genomes to be able to say this phase // is known for the OnlyIncludeShared genome. return sortedBase1, sortedBase2, true, false, nil } return sortedBase1, sortedBase2, true, true, nil } // basePair1Count < basePair2Count if (basePair2Count < 2){ // There are not enough phaseIsKnown advocate genomes to be able to say this phase // is known for the OnlyIncludeShared genome. return sortedBase2, sortedBase1, true, false, nil } return sortedBase2, sortedBase1, true, true, nil } locusBase1, locusBase2, phaseIsKnown_OnlyExcludeConflicts, phaseIsKnown_OnlyIncludeShared, err := getLocusBasePair() if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } // Now we add to the combined genome maps // The OnlyExcludeConflicts will only omit when there is a tie // The OnlyIncludeShared requires at least 2 to agree getLocusIsPhased_OnlyExcludeConflicts := func()bool{ if (locusBase1 == locusBase2){ // These kinds of loci are always phased, becauses swapping the alleles changes nothing. return true } return phaseIsKnown_OnlyExcludeConflicts } locusIsPhased_OnlyExcludeConflicts := getLocusIsPhased_OnlyExcludeConflicts() onlyExcludeConflictsLocusValue := locusValue.LocusValue{ LocusIsPhased: locusIsPhased_OnlyExcludeConflicts, Base1Value: locusBase1, Base2Value: locusBase2, } onlyExcludeConflictsGenomeMap[rsID] = onlyExcludeConflictsLocusValue if (mostRecordedSortedBasePairCount >= 2){ getLocusIsPhased_OnlyIncludeShared := func()bool{ if (locusBase1 == locusBase2){ // These kinds of loci are always phased, becauses swapping the alleles changes nothing. return true } return phaseIsKnown_OnlyIncludeShared } locusIsPhased_OnlyIncludeShared := getLocusIsPhased_OnlyIncludeShared() onlyIncludeSharedLocusValue := locusValue.LocusValue{ LocusIsPhased: locusIsPhased_OnlyIncludeShared, Base1Value: locusBase1, Base2Value: locusBase2, } onlyIncludeSharedGenomeMap[rsID] = onlyIncludeSharedLocusValue } } onlyExcludeConflictsGenomeIdentifier, err := helpers.GetNewRandom16ByteArray() if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err } onlyExcludeConflictsGenomeWithMetadataObject := GenomeWithMetadata{ GenomeType: "OnlyExcludeConflicts", GenomeIdentifier: onlyExcludeConflictsGenomeIdentifier, GenomeMap: onlyExcludeConflictsGenomeMap, } onlyIncludeSharedGenomeIdentifier, err := helpers.GetNewRandom16ByteArray() if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err } onlyIncludeSharedGenomeWithMetadataObject := GenomeWithMetadata{ GenomeType: "OnlyIncludeShared", GenomeIdentifier: onlyIncludeSharedGenomeIdentifier, GenomeMap: onlyIncludeSharedGenomeMap, } genomesWithMetadataList = append(genomesWithMetadataList, onlyExcludeConflictsGenomeWithMetadataObject, onlyIncludeSharedGenomeWithMetadataObject) err = updatePercentageCompleteFunction(100) if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err } return genomesWithMetadataList, allRawGenomeIdentifiersList, true, onlyExcludeConflictsGenomeIdentifier, onlyIncludeSharedGenomeIdentifier, nil }