2024-04-11 15:51:56 +02:00
// prepareRawGenomes provides a function to read and merge raw genomes into formatted genome maps which are ready to analyze.
// Each raw genome comes from a genome file from a company such as 23andMe, AncestryDNA, etc...
// Two combined genomes are created: OnlyExcludeConflicts and OnlyIncludeShared
package prepareRawGenomes
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/internal/genetics/locusValue"
import "seekia/internal/genetics/readRawGenomes"
import "seekia/internal/helpers"
import "errors"
import "strings"
type RawGenomeWithMetadata struct {
2024-06-02 10:43:39 +02:00
GenomeIdentifier [ 16 ] byte
2024-04-11 15:51:56 +02:00
GenomeIsPhased bool
RawGenomeMap map [ int64 ] readRawGenomes . RawGenomeLocusValue
}
type GenomeWithMetadata struct {
// "Single"/"OnlyExcludeConflicts"/"OnlyIncludeShared"
GenomeType string
2024-06-02 10:43:39 +02:00
GenomeIdentifier [ 16 ] byte
2024-04-11 15:51:56 +02:00
// Map Structure: RSID -> Locus base pair value object
GenomeMap map [ int64 ] locusValue . LocusValue
}
//Outputs:
// -bool: Raw genome string is valid
// -RawGenomeWithMetadata
// -error (returns non-nil if called with invalid genomeIdentifier)
2024-06-02 10:43:39 +02:00
func CreateRawGenomeWithMetadataObject ( genomeIdentifier [ 16 ] byte , rawGenomeString string ) ( bool , RawGenomeWithMetadata , error ) {
2024-04-11 15:51:56 +02:00
rawDataReader := strings . NewReader ( rawGenomeString )
_ , _ , _ , _ , genomeIsPhased , rawGenomeMap , err := readRawGenomes . ReadRawGenomeFile ( rawDataReader )
if ( err != nil ) {
// The raw genome is not valid
return false , RawGenomeWithMetadata { } , nil
}
newRawGenomeWithMetadataObject := RawGenomeWithMetadata {
GenomeIdentifier : genomeIdentifier ,
GenomeIsPhased : genomeIsPhased ,
RawGenomeMap : rawGenomeMap ,
}
return true , newRawGenomeWithMetadataObject , nil
}
// Function takes a map of raw genome strings and converts it into a map list containing a map for each genome
// If there exists more than 1 genome, it also creates the two combined genomes: Only Include Shared/Only Exclude Conflicts
// Inputs:
// -[]RawGenomeWithMetadata
// -func(int)error: Update Percentage Complete Function
//Outputs:
// -[]GenomeWithMetadata: Genomes with metadata list
2024-06-02 10:43:39 +02:00
// -[][16]byte: All raw genome identifiers list (not including combined genomes)
2024-04-11 15:51:56 +02:00
// -bool: Combined genomes exist
2024-06-02 10:43:39 +02:00
// -[16]byte: Only exclude conflicts genome identifier
// -[16]byte: Only include shared genome identifier
2024-04-11 15:51:56 +02:00
// -error
2024-06-02 10:43:39 +02:00
func GetGenomesWithMetadataListFromRawGenomesList ( inputGenomesList [ ] RawGenomeWithMetadata , updatePercentageCompleteFunction func ( int ) error ) ( [ ] GenomeWithMetadata , [ ] [ 16 ] byte , bool , [ 16 ] byte , [ 16 ] byte , error ) {
2024-04-11 15:51:56 +02:00
// The reading of genomes will take up the first 20% of the percentage range
// The creation of multiple genomes will take up the last 80% of the percentage range
// Structure: A list of genomes.
2024-06-02 10:43:39 +02:00
// Each map stores a genome from a company or a combined genome.
2024-04-11 15:51:56 +02:00
genomesWithMetadataList := make ( [ ] GenomeWithMetadata , 0 )
numberOfGenomesRead := 0
totalNumberOfGenomesToRead := len ( inputGenomesList )
2024-06-02 10:43:39 +02:00
allRawGenomeIdentifiersList := make ( [ ] [ 16 ] byte , 0 )
2024-04-11 15:51:56 +02:00
for _ , rawGenomeWithMetadataObject := range inputGenomesList {
newPercentageCompletion , err := helpers . ScaleNumberProportionally ( true , numberOfGenomesRead , 0 , totalNumberOfGenomesToRead , 0 , 20 )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
err = updatePercentageCompleteFunction ( newPercentageCompletion )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
genomeIdentifier := rawGenomeWithMetadataObject . GenomeIdentifier
genomeIsPhased := rawGenomeWithMetadataObject . GenomeIsPhased
rawGenomeMap := rawGenomeWithMetadataObject . RawGenomeMap
// Now we convert rawGenomeMap to a genomeMap
// Map Structure: RSID -> Locus Value
genomeMap := make ( map [ int64 ] locusValue . LocusValue )
// We use this list to check for alias collisions later
allRSIDsList := make ( [ ] int64 , 0 )
for rsID , locusBasePairValue := range rawGenomeMap {
locusAllele2Exists := locusBasePairValue . Allele2Exists
if ( locusAllele2Exists == false ) {
// This SNP contains less than 2 bases
// We don't support reading these kinds of SNP values yet
continue
}
locusAllele1 := locusBasePairValue . Allele1
locusAllele2 := locusBasePairValue . Allele2
2024-07-05 23:01:43 +02:00
getLocusIsPhasedBool := func ( ) bool {
if ( locusAllele1 == locusAllele2 ) {
// Locus has to be phased, because phase flip does not change value
return true
}
return genomeIsPhased
}
locusIsPhased := getLocusIsPhasedBool ( )
2024-04-11 15:51:56 +02:00
locusValueObject := locusValue . LocusValue {
2024-07-05 23:01:43 +02:00
LocusIsPhased : locusIsPhased ,
2024-04-11 15:51:56 +02:00
Base1Value : locusAllele1 ,
Base2Value : locusAllele2 ,
}
genomeMap [ rsID ] = locusValueObject
allRSIDsList = append ( allRSIDsList , rsID )
}
// Now we check for rsID aliases
// rsID aliases are multiple rsids that refer to the same location
// If a single genome file contained two identical rsids whose value conflicted, the company must have made an error in reporting
for _ , rsID := range allRSIDsList {
rsidLocusValue , exists := genomeMap [ rsID ]
if ( exists == false ) {
// This must have been an alias that was deleted
continue
}
aliasExists , rsidAliasesList , err := locusMetadata . GetRSIDAliases ( rsID )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
if ( aliasExists == false ) {
continue
}
checkIfRSIDCollisionExists := func ( ) bool {
for _ , rsidAlias := range rsidAliasesList {
aliasLocusValue , exists := genomeMap [ rsidAlias ]
if ( exists == false ) {
continue
}
if ( aliasLocusValue != rsidLocusValue ) {
// A collision exists with an alias rsID
// The company must be creating invalid results, or our alias list is invalid
return true
}
}
return false
}
rsidCollisionExists := checkIfRSIDCollisionExists ( )
if ( rsidCollisionExists == true ) {
// We will delete this rsID
// We cannot trust any of the results
delete ( genomeMap , rsID )
}
// We delete all aliases from the genome map
// We do this to save space
for _ , rsidAlias := range rsidAliasesList {
delete ( genomeMap , rsidAlias )
}
}
if ( len ( genomeMap ) == 0 ) {
// No valid locations exist
continue
}
genomeWithMetadataObject := GenomeWithMetadata {
GenomeType : "Single" ,
GenomeIdentifier : genomeIdentifier ,
GenomeMap : genomeMap ,
}
genomesWithMetadataList = append ( genomesWithMetadataList , genomeWithMetadataObject )
allRawGenomeIdentifiersList = append ( allRawGenomeIdentifiersList , genomeIdentifier )
numberOfGenomesRead += 1
}
containsDuplicates , _ := helpers . CheckIfListContainsDuplicates ( allRawGenomeIdentifiersList )
if ( containsDuplicates == true ) {
2024-06-02 10:43:39 +02:00
return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , errors . New ( "GetGenomesWithMetadataListFromRawGenomesList called with inputGenomesList containing duplicate genomeIdentifiers." )
2024-04-11 15:51:56 +02:00
}
err := updatePercentageCompleteFunction ( 20 )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
if ( len ( genomesWithMetadataList ) == 1 ) {
// Only 1 genome exists.
// No genome combining is needed.
err = updatePercentageCompleteFunction ( 100 )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
2024-06-02 10:43:39 +02:00
return genomesWithMetadataList , allRawGenomeIdentifiersList , false , [ 16 ] byte { } , [ 16 ] byte { } , nil
2024-04-11 15:51:56 +02:00
}
// Now we create the shared genomes
// The "OnlyExcludeConflicts" genome only excludes an SNP when there is disagreement about its value
// The "OnlyIncludeShared" genome only includes SNPs that at least 2 genome files share. This will be the most reliable genome.
// For both of the genomes, if more than 2 base pairs exist for the same rsid, the most reported SNP value will be used
// For instance, if 3 genome files are imported, and 1 disagrees with the other two, the other two's snp value will be used
// If there is a tie, the rsid will be omitted
// This map stores all RSIDs across all genomes
allRSIDsMap := make ( map [ int64 ] struct { } )
finalIndex := len ( genomesWithMetadataList ) - 1
for index , genomeWithMetadataObject := range genomesWithMetadataList {
newPercentageCompletion , err := helpers . ScaleNumberProportionally ( true , index , 0 , finalIndex , 20 , 50 )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
err = updatePercentageCompleteFunction ( newPercentageCompletion )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
genomeMap := genomeWithMetadataObject . GenomeMap
for rsID , _ := range genomeMap {
allRSIDsMap [ rsID ] = struct { } { }
}
}
onlyExcludeConflictsGenomeMap := make ( map [ int64 ] locusValue . LocusValue )
onlyIncludeSharedGenomeMap := make ( map [ int64 ] locusValue . LocusValue )
index := 0
finalIndex = len ( allRSIDsMap ) - 1
// This map stores the rsid aliases that we have already added
// We will skip these, because we will have already dealt with them
addedRSIDAliasesMap := make ( map [ int64 ] struct { } )
for rsID , _ := range allRSIDsMap {
newPercentageCompletion , err := helpers . ScaleNumberProportionally ( true , index , 0 , finalIndex , 50 , 100 )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
err = updatePercentageCompleteFunction ( newPercentageCompletion )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
index += 1
_ , exists := addedRSIDAliasesMap [ rsID ]
if ( exists == true ) {
// This rsID is an alias for an rsid we already added
continue
}
anyAliasesExist , rsidAliasesList , err := locusMetadata . GetRSIDAliases ( rsID )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
if ( anyAliasesExist == true ) {
for _ , rsidAlias := range rsidAliasesList {
addedRSIDAliasesMap [ rsidAlias ] = struct { } { }
}
}
// We first check to see which base pairs are most commonly reported, without any regard for the order (is phased status)
// After doing this, we will see if we have enough information to determine if the phase is known
locusValuesList := make ( [ ] locusValue . LocusValue , 0 )
for _ , genomeWithMetadataObject := range genomesWithMetadataList {
genomeMap := genomeWithMetadataObject . GenomeMap
locusValueObject , exists := genomeMap [ rsID ]
if ( exists == true ) {
locusValuesList = append ( locusValuesList , locusValueObject )
continue
}
// We did not find the rsid
// We see if its alias exists
if ( anyAliasesExist == false ) {
continue
}
for _ , rsidAlias := range rsidAliasesList {
aliasLocusValueObject , exists := genomeMap [ rsidAlias ]
if ( exists == true ) {
locusValuesList = append ( locusValuesList , aliasLocusValueObject )
// We only want to add 1 alias value to the locusValuesList
// We already pruned the genomeMap of any multiple alias values, so searching further is unnecessary
break
}
}
}
//TODO: Our current methods could be improved here.
// Currently, we are sorting base pairs for genomes which report the phase as being known, and ones which do not
// In a better system, we should have 3 categories we keep track of for each letter pair:
// 1. AB (unknown phase)
// 2. BA (known phase)
// 3. AB (known phase)
// We should develop a way to weigh these against each other to return the most accurate results
// The way we do this should depend on how common phase-flips are compared to invalid base recordings
// Basically, we want base-pair-order disagreements between genomes who are confident about phase to be accounted for
// We use this map to store the sorted base pair counts
// We sort the base pairs in unicode order
// Map Structure: Sorted RSID Base pair value -> Number of genomes that contain the value
sortedLocusBasePairCountsMap := make ( map [ string ] int )
for _ , locusValueObject := range locusValuesList {
base1Value := locusValueObject . Base1Value
base2Value := locusValueObject . Base2Value
getSortedBasePairValue := func ( ) string {
if ( base1Value < base2Value ) {
result := base1Value + ";" + base2Value
return result
}
result := base2Value + ";" + base1Value
return result
}
sortedBasePairValue := getSortedBasePairValue ( )
sortedLocusBasePairCountsMap [ sortedBasePairValue ] += 1
}
// Now we figure out which sorted base pair value has the highest count
mostRecordedSortedBasePair := ""
mostRecordedSortedBasePairCount := 0
tieExists := false
for basePair , basePairCount := range sortedLocusBasePairCountsMap {
if ( basePairCount > mostRecordedSortedBasePairCount ) {
mostRecordedSortedBasePair = basePair
mostRecordedSortedBasePairCount = basePairCount
tieExists = false
continue
}
if ( basePairCount == mostRecordedSortedBasePairCount ) {
// There is a tie between differing basePair values
tieExists = true
continue
}
}
if ( tieExists == true ) {
// The most recorded sorted base pair value has a tie with a different base pair value
// We will not add this locus to either combined map
continue
}
// Now we determine the phase for the most recorded sorted base pair
// The phase will be either known/unknown for each combined genome
//Outputs:
// -string: Locus Base 1
// -string: Locus Base 2
// -bool: Phase is known (Only exclude conflicts)
// -bool: Phase is known (Only include shared)
// -error
getLocusBasePair := func ( ) ( string , string , bool , bool , error ) {
sortedBase1 , sortedBase2 , delimiterFound := strings . Cut ( mostRecordedSortedBasePair , ";" )
if ( delimiterFound == false ) {
return "" , "" , false , false , errors . New ( "mostRecordedSortedBasePair missing ; delimeter" )
}
// We cycle through our locus objects to see how many report that the phase for this base pair is known
// basePair1 == sortedBase1, sortedBase2
// basePair2 == sortedBase2, sortedBase1
// We count how many genomes report each base pair order as being the true phased order
basePair1Count := 0
basePair2Count := 0
for _ , locusObject := range locusValuesList {
locusIsPhased := locusObject . LocusIsPhased
if ( locusIsPhased == false ) {
// This genome does not claim to know the phase of this locus base pair.
// We skip it.
continue
}
locusBase1 := locusObject . Base1Value
locusBase2 := locusObject . Base2Value
if ( locusBase1 == sortedBase1 && locusBase2 == sortedBase2 ) {
basePair1Count += 1
} else if ( locusBase1 == sortedBase2 && locusBase2 == sortedBase1 ) {
basePair2Count += 1
}
}
if ( basePair1Count == 0 && basePair2Count == 0 ) {
// No phase is known for this locus for any of the genomes
return sortedBase1 , sortedBase2 , false , false , nil
}
if ( basePair1Count == basePair2Count ) {
// There is a tie. Thus, phase is unknown.
return sortedBase1 , sortedBase2 , false , false , nil
}
if ( basePair1Count > basePair2Count ) {
if ( basePair1Count < 2 ) {
// There are not enough phaseIsKnown advocate genomes to be able to say this phase
// is known for the OnlyIncludeShared genome.
return sortedBase1 , sortedBase2 , true , false , nil
}
return sortedBase1 , sortedBase2 , true , true , nil
}
// basePair1Count < basePair2Count
if ( basePair2Count < 2 ) {
// There are not enough phaseIsKnown advocate genomes to be able to say this phase
// is known for the OnlyIncludeShared genome.
return sortedBase2 , sortedBase1 , true , false , nil
}
return sortedBase2 , sortedBase1 , true , true , nil
}
locusBase1 , locusBase2 , phaseIsKnown_OnlyExcludeConflicts , phaseIsKnown_OnlyIncludeShared , err := getLocusBasePair ( )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
// Now we add to the combined genome maps
// The OnlyExcludeConflicts will only omit when there is a tie
// The OnlyIncludeShared requires at least 2 to agree
2024-07-05 23:01:43 +02:00
getLocusIsPhased_OnlyExcludeConflicts := func ( ) bool {
if ( locusBase1 == locusBase2 ) {
// These kinds of loci are always phased, becauses swapping the alleles changes nothing.
return true
}
return phaseIsKnown_OnlyExcludeConflicts
}
locusIsPhased_OnlyExcludeConflicts := getLocusIsPhased_OnlyExcludeConflicts ( )
2024-04-11 15:51:56 +02:00
onlyExcludeConflictsLocusValue := locusValue . LocusValue {
2024-07-05 23:01:43 +02:00
LocusIsPhased : locusIsPhased_OnlyExcludeConflicts ,
2024-04-11 15:51:56 +02:00
Base1Value : locusBase1 ,
Base2Value : locusBase2 ,
}
onlyExcludeConflictsGenomeMap [ rsID ] = onlyExcludeConflictsLocusValue
if ( mostRecordedSortedBasePairCount >= 2 ) {
2024-07-05 23:01:43 +02:00
getLocusIsPhased_OnlyIncludeShared := func ( ) bool {
if ( locusBase1 == locusBase2 ) {
// These kinds of loci are always phased, becauses swapping the alleles changes nothing.
return true
}
return phaseIsKnown_OnlyIncludeShared
}
locusIsPhased_OnlyIncludeShared := getLocusIsPhased_OnlyIncludeShared ( )
2024-04-11 15:51:56 +02:00
onlyIncludeSharedLocusValue := locusValue . LocusValue {
2024-07-05 23:01:43 +02:00
LocusIsPhased : locusIsPhased_OnlyIncludeShared ,
2024-04-11 15:51:56 +02:00
Base1Value : locusBase1 ,
Base2Value : locusBase2 ,
}
onlyIncludeSharedGenomeMap [ rsID ] = onlyIncludeSharedLocusValue
}
}
2024-06-02 10:43:39 +02:00
onlyExcludeConflictsGenomeIdentifier , err := helpers . GetNewRandom16ByteArray ( )
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
onlyExcludeConflictsGenomeWithMetadataObject := GenomeWithMetadata {
GenomeType : "OnlyExcludeConflicts" ,
GenomeIdentifier : onlyExcludeConflictsGenomeIdentifier ,
GenomeMap : onlyExcludeConflictsGenomeMap ,
}
2024-06-02 10:43:39 +02:00
onlyIncludeSharedGenomeIdentifier , err := helpers . GetNewRandom16ByteArray ( )
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
onlyIncludeSharedGenomeWithMetadataObject := GenomeWithMetadata {
GenomeType : "OnlyIncludeShared" ,
GenomeIdentifier : onlyIncludeSharedGenomeIdentifier ,
GenomeMap : onlyIncludeSharedGenomeMap ,
}
genomesWithMetadataList = append ( genomesWithMetadataList , onlyExcludeConflictsGenomeWithMetadataObject , onlyIncludeSharedGenomeWithMetadataObject )
err = updatePercentageCompleteFunction ( 100 )
2024-06-02 10:43:39 +02:00
if ( err != nil ) { return nil , nil , false , [ 16 ] byte { } , [ 16 ] byte { } , err }
2024-04-11 15:51:56 +02:00
return genomesWithMetadataList , allRawGenomeIdentifiersList , true , onlyExcludeConflictsGenomeIdentifier , onlyIncludeSharedGenomeIdentifier , nil
}