seekia/internal/genetics/prepareRawGenomes/prepareRawGenomes.go

575 lines
18 KiB
Go
Raw Normal View History

// prepareRawGenomes provides a function to read and merge raw genomes into formatted genome maps which are ready to analyze.
// Each raw genome comes from a genome file from a company such as 23andMe, AncestryDNA, etc...
// Two combined genomes are created: OnlyExcludeConflicts and OnlyIncludeShared
package prepareRawGenomes
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/internal/genetics/locusValue"
import "seekia/internal/genetics/readRawGenomes"
import "seekia/internal/helpers"
import "errors"
import "strings"
type RawGenomeWithMetadata struct{
GenomeIdentifier [16]byte
GenomeIsPhased bool
RawGenomeMap map[int64]readRawGenomes.RawGenomeLocusValue
}
type GenomeWithMetadata struct{
// "Single"/"OnlyExcludeConflicts"/"OnlyIncludeShared"
GenomeType string
GenomeIdentifier [16]byte
// Map Structure: RSID -> Locus base pair value object
GenomeMap map[int64]locusValue.LocusValue
}
//Outputs:
// -bool: Raw genome string is valid
// -RawGenomeWithMetadata
// -error (returns non-nil if called with invalid genomeIdentifier)
func CreateRawGenomeWithMetadataObject(genomeIdentifier [16]byte, rawGenomeString string)(bool, RawGenomeWithMetadata, error){
rawDataReader := strings.NewReader(rawGenomeString)
_, _, _, _, genomeIsPhased, rawGenomeMap, err := readRawGenomes.ReadRawGenomeFile(rawDataReader)
if (err != nil) {
// The raw genome is not valid
return false, RawGenomeWithMetadata{}, nil
}
newRawGenomeWithMetadataObject := RawGenomeWithMetadata{
GenomeIdentifier: genomeIdentifier,
GenomeIsPhased: genomeIsPhased,
RawGenomeMap: rawGenomeMap,
}
return true, newRawGenomeWithMetadataObject, nil
}
// Function takes a map of raw genome strings and converts it into a map list containing a map for each genome
// If there exists more than 1 genome, it also creates the two combined genomes: Only Include Shared/Only Exclude Conflicts
// Inputs:
// -[]RawGenomeWithMetadata
// -func(int)error: Update Percentage Complete Function
//Outputs:
// -bool: Any useful locations exist in any of the provided genomes
// -[]GenomeWithMetadata: Genomes with metadata list
// -[][16]byte: All raw genome identifiers list (not including combined genomes)
// -bool: Combined genomes exist
// -[16]byte: Only exclude conflicts genome identifier
// -[16]byte: Only include shared genome identifier
// -error
func GetGenomesWithMetadataListFromRawGenomesList(inputGenomesList []RawGenomeWithMetadata, updatePercentageCompleteFunction func(int)error)(bool, []GenomeWithMetadata, [][16]byte, bool, [16]byte, [16]byte, error){
if (len(inputGenomesList) == 0){
return false, nil, nil, false, [16]byte{}, [16]byte{}, errors.New("GetGenomesWithMetadataListFromRawGenomesList called with empty inputGenomesList")
}
// The reading of genomes will take up the first 20% of the percentage range
// The creation of multiple genomes will take up the last 80% of the percentage range
// Structure: A list of genomes.
// Each map stores a genome from a company or a combined genome.
genomesWithMetadataList := make([]GenomeWithMetadata, 0)
finalIndex := len(inputGenomesList) - 1
allRawGenomeIdentifiersList := make([][16]byte, 0)
for index, rawGenomeWithMetadataObject := range inputGenomesList{
newPercentageCompletion, err := helpers.ScaleIntProportionally(true, index, 0, finalIndex, 0, 20)
if (err != nil) { return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
err = updatePercentageCompleteFunction(newPercentageCompletion)
if (err != nil) { return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
genomeIdentifier := rawGenomeWithMetadataObject.GenomeIdentifier
genomeIsPhased := rawGenomeWithMetadataObject.GenomeIsPhased
rawGenomeMap := rawGenomeWithMetadataObject.RawGenomeMap
// Now we convert rawGenomeMap to a genomeMap
anyValuesExist, genomeMap, err := ConvertRawGenomeToGenomeMap(rawGenomeMap, genomeIsPhased)
if (err != nil) { return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
if (anyValuesExist == false){
// This genome is not useful
// No useful locations exist
continue
}
genomeWithMetadataObject := GenomeWithMetadata{
GenomeType: "Single",
GenomeIdentifier: genomeIdentifier,
GenomeMap: genomeMap,
}
genomesWithMetadataList = append(genomesWithMetadataList, genomeWithMetadataObject)
allRawGenomeIdentifiersList = append(allRawGenomeIdentifiersList, genomeIdentifier)
}
if (len(genomesWithMetadataList) == 0){
// None of the provided genomes contained any useful locations
return false, nil, nil, false, [16]byte{}, [16]byte{}, nil
}
containsDuplicates, _ := helpers.CheckIfListContainsDuplicates(allRawGenomeIdentifiersList)
if (containsDuplicates == true){
return false, nil, nil, false, [16]byte{}, [16]byte{}, errors.New("GetGenomesWithMetadataListFromRawGenomesList called with inputGenomesList containing duplicate genomeIdentifiers.")
}
err := updatePercentageCompleteFunction(20)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
if (len(genomesWithMetadataList) <= 1){
// <=1 genome exists.
// No genome combining is needed.
err = updatePercentageCompleteFunction(100)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
return true, genomesWithMetadataList, allRawGenomeIdentifiersList, false, [16]byte{}, [16]byte{}, nil
}
// Now we create the shared genomes
// The "OnlyExcludeConflicts" genome only excludes an SNP when there is disagreement about its value
// The "OnlyIncludeShared" genome only includes SNPs that at least 2 genome files share. This will be the most reliable genome.
// For both of the genomes, if more than 2 base pairs exist for the same rsid, the most reported SNP value will be used
// For instance, if 3 genome files are imported, and 1 disagrees with the other two, the other two's snp value will be used
// If there is a tie, the rsid will be omitted
// This map stores all RSIDs across all genomes
allRSIDsMap := make(map[int64]struct{})
finalIndex = len(genomesWithMetadataList) - 1
for index, genomeWithMetadataObject := range genomesWithMetadataList{
newPercentageCompletion, err := helpers.ScaleIntProportionally(true, index, 0, finalIndex, 20, 50)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
err = updatePercentageCompleteFunction(newPercentageCompletion)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
genomeMap := genomeWithMetadataObject.GenomeMap
for rsID, _ := range genomeMap{
allRSIDsMap[rsID] = struct{}{}
}
}
onlyExcludeConflictsGenomeMap := make(map[int64]locusValue.LocusValue)
onlyIncludeSharedGenomeMap := make(map[int64]locusValue.LocusValue)
index := 0
finalIndex = len(allRSIDsMap) - 1
// This map stores the rsid aliases that we have already added
// We will skip these, because we will have already dealt with them
addedRSIDAliasesMap := make(map[int64]struct{})
for rsID, _ := range allRSIDsMap{
newPercentageCompletion, err := helpers.ScaleIntProportionally(true, index, 0, finalIndex, 50, 100)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
err = updatePercentageCompleteFunction(newPercentageCompletion)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
index += 1
_, exists := addedRSIDAliasesMap[rsID]
if (exists == true){
// This rsID is an alias for an rsid we already added
continue
}
anyAliasesExist, rsidAliasesList, err := locusMetadata.GetRSIDAliases(rsID)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
if (anyAliasesExist == true){
for _, rsidAlias := range rsidAliasesList{
addedRSIDAliasesMap[rsidAlias] = struct{}{}
}
}
// We first check to see which base pairs are most commonly reported, without any regard for the order (is phased status)
// After doing this, we will see if we have enough information to determine if the phase is known
locusValuesList := make([]locusValue.LocusValue, 0)
for _, genomeWithMetadataObject := range genomesWithMetadataList{
genomeMap := genomeWithMetadataObject.GenomeMap
locusValueObject, exists := genomeMap[rsID]
if (exists == true){
locusValuesList = append(locusValuesList, locusValueObject)
continue
}
// We did not find the rsid
// We see if its alias exists
if (anyAliasesExist == false){
continue
}
for _, rsidAlias := range rsidAliasesList{
aliasLocusValueObject, exists := genomeMap[rsidAlias]
if (exists == true){
locusValuesList = append(locusValuesList, aliasLocusValueObject)
// We only want to add 1 alias value to the locusValuesList
// We already pruned the genomeMap of any multiple alias values, so searching further is unnecessary
break
}
}
}
//TODO: Our current methods could be improved here.
// Currently, we are sorting base pairs for genomes which report the phase as being known, and ones which do not
// In a better system, we should have 3 categories we keep track of for each letter pair:
// 1. AB (unknown phase)
// 2. BA (known phase)
// 3. AB (known phase)
// We should develop a way to weigh these against each other to return the most accurate results
// The way we do this should depend on how common phase-flips are compared to invalid base recordings
// Basically, we want base-pair-order disagreements between genomes who are confident about phase to be accounted for
// We use this map to store the sorted base pair counts
// We sort the base pairs in unicode order
// Map Structure: Sorted RSID Base pair value -> Number of genomes that contain the value
sortedLocusBasePairCountsMap := make(map[string]int)
for _, locusValueObject := range locusValuesList{
base1Value := locusValueObject.Base1Value
base2Value := locusValueObject.Base2Value
getSortedBasePairValue := func()string{
if (base1Value < base2Value){
result := base1Value + ";" + base2Value
return result
}
result := base2Value + ";" + base1Value
return result
}
sortedBasePairValue := getSortedBasePairValue()
sortedLocusBasePairCountsMap[sortedBasePairValue] += 1
}
// Now we figure out which sorted base pair value has the highest count
mostRecordedSortedBasePair := ""
mostRecordedSortedBasePairCount := 0
tieExists := false
for basePair, basePairCount := range sortedLocusBasePairCountsMap{
if (basePairCount > mostRecordedSortedBasePairCount){
mostRecordedSortedBasePair = basePair
mostRecordedSortedBasePairCount = basePairCount
tieExists = false
continue
}
if (basePairCount == mostRecordedSortedBasePairCount){
// There is a tie between differing basePair values
tieExists = true
continue
}
}
if (tieExists == true){
// The most recorded sorted base pair value has a tie with a different base pair value
// We will not add this locus to either combined map
continue
}
// Now we determine the phase for the most recorded sorted base pair
// The phase will be either known/unknown for each combined genome
//Outputs:
// -string: Locus Base 1
// -string: Locus Base 2
// -bool: Phase is known (Only exclude conflicts)
// -bool: Phase is known (Only include shared)
// -error
getLocusBasePair := func()(string, string, bool, bool, error){
sortedBase1, sortedBase2, delimiterFound := strings.Cut(mostRecordedSortedBasePair, ";")
if (delimiterFound == false){
return "", "", false, false, errors.New("mostRecordedSortedBasePair missing ; delimeter")
}
// We cycle through our locus objects to see how many report that the phase for this base pair is known
// basePair1 == sortedBase1, sortedBase2
// basePair2 == sortedBase2, sortedBase1
// We count how many genomes report each base pair order as being the true phased order
basePair1Count := 0
basePair2Count := 0
for _, locusObject := range locusValuesList{
locusIsPhased := locusObject.LocusIsPhased
if (locusIsPhased == false){
// This genome does not claim to know the phase of this locus base pair.
// We skip it.
continue
}
locusBase1 := locusObject.Base1Value
locusBase2 := locusObject.Base2Value
if (locusBase1 == sortedBase1 && locusBase2 == sortedBase2){
basePair1Count += 1
} else if (locusBase1 == sortedBase2 && locusBase2 == sortedBase1){
basePair2Count += 1
}
}
if (basePair1Count == 0 && basePair2Count == 0){
// No phase is known for this locus for any of the genomes
return sortedBase1, sortedBase2, false, false, nil
}
if (basePair1Count == basePair2Count){
// There is a tie. Thus, phase is unknown.
return sortedBase1, sortedBase2, false, false, nil
}
if (basePair1Count > basePair2Count){
if (basePair1Count < 2){
// There are not enough phaseIsKnown advocate genomes to be able to say this phase
// is known for the OnlyIncludeShared genome.
return sortedBase1, sortedBase2, true, false, nil
}
return sortedBase1, sortedBase2, true, true, nil
}
// basePair1Count < basePair2Count
if (basePair2Count < 2){
// There are not enough phaseIsKnown advocate genomes to be able to say this phase
// is known for the OnlyIncludeShared genome.
return sortedBase2, sortedBase1, true, false, nil
}
return sortedBase2, sortedBase1, true, true, nil
}
locusBase1, locusBase2, phaseIsKnown_OnlyExcludeConflicts, phaseIsKnown_OnlyIncludeShared, err := getLocusBasePair()
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
// Now we add to the combined genome maps
// The OnlyExcludeConflicts will only omit when there is a tie
// The OnlyIncludeShared requires at least 2 to agree
getLocusIsPhased_OnlyExcludeConflicts := func()bool{
if (locusBase1 == locusBase2){
// These kinds of loci are always phased, becauses swapping the alleles changes nothing.
return true
}
return phaseIsKnown_OnlyExcludeConflicts
}
locusIsPhased_OnlyExcludeConflicts := getLocusIsPhased_OnlyExcludeConflicts()
onlyExcludeConflictsLocusValue := locusValue.LocusValue{
LocusIsPhased: locusIsPhased_OnlyExcludeConflicts,
Base1Value: locusBase1,
Base2Value: locusBase2,
}
onlyExcludeConflictsGenomeMap[rsID] = onlyExcludeConflictsLocusValue
if (mostRecordedSortedBasePairCount >= 2){
getLocusIsPhased_OnlyIncludeShared := func()bool{
if (locusBase1 == locusBase2){
// These kinds of loci are always phased, becauses swapping the alleles changes nothing.
return true
}
return phaseIsKnown_OnlyIncludeShared
}
locusIsPhased_OnlyIncludeShared := getLocusIsPhased_OnlyIncludeShared()
onlyIncludeSharedLocusValue := locusValue.LocusValue{
LocusIsPhased: locusIsPhased_OnlyIncludeShared,
Base1Value: locusBase1,
Base2Value: locusBase2,
}
onlyIncludeSharedGenomeMap[rsID] = onlyIncludeSharedLocusValue
}
}
onlyExcludeConflictsGenomeIdentifier, err := helpers.GetNewRandom16ByteArray()
if (err != nil) { return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
onlyExcludeConflictsGenomeWithMetadataObject := GenomeWithMetadata{
GenomeType: "OnlyExcludeConflicts",
GenomeIdentifier: onlyExcludeConflictsGenomeIdentifier,
GenomeMap: onlyExcludeConflictsGenomeMap,
}
onlyIncludeSharedGenomeIdentifier, err := helpers.GetNewRandom16ByteArray()
if (err != nil) { return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
onlyIncludeSharedGenomeWithMetadataObject := GenomeWithMetadata{
GenomeType: "OnlyIncludeShared",
GenomeIdentifier: onlyIncludeSharedGenomeIdentifier,
GenomeMap: onlyIncludeSharedGenomeMap,
}
genomesWithMetadataList = append(genomesWithMetadataList, onlyExcludeConflictsGenomeWithMetadataObject, onlyIncludeSharedGenomeWithMetadataObject)
err = updatePercentageCompleteFunction(100)
if (err != nil){ return false, nil, nil, false, [16]byte{}, [16]byte{}, err }
return true, genomesWithMetadataList, allRawGenomeIdentifiersList, true, onlyExcludeConflictsGenomeIdentifier, onlyIncludeSharedGenomeIdentifier, nil
}
//Outputs:
// -bool: Genome has any useful locations
// -map[int64]locusValue.LocusValue
// -error
func ConvertRawGenomeToGenomeMap(rawGenomeMap map[int64]readRawGenomes.RawGenomeLocusValue, genomeIsPhased bool)(bool, map[int64]locusValue.LocusValue, error){
// Map Structure: RSID -> Locus Value
genomeMap := make(map[int64]locusValue.LocusValue)
// We use this list to check for alias collisions later
allRSIDsList := make([]int64, 0)
for rsID, locusBasePairValue := range rawGenomeMap{
locusAllele2Exists := locusBasePairValue.Allele2Exists
if (locusAllele2Exists == false){
// This SNP contains less than 2 bases
// We don't support reading these kinds of SNP values yet
continue
}
locusAllele1 := locusBasePairValue.Allele1
locusAllele2 := locusBasePairValue.Allele2
getLocusIsPhasedBool := func()bool{
if (locusAllele1 == locusAllele2){
// Locus has to be phased, because phase flip does not change value
return true
}
return genomeIsPhased
}
locusIsPhased := getLocusIsPhasedBool()
locusValueObject := locusValue.LocusValue{
Base1Value: locusAllele1,
Base2Value: locusAllele2,
LocusIsPhased: locusIsPhased,
}
genomeMap[rsID] = locusValueObject
allRSIDsList = append(allRSIDsList, rsID)
}
// Now we check for rsID aliases
// rsID aliases are multiple rsids that refer to the same location
// If a single genome file contained two identical rsids whose value conflicted, the company must have made an error in reporting
for _, rsID := range allRSIDsList{
rsidLocusValue, exists := genomeMap[rsID]
if (exists == false){
// This must have been an alias that was deleted
continue
}
aliasExists, rsidAliasesList, err := locusMetadata.GetRSIDAliases(rsID)
if (err != nil){ return false, nil, err }
if (aliasExists == false){
continue
}
checkIfRSIDCollisionExists := func()bool{
for _, rsidAlias := range rsidAliasesList{
aliasLocusValue, exists := genomeMap[rsidAlias]
if (exists == false){
continue
}
if (aliasLocusValue != rsidLocusValue){
// A collision exists with an alias rsID
// The company must be creating invalid results, or our alias list is invalid
return true
}
}
return false
}
rsidCollisionExists := checkIfRSIDCollisionExists()
if (rsidCollisionExists == true){
// We will delete this rsID
// We cannot trust any of the results
delete(genomeMap, rsID)
}
// We delete all aliases from the genome map
// We do this to save space
for _, rsidAlias := range rsidAliasesList{
delete(genomeMap, rsidAlias)
}
}
if (len(genomeMap) == 0){
// No valid locations exist
return false, nil, nil
}
return true, genomeMap, nil
}