516 lines
17 KiB
Go
516 lines
17 KiB
Go
|
|
// prepareRawGenomes provides a function to read and merge raw genomes into formatted genome maps which are ready to analyze.
|
|
// Each raw genome comes from a genome file from a company such as 23andMe, AncestryDNA, etc...
|
|
// Two combined genomes are created: OnlyExcludeConflicts and OnlyIncludeShared
|
|
|
|
package prepareRawGenomes
|
|
|
|
|
|
import "seekia/resources/geneticReferences/locusMetadata"
|
|
|
|
import "seekia/internal/genetics/locusValue"
|
|
import "seekia/internal/genetics/readRawGenomes"
|
|
import "seekia/internal/helpers"
|
|
|
|
import "errors"
|
|
import "strings"
|
|
|
|
|
|
type RawGenomeWithMetadata struct{
|
|
|
|
GenomeIdentifier [16]byte
|
|
|
|
GenomeIsPhased bool
|
|
|
|
RawGenomeMap map[int64]readRawGenomes.RawGenomeLocusValue
|
|
}
|
|
|
|
|
|
type GenomeWithMetadata struct{
|
|
|
|
// "Single"/"OnlyExcludeConflicts"/"OnlyIncludeShared"
|
|
GenomeType string
|
|
|
|
GenomeIdentifier [16]byte
|
|
|
|
// Map Structure: RSID -> Locus base pair value object
|
|
GenomeMap map[int64]locusValue.LocusValue
|
|
}
|
|
|
|
|
|
//Outputs:
|
|
// -bool: Raw genome string is valid
|
|
// -RawGenomeWithMetadata
|
|
// -error (returns non-nil if called with invalid genomeIdentifier)
|
|
func CreateRawGenomeWithMetadataObject(genomeIdentifier [16]byte, rawGenomeString string)(bool, RawGenomeWithMetadata, error){
|
|
|
|
rawDataReader := strings.NewReader(rawGenomeString)
|
|
|
|
_, _, _, _, genomeIsPhased, rawGenomeMap, err := readRawGenomes.ReadRawGenomeFile(rawDataReader)
|
|
if (err != nil) {
|
|
// The raw genome is not valid
|
|
return false, RawGenomeWithMetadata{}, nil
|
|
}
|
|
|
|
newRawGenomeWithMetadataObject := RawGenomeWithMetadata{
|
|
GenomeIdentifier: genomeIdentifier,
|
|
GenomeIsPhased: genomeIsPhased,
|
|
RawGenomeMap: rawGenomeMap,
|
|
}
|
|
|
|
return true, newRawGenomeWithMetadataObject, nil
|
|
}
|
|
|
|
|
|
// Function takes a map of raw genome strings and converts it into a map list containing a map for each genome
|
|
// If there exists more than 1 genome, it also creates the two combined genomes: Only Include Shared/Only Exclude Conflicts
|
|
// Inputs:
|
|
// -[]RawGenomeWithMetadata
|
|
// -func(int)error: Update Percentage Complete Function
|
|
//Outputs:
|
|
// -[]GenomeWithMetadata: Genomes with metadata list
|
|
// -[][16]byte: All raw genome identifiers list (not including combined genomes)
|
|
// -bool: Combined genomes exist
|
|
// -[16]byte: Only exclude conflicts genome identifier
|
|
// -[16]byte: Only include shared genome identifier
|
|
// -error
|
|
func GetGenomesWithMetadataListFromRawGenomesList(inputGenomesList []RawGenomeWithMetadata, updatePercentageCompleteFunction func(int)error)([]GenomeWithMetadata, [][16]byte, bool, [16]byte, [16]byte, error){
|
|
|
|
// The reading of genomes will take up the first 20% of the percentage range
|
|
// The creation of multiple genomes will take up the last 80% of the percentage range
|
|
|
|
// Structure: A list of genomes.
|
|
// Each map stores a genome from a company or a combined genome.
|
|
genomesWithMetadataList := make([]GenomeWithMetadata, 0)
|
|
|
|
numberOfGenomesRead := 0
|
|
totalNumberOfGenomesToRead := len(inputGenomesList)
|
|
|
|
allRawGenomeIdentifiersList := make([][16]byte, 0)
|
|
|
|
for _, rawGenomeWithMetadataObject := range inputGenomesList{
|
|
|
|
newPercentageCompletion, err := helpers.ScaleNumberProportionally(true, numberOfGenomesRead, 0, totalNumberOfGenomesToRead, 0, 20)
|
|
if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
err = updatePercentageCompleteFunction(newPercentageCompletion)
|
|
if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
genomeIdentifier := rawGenomeWithMetadataObject.GenomeIdentifier
|
|
genomeIsPhased := rawGenomeWithMetadataObject.GenomeIsPhased
|
|
rawGenomeMap := rawGenomeWithMetadataObject.RawGenomeMap
|
|
|
|
// Now we convert rawGenomeMap to a genomeMap
|
|
|
|
// Map Structure: RSID -> Locus Value
|
|
genomeMap := make(map[int64]locusValue.LocusValue)
|
|
|
|
// We use this list to check for alias collisions later
|
|
allRSIDsList := make([]int64, 0)
|
|
|
|
for rsID, locusBasePairValue := range rawGenomeMap{
|
|
|
|
locusAllele2Exists := locusBasePairValue.Allele2Exists
|
|
if (locusAllele2Exists == false){
|
|
// This SNP contains less than 2 bases
|
|
// We don't support reading these kinds of SNP values yet
|
|
continue
|
|
}
|
|
|
|
locusAllele1 := locusBasePairValue.Allele1
|
|
locusAllele2 := locusBasePairValue.Allele2
|
|
|
|
locusValueObject := locusValue.LocusValue{
|
|
LocusIsPhased: genomeIsPhased,
|
|
Base1Value: locusAllele1,
|
|
Base2Value: locusAllele2,
|
|
}
|
|
|
|
genomeMap[rsID] = locusValueObject
|
|
|
|
allRSIDsList = append(allRSIDsList, rsID)
|
|
}
|
|
|
|
// Now we check for rsID aliases
|
|
// rsID aliases are multiple rsids that refer to the same location
|
|
// If a single genome file contained two identical rsids whose value conflicted, the company must have made an error in reporting
|
|
|
|
for _, rsID := range allRSIDsList{
|
|
|
|
rsidLocusValue, exists := genomeMap[rsID]
|
|
if (exists == false){
|
|
// This must have been an alias that was deleted
|
|
continue
|
|
}
|
|
|
|
aliasExists, rsidAliasesList, err := locusMetadata.GetRSIDAliases(rsID)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
if (aliasExists == false){
|
|
continue
|
|
}
|
|
|
|
checkIfRSIDCollisionExists := func()bool{
|
|
|
|
for _, rsidAlias := range rsidAliasesList{
|
|
|
|
aliasLocusValue, exists := genomeMap[rsidAlias]
|
|
if (exists == false){
|
|
continue
|
|
}
|
|
|
|
if (aliasLocusValue != rsidLocusValue){
|
|
// A collision exists with an alias rsID
|
|
// The company must be creating invalid results, or our alias list is invalid
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
rsidCollisionExists := checkIfRSIDCollisionExists()
|
|
if (rsidCollisionExists == true){
|
|
// We will delete this rsID
|
|
// We cannot trust any of the results
|
|
|
|
delete(genomeMap, rsID)
|
|
}
|
|
|
|
// We delete all aliases from the genome map
|
|
// We do this to save space
|
|
|
|
for _, rsidAlias := range rsidAliasesList{
|
|
delete(genomeMap, rsidAlias)
|
|
}
|
|
}
|
|
|
|
if (len(genomeMap) == 0){
|
|
// No valid locations exist
|
|
continue
|
|
}
|
|
|
|
genomeWithMetadataObject := GenomeWithMetadata{
|
|
GenomeType: "Single",
|
|
GenomeIdentifier: genomeIdentifier,
|
|
GenomeMap: genomeMap,
|
|
}
|
|
|
|
genomesWithMetadataList = append(genomesWithMetadataList, genomeWithMetadataObject)
|
|
allRawGenomeIdentifiersList = append(allRawGenomeIdentifiersList, genomeIdentifier)
|
|
|
|
numberOfGenomesRead += 1
|
|
}
|
|
|
|
containsDuplicates, _ := helpers.CheckIfListContainsDuplicates(allRawGenomeIdentifiersList)
|
|
if (containsDuplicates == true){
|
|
return nil, nil, false, [16]byte{}, [16]byte{}, errors.New("GetGenomesWithMetadataListFromRawGenomesList called with inputGenomesList containing duplicate genomeIdentifiers.")
|
|
}
|
|
|
|
err := updatePercentageCompleteFunction(20)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
if (len(genomesWithMetadataList) == 1){
|
|
|
|
// Only 1 genome exists.
|
|
// No genome combining is needed.
|
|
|
|
err = updatePercentageCompleteFunction(100)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
return genomesWithMetadataList, allRawGenomeIdentifiersList, false, [16]byte{}, [16]byte{}, nil
|
|
}
|
|
|
|
// Now we create the shared genomes
|
|
// The "OnlyExcludeConflicts" genome only excludes an SNP when there is disagreement about its value
|
|
// The "OnlyIncludeShared" genome only includes SNPs that at least 2 genome files share. This will be the most reliable genome.
|
|
// For both of the genomes, if more than 2 base pairs exist for the same rsid, the most reported SNP value will be used
|
|
// For instance, if 3 genome files are imported, and 1 disagrees with the other two, the other two's snp value will be used
|
|
// If there is a tie, the rsid will be omitted
|
|
|
|
// This map stores all RSIDs across all genomes
|
|
allRSIDsMap := make(map[int64]struct{})
|
|
|
|
finalIndex := len(genomesWithMetadataList) - 1
|
|
|
|
for index, genomeWithMetadataObject := range genomesWithMetadataList{
|
|
|
|
newPercentageCompletion, err := helpers.ScaleNumberProportionally(true, index, 0, finalIndex, 20, 50)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
err = updatePercentageCompleteFunction(newPercentageCompletion)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
genomeMap := genomeWithMetadataObject.GenomeMap
|
|
|
|
for rsID, _ := range genomeMap{
|
|
allRSIDsMap[rsID] = struct{}{}
|
|
}
|
|
}
|
|
|
|
onlyExcludeConflictsGenomeMap := make(map[int64]locusValue.LocusValue)
|
|
onlyIncludeSharedGenomeMap := make(map[int64]locusValue.LocusValue)
|
|
|
|
index := 0
|
|
finalIndex = len(allRSIDsMap) - 1
|
|
|
|
// This map stores the rsid aliases that we have already added
|
|
// We will skip these, because we will have already dealt with them
|
|
addedRSIDAliasesMap := make(map[int64]struct{})
|
|
|
|
for rsID, _ := range allRSIDsMap{
|
|
|
|
newPercentageCompletion, err := helpers.ScaleNumberProportionally(true, index, 0, finalIndex, 50, 100)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
err = updatePercentageCompleteFunction(newPercentageCompletion)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
index += 1
|
|
|
|
_, exists := addedRSIDAliasesMap[rsID]
|
|
if (exists == true){
|
|
// This rsID is an alias for an rsid we already added
|
|
continue
|
|
}
|
|
|
|
anyAliasesExist, rsidAliasesList, err := locusMetadata.GetRSIDAliases(rsID)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
if (anyAliasesExist == true){
|
|
|
|
for _, rsidAlias := range rsidAliasesList{
|
|
|
|
addedRSIDAliasesMap[rsidAlias] = struct{}{}
|
|
}
|
|
}
|
|
|
|
// We first check to see which base pairs are most commonly reported, without any regard for the order (is phased status)
|
|
// After doing this, we will see if we have enough information to determine if the phase is known
|
|
|
|
locusValuesList := make([]locusValue.LocusValue, 0)
|
|
|
|
for _, genomeWithMetadataObject := range genomesWithMetadataList{
|
|
|
|
genomeMap := genomeWithMetadataObject.GenomeMap
|
|
|
|
locusValueObject, exists := genomeMap[rsID]
|
|
if (exists == true){
|
|
locusValuesList = append(locusValuesList, locusValueObject)
|
|
continue
|
|
}
|
|
|
|
// We did not find the rsid
|
|
// We see if its alias exists
|
|
|
|
if (anyAliasesExist == false){
|
|
continue
|
|
}
|
|
|
|
for _, rsidAlias := range rsidAliasesList{
|
|
|
|
aliasLocusValueObject, exists := genomeMap[rsidAlias]
|
|
if (exists == true){
|
|
locusValuesList = append(locusValuesList, aliasLocusValueObject)
|
|
|
|
// We only want to add 1 alias value to the locusValuesList
|
|
// We already pruned the genomeMap of any multiple alias values, so searching further is unnecessary
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
//TODO: Our current methods could be improved here.
|
|
// Currently, we are sorting base pairs for genomes which report the phase as being known, and ones which do not
|
|
// In a better system, we should have 3 categories we keep track of for each letter pair:
|
|
// 1. AB (unknown phase)
|
|
// 2. BA (known phase)
|
|
// 3. AB (known phase)
|
|
// We should develop a way to weigh these against each other to return the most accurate results
|
|
// The way we do this should depend on how common phase-flips are compared to invalid base recordings
|
|
// Basically, we want base-pair-order disagreements between genomes who are confident about phase to be accounted for
|
|
|
|
// We use this map to store the sorted base pair counts
|
|
// We sort the base pairs in unicode order
|
|
// Map Structure: Sorted RSID Base pair value -> Number of genomes that contain the value
|
|
sortedLocusBasePairCountsMap := make(map[string]int)
|
|
|
|
for _, locusValueObject := range locusValuesList{
|
|
|
|
base1Value := locusValueObject.Base1Value
|
|
base2Value := locusValueObject.Base2Value
|
|
|
|
getSortedBasePairValue := func()string{
|
|
|
|
if (base1Value < base2Value){
|
|
result := base1Value + ";" + base2Value
|
|
return result
|
|
}
|
|
result := base2Value + ";" + base1Value
|
|
return result
|
|
}
|
|
|
|
sortedBasePairValue := getSortedBasePairValue()
|
|
|
|
sortedLocusBasePairCountsMap[sortedBasePairValue] += 1
|
|
}
|
|
|
|
// Now we figure out which sorted base pair value has the highest count
|
|
|
|
mostRecordedSortedBasePair := ""
|
|
mostRecordedSortedBasePairCount := 0
|
|
tieExists := false
|
|
|
|
for basePair, basePairCount := range sortedLocusBasePairCountsMap{
|
|
|
|
if (basePairCount > mostRecordedSortedBasePairCount){
|
|
mostRecordedSortedBasePair = basePair
|
|
mostRecordedSortedBasePairCount = basePairCount
|
|
tieExists = false
|
|
continue
|
|
}
|
|
if (basePairCount == mostRecordedSortedBasePairCount){
|
|
// There is a tie between differing basePair values
|
|
tieExists = true
|
|
continue
|
|
}
|
|
}
|
|
|
|
if (tieExists == true){
|
|
// The most recorded sorted base pair value has a tie with a different base pair value
|
|
// We will not add this locus to either combined map
|
|
continue
|
|
}
|
|
|
|
// Now we determine the phase for the most recorded sorted base pair
|
|
// The phase will be either known/unknown for each combined genome
|
|
|
|
//Outputs:
|
|
// -string: Locus Base 1
|
|
// -string: Locus Base 2
|
|
// -bool: Phase is known (Only exclude conflicts)
|
|
// -bool: Phase is known (Only include shared)
|
|
// -error
|
|
getLocusBasePair := func()(string, string, bool, bool, error){
|
|
|
|
sortedBase1, sortedBase2, delimiterFound := strings.Cut(mostRecordedSortedBasePair, ";")
|
|
if (delimiterFound == false){
|
|
return "", "", false, false, errors.New("mostRecordedSortedBasePair missing ; delimeter")
|
|
}
|
|
|
|
// We cycle through our locus objects to see how many report that the phase for this base pair is known
|
|
|
|
// basePair1 == sortedBase1, sortedBase2
|
|
// basePair2 == sortedBase2, sortedBase1
|
|
|
|
// We count how many genomes report each base pair order as being the true phased order
|
|
basePair1Count := 0
|
|
basePair2Count := 0
|
|
|
|
for _, locusObject := range locusValuesList{
|
|
|
|
locusIsPhased := locusObject.LocusIsPhased
|
|
if (locusIsPhased == false){
|
|
// This genome does not claim to know the phase of this locus base pair.
|
|
// We skip it.
|
|
continue
|
|
}
|
|
|
|
locusBase1 := locusObject.Base1Value
|
|
locusBase2 := locusObject.Base2Value
|
|
|
|
if (locusBase1 == sortedBase1 && locusBase2 == sortedBase2){
|
|
basePair1Count += 1
|
|
|
|
} else if (locusBase1 == sortedBase2 && locusBase2 == sortedBase1){
|
|
basePair2Count += 1
|
|
}
|
|
}
|
|
|
|
if (basePair1Count == 0 && basePair2Count == 0){
|
|
// No phase is known for this locus for any of the genomes
|
|
return sortedBase1, sortedBase2, false, false, nil
|
|
}
|
|
|
|
if (basePair1Count == basePair2Count){
|
|
// There is a tie. Thus, phase is unknown.
|
|
return sortedBase1, sortedBase2, false, false, nil
|
|
}
|
|
|
|
if (basePair1Count > basePair2Count){
|
|
|
|
if (basePair1Count < 2){
|
|
// There are not enough phaseIsKnown advocate genomes to be able to say this phase
|
|
// is known for the OnlyIncludeShared genome.
|
|
|
|
return sortedBase1, sortedBase2, true, false, nil
|
|
}
|
|
|
|
return sortedBase1, sortedBase2, true, true, nil
|
|
}
|
|
|
|
// basePair1Count < basePair2Count
|
|
|
|
if (basePair2Count < 2){
|
|
// There are not enough phaseIsKnown advocate genomes to be able to say this phase
|
|
// is known for the OnlyIncludeShared genome.
|
|
|
|
return sortedBase2, sortedBase1, true, false, nil
|
|
}
|
|
|
|
return sortedBase2, sortedBase1, true, true, nil
|
|
}
|
|
|
|
locusBase1, locusBase2, phaseIsKnown_OnlyExcludeConflicts, phaseIsKnown_OnlyIncludeShared, err := getLocusBasePair()
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
// Now we add to the combined genome maps
|
|
// The OnlyExcludeConflicts will only omit when there is a tie
|
|
// The OnlyIncludeShared requires at least 2 to agree
|
|
|
|
onlyExcludeConflictsLocusValue := locusValue.LocusValue{
|
|
|
|
LocusIsPhased: phaseIsKnown_OnlyExcludeConflicts,
|
|
Base1Value: locusBase1,
|
|
Base2Value: locusBase2,
|
|
}
|
|
|
|
onlyExcludeConflictsGenomeMap[rsID] = onlyExcludeConflictsLocusValue
|
|
|
|
if (mostRecordedSortedBasePairCount >= 2){
|
|
|
|
onlyIncludeSharedLocusValue := locusValue.LocusValue{
|
|
LocusIsPhased: phaseIsKnown_OnlyIncludeShared,
|
|
Base1Value: locusBase1,
|
|
Base2Value: locusBase2,
|
|
}
|
|
|
|
onlyIncludeSharedGenomeMap[rsID] = onlyIncludeSharedLocusValue
|
|
}
|
|
}
|
|
|
|
onlyExcludeConflictsGenomeIdentifier, err := helpers.GetNewRandom16ByteArray()
|
|
if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
onlyExcludeConflictsGenomeWithMetadataObject := GenomeWithMetadata{
|
|
GenomeType: "OnlyExcludeConflicts",
|
|
GenomeIdentifier: onlyExcludeConflictsGenomeIdentifier,
|
|
GenomeMap: onlyExcludeConflictsGenomeMap,
|
|
}
|
|
|
|
onlyIncludeSharedGenomeIdentifier, err := helpers.GetNewRandom16ByteArray()
|
|
if (err != nil) { return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
onlyIncludeSharedGenomeWithMetadataObject := GenomeWithMetadata{
|
|
GenomeType: "OnlyIncludeShared",
|
|
GenomeIdentifier: onlyIncludeSharedGenomeIdentifier,
|
|
GenomeMap: onlyIncludeSharedGenomeMap,
|
|
}
|
|
|
|
genomesWithMetadataList = append(genomesWithMetadataList, onlyExcludeConflictsGenomeWithMetadataObject, onlyIncludeSharedGenomeWithMetadataObject)
|
|
|
|
err = updatePercentageCompleteFunction(100)
|
|
if (err != nil){ return nil, nil, false, [16]byte{}, [16]byte{}, err }
|
|
|
|
return genomesWithMetadataList, allRawGenomeIdentifiersList, true, onlyExcludeConflictsGenomeIdentifier, onlyIncludeSharedGenomeIdentifier, nil
|
|
}
|
|
|
|
|