seekia/utilities/importLocusMetadata/importLocusMetadata.go

279 lines
7.6 KiB
Go

// importLocusMetadata.go provides a function to import locus metadata from raw genome files.
// It uses a 23andMe raw genome file to find the chromosomes and positions for new rsIDs.
// The imported loci will be missing the GeneNames list and any references.
// The imported loci may also be missing locus aliases.
// TODO: Instead of using 23andMe files, use a better full-genome reference which has gene names.
package main
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/internal/helpers"
import "seekia/internal/localFilesystem"
import "encoding/json"
import "slices"
import "strings"
import "bufio"
import "bytes"
import "log"
// main imports locus metadata for the rsIDs listed in lociToAddList.
//
// Steps:
//  1. Read ./23andMeRawGenome.txt and build a map of rsID -> chromosome and position.
//  2. For each rsID in lociToAddList, verify that no locus metadata exists yet and build a
//     new locus metadata object from the 23andMe data.
//  3. For each affected chromosome, append the new objects to the existing metadata list and
//     overwrite that chromosome's LocusMetadata_Chromosome<N>.json file.
//  4. Log a summary, including the rsIDs which the 23andMe file did not contain.
//
// Every error is logged and causes main to return early.
func main(){

	fileExists, fileBytes, err := localFilesystem.GetFileContents("./23andMeRawGenome.txt")
	if (err != nil){
		log.Println(err.Error())
		return
	}
	if (fileExists == false){
		// The filename in this message must match the path read above exactly (case-sensitive filesystems)
		log.Println("Error: 23andMeRawGenome.txt does not exist.")
		log.Println("You must add a 23andMe raw genome file to the importLocusMetadata folder so we can retrieve locus metadata from the file.")
		return
	}

	fileReader := bytes.NewReader(fileBytes)
	fileBufioReader := bufio.NewReader(fileReader)

	firstLine, err := fileBufioReader.ReadString('\n')
	if (err != nil){
		// File does not have another line
		log.Println("Malformed 23andMe genome file: Too short.")
		return
	}

	fileIs23andMe := strings.HasPrefix(firstLine, "# This data file generated by 23andMe at:")
	if (fileIs23andMe == false){
		log.Println("Malformed 23andMe genome file: Missing header.")
		return
	}

	// Now we advance bufio reader to the snp rows
	for {
		fileLineString, err := fileBufioReader.ReadString('\n')
		if (err != nil){
			// File does not have another line
			log.Println("Malformed 23andMe genome file: Too short.")
			return
		}

		// All SNP rows come after this line:
		// "# rsid	chromosome	position	genotype"
		lineReached := strings.HasPrefix(fileLineString, "# rsid")
		if (lineReached == true){
			break
		}
	}

	type LocusInfoObject struct{
		Chromosome int
		Position int
	}

	// Map structure: Locus rsID -> Info Object
	lociInfoMap := make(map[int64]LocusInfoObject)

	// This function parses a single SNP row and adds it to the lociInfoMap.
	// The row must not contain a line ending.
	// Rows are tab-separated and look like this:
	// "rs4477212	1	82154	GG"
	// "rs571313759	1	1181945	--" (-- means no entry)
	// "i3001920	MT	16470	G" (one base is possible)
	//Outputs:
	// -bool: Row is well formed (false == the file is malformed; the problem has already been logged)
	addSNPRowToMap := func(rowString string)bool{

		rowSlice := strings.Split(rowString, "\t")
		if (len(rowSlice) != 4){
			log.Println("Malformed 23andMe genome data: Invalid SNP row: " + rowString)
			return false
		}

		locusIdentifierString := rowSlice[0]
		locusChromosomeString := rowSlice[1]
		locusPositionString := rowSlice[2]

		rsidDigitsString, prefixExists := strings.CutPrefix(locusIdentifierString, "rs")
		if (prefixExists == false){
			// RSID is unknown.
			// It is probably a custom identifier (Example: i713426)
			return true
		}

		locusRSID, err := helpers.ConvertStringToInt64(rsidDigitsString)
		if (err != nil){
			// The identifier starts with "rs" but is not a number. We skip it.
			return true
		}

		locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString)
		if (err != nil){
			// It is probably "MT" or "X" chromosome
			// We skip these rows because chromosomes are stored as integers
			return true
		}

		locusPosition, err := helpers.ConvertStringToInt(locusPositionString)
		if (err != nil){
			log.Println("23andMe file is malformed: Contains invalid locusPosition: " + locusPositionString)
			return false
		}

		lociInfoMap[locusRSID] = LocusInfoObject{
			Chromosome: locusChromosome,
			Position: locusPosition,
		}

		return true
	}

	for {
		// ReadString returns the data read before an error along with the error.
		// This means that a final row which lacks a trailing newline arrives together with a non-nil error.
		// We must process the returned data before we stop reading.
		fileLineString, readErr := fileBufioReader.ReadString('\n')

		// We trim both "\n" and "\r\n" line endings
		rowString := strings.TrimRight(fileLineString, "\r\n")
		if (rowString == ""){
			// This is either a blank line or the end of the file.
			// There are no more SNP rows.
			break
		}

		rowIsWellFormed := addSNPRowToMap(rowString)
		if (rowIsWellFormed == false){
			return
		}

		if (readErr != nil){
			// That was the final line of the file
			break
		}
	}

	// This is a list of rsIDs whose metadata we should add to the locus metadata
	// It must be filled in by hand before running this utility
	lociToAddList := []int64{}

	// We only need the boolean result; the second return value is not used here
	containsDuplicates, _ := helpers.CheckIfListContainsDuplicates(lociToAddList)
	if (containsDuplicates == true){
		log.Println("lociToAddList contains duplicates.")
		return
	}

	err = locusMetadata.InitializeLocusMetadataVariables()
	if (err != nil){
		log.Println("ERROR: " + err.Error())
		return
	}

	// This list will store the loci for which no metadata existed
	missingLociList := make([]int64, 0)

	// Map Structure: Chromosome -> List of locus metadata objects to add
	lociToAddMap := make(map[int][]locusMetadata.LocusMetadata)

	numberOfImportedLoci := 0

	for _, rsID := range lociToAddList{

		// First we check to see if locus metadata already exists
		metadataExists, _, err := locusMetadata.GetLocusMetadata(rsID)
		if (err != nil){
			log.Println("ERROR: " + err.Error())
			return
		}
		if (metadataExists == true){
			rsIDString := helpers.ConvertInt64ToString(rsID)
			log.Println("lociToAddList contains locus whose metadata already exists: rs" + rsIDString)
			return
		}

		locusInfoObject, foundInFile := lociInfoMap[rsID]
		if (foundInFile == false){
			// The 23andMe file does not contain metadata for this locus
			missingLociList = append(missingLociList, rsID)
			continue
		}

		numberOfImportedLoci += 1

		locusChromosome := locusInfoObject.Chromosome
		locusPosition := locusInfoObject.Position

		// The 23andMe file provides no gene names, aliases or references.
		// We use a "MISSING" placeholder for the gene names and leave the maps empty.
		newLocusMetadataObject := locusMetadata.LocusMetadata{
			RSIDsList: []int64{rsID},
			Chromosome: locusChromosome,
			Position: locusPosition,
			GeneNamesList: []string{"MISSING"},
			CompanyAliases: make(map[locusMetadata.GeneticsCompany][]string),
			References: make(map[string]string),
		}

		// Appending to a missing (nil) map entry creates the list
		lociToAddMap[locusChromosome] = append(lociToAddMap[locusChromosome], newLocusMetadataObject)
	}

	for chromosomeInt, locusMetadataObjectsToAddList := range lociToAddMap{

		existingLocusMetadataObjectsList, err := locusMetadata.GetLocusMetadataObjectsListByChromosome(chromosomeInt)
		if (err != nil){
			log.Println("ERROR: " + err.Error())
			return
		}

		// The new objects are appended after the existing objects
		newLocusMetadataObjectsList := slices.Concat(existingLocusMetadataObjectsList, locusMetadataObjectsToAddList)

		newChromosomeFileBytes, err := json.MarshalIndent(newLocusMetadataObjectsList, "", "\t")
		if (err != nil){
			log.Println("ERROR: " + err.Error())
			return
		}

		currentChromosomeString := helpers.ConvertIntToString(chromosomeInt)

		// This path is relative to the folder from which the utility is run
		locusMetadataFilepath := "../../resources/geneticReferences/locusMetadata/"

		err = localFilesystem.CreateOrOverwriteFile(newChromosomeFileBytes, locusMetadataFilepath, "LocusMetadata_Chromosome" + currentChromosomeString + ".json")
		if (err != nil){
			log.Println("ERROR: " + err.Error())
			return
		}
	}

	totalLociToAdd := len(lociToAddList)
	totalLociToAddString := helpers.ConvertIntToString(totalLociToAdd)
	numberOfImportedLociString := helpers.ConvertIntToString(numberOfImportedLoci)

	log.Println("Successfully imported " + numberOfImportedLociString + "/" + totalLociToAddString + " locus metadatas!")

	numberOfMissingLoci := len(missingLociList)
	numberOfMissingLociString := helpers.ConvertIntToString(numberOfMissingLoci)

	log.Println(numberOfMissingLociString + " loci contained no metadata in the 23andMe genome file.")

	if (numberOfMissingLoci == 0){
		return
	}

	missingLociStringsList := make([]string, 0, numberOfMissingLoci)

	for _, rsID := range missingLociList{
		rsIDString := helpers.ConvertInt64ToString(rsID)
		missingLociStringsList = append(missingLociStringsList, rsIDString)
	}

	missingLociListFormatted := strings.Join(missingLociStringsList, ", ")

	log.Println("Missing loci list: " + missingLociListFormatted)
}