279 lines
7.6 KiB
Go
279 lines
7.6 KiB
Go
|
|
// importLocusMetadata.go provides a function to import locus metadata from raw genome files.
|
|
// It uses a 23andMe raw genome file to find the chromosomes and positions for new rsIDs.
|
|
// The imported loci will be missing the GeneNames list and any references.
|
|
// The imported loci may be missing locus aliases.
|
|
// TODO: Instead of using 23andMe files, use a better full-genome reference which has gene names.
|
|
|
|
package main
|
|
|
|
import "seekia/resources/geneticReferences/locusMetadata"
|
|
|
|
import "seekia/internal/helpers"
|
|
import "seekia/internal/localFilesystem"
|
|
|
|
import "encoding/json"
|
|
|
|
import "slices"
|
|
import "strings"
|
|
import "bufio"
|
|
import "bytes"
|
|
|
|
import "log"
|
|
|
|
func main(){
|
|
|
|
fileExists, fileBytes, err := localFilesystem.GetFileContents("./23andMeRawGenome.txt")
|
|
if (err != nil){
|
|
log.Println(err.Error())
|
|
return
|
|
}
|
|
if (fileExists == false){
|
|
log.Println("Error: 23AndMeRawGenome.txt does not exist.")
|
|
log.Println("You must add a 23andMe raw genome file to the addLocusMetadata folder so we can retrieve locus metadata from the file.")
|
|
return
|
|
}
|
|
|
|
fileReader := bytes.NewReader(fileBytes)
|
|
|
|
fileBufioReader := bufio.NewReader(fileReader)
|
|
|
|
firstLine, err := fileBufioReader.ReadString('\n')
|
|
if (err != nil){
|
|
// File does not have another line
|
|
log.Println("Malformed 23andMe genome file: Too short.")
|
|
return
|
|
}
|
|
|
|
fileIs23andMe := strings.HasPrefix(firstLine, "# This data file generated by 23andMe at:")
|
|
if (fileIs23andMe == false){
|
|
log.Println("Malformed 23andMe genome file: Missing header.")
|
|
return
|
|
}
|
|
|
|
// Now we advance bufio reader to the snp rows
|
|
for {
|
|
fileLineString, err := fileBufioReader.ReadString('\n')
|
|
if (err != nil){
|
|
// File does not have another line
|
|
log.Println("Malformed 23andMe genome file: Too short.")
|
|
return
|
|
}
|
|
|
|
// All SNP rows come after this line:
|
|
// "# rsid chromosome position genotype"
|
|
lineReached := strings.HasPrefix(fileLineString, "# rsid")
|
|
if (lineReached == true){
|
|
break
|
|
}
|
|
}
|
|
|
|
type LocusInfoObject struct{
|
|
Chromosome int
|
|
Position int
|
|
}
|
|
|
|
// Map structure: Locus rsID -> Info Object
|
|
lociInfoMap := make(map[int64]LocusInfoObject)
|
|
|
|
for {
|
|
|
|
fileLineString, err := fileBufioReader.ReadString('\n')
|
|
if (err != nil){
|
|
// File does not have another line
|
|
break
|
|
}
|
|
if (fileLineString == "\n"){
|
|
// This is the final line
|
|
break
|
|
}
|
|
|
|
fileLineWithoutNewline := strings.TrimSuffix(fileLineString, "\n")
|
|
|
|
// Rows look like this
|
|
// "rs4477212 1 82154 GG"
|
|
// "rs571313759 1 1181945 --" (-- means no entry)
|
|
// "i3001920 MT 16470 G" (one base is possible)
|
|
|
|
rowSlice := strings.Split(fileLineWithoutNewline, "\t")
|
|
|
|
if (len(rowSlice) != 4){
|
|
log.Println("Malformed 23andMe genome data: Invalid SNP row: " + fileLineString)
|
|
return
|
|
}
|
|
|
|
locusIdentifierString := rowSlice[0]
|
|
locusChromosomeString := rowSlice[1]
|
|
locusPositionString := rowSlice[2]
|
|
|
|
//Outputs:
|
|
// -bool: rsID found
|
|
// -int64: rsID value
|
|
getRSIDIdentifier := func()(bool, int64){
|
|
|
|
stringWithoutPrefix, prefixExists := strings.CutPrefix(locusIdentifierString, "rs")
|
|
if (prefixExists == false){
|
|
return false, 0
|
|
}
|
|
|
|
rsidInt64, err := helpers.ConvertStringToInt64(stringWithoutPrefix)
|
|
if (err != nil){
|
|
return false, 0
|
|
}
|
|
|
|
return true, rsidInt64
|
|
}
|
|
|
|
isRSID, locusRSID := getRSIDIdentifier()
|
|
if (isRSID == false){
|
|
// RSID is unknown.
|
|
// It is probably a custom identifier (Example: i713426)
|
|
continue
|
|
}
|
|
|
|
locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString)
|
|
if (err != nil){
|
|
// It is probably "MT" or "X" chromosome
|
|
continue
|
|
}
|
|
|
|
locusPosition, err := helpers.ConvertStringToInt(locusPositionString)
|
|
if (err != nil){
|
|
log.Println("23andMe file is malformed: Contains invalid locusPosition: " + locusPositionString)
|
|
return
|
|
}
|
|
|
|
locusInfoObject := LocusInfoObject{
|
|
Chromosome: locusChromosome,
|
|
Position: locusPosition,
|
|
}
|
|
|
|
lociInfoMap[locusRSID] = locusInfoObject
|
|
}
|
|
|
|
// This is a list of rsIDs whose metadata we should add to the locus metadata
|
|
lociToAddList := []int64{}
|
|
|
|
containsDuplicates, _ := helpers.CheckIfListContainsDuplicates(lociToAddList)
|
|
if (containsDuplicates == true){
|
|
log.Println("lociToAddList contains duplicates.")
|
|
return
|
|
}
|
|
|
|
err = locusMetadata.InitializeLocusMetadataVariables()
|
|
if (err != nil){
|
|
log.Println("ERROR: " + err.Error())
|
|
return
|
|
}
|
|
|
|
// This list will store the loci for which no metadata existed
|
|
missingLociList := make([]int64, 0)
|
|
|
|
// Map Structure: Chromosome -> List of locus metadata objects to add
|
|
lociToAddMap := make(map[int][]locusMetadata.LocusMetadata)
|
|
|
|
numberOfImportedLoci := 0
|
|
|
|
for _, rsID := range lociToAddList{
|
|
|
|
// First we check to see if locus metadata already exists
|
|
|
|
exists, _, err := locusMetadata.GetLocusMetadata(rsID)
|
|
if (err != nil){
|
|
log.Println("ERROR: " + err.Error())
|
|
return
|
|
}
|
|
if (exists == true){
|
|
log.Println("lociToAddList contains locus whose metadata already exists.")
|
|
return
|
|
}
|
|
|
|
locusInfoObject, exists := lociInfoMap[rsID]
|
|
if (exists == false){
|
|
// The 23andMe file does not contain metadata for this locus
|
|
missingLociList = append(missingLociList, rsID)
|
|
continue
|
|
}
|
|
|
|
numberOfImportedLoci += 1
|
|
|
|
locusChromosome := locusInfoObject.Chromosome
|
|
locusPosition := locusInfoObject.Position
|
|
|
|
newLocusMetadataObject := locusMetadata.LocusMetadata{
|
|
RSIDsList: []int64{rsID},
|
|
Chromosome: locusChromosome,
|
|
Position: locusPosition,
|
|
GeneNamesList: []string{"MISSING"},
|
|
CompanyAliases: make(map[locusMetadata.GeneticsCompany][]string),
|
|
References: make(map[string]string),
|
|
}
|
|
|
|
existingList, exists := lociToAddMap[locusChromosome]
|
|
if (exists == false){
|
|
lociToAddMap[locusChromosome] = []locusMetadata.LocusMetadata{newLocusMetadataObject}
|
|
} else {
|
|
existingList = append(existingList, newLocusMetadataObject)
|
|
lociToAddMap[locusChromosome] = existingList
|
|
}
|
|
}
|
|
|
|
for chromosomeInt, locusMetadataObjectsToAddList := range lociToAddMap{
|
|
|
|
existingLocusMetadataObjectsList, err := locusMetadata.GetLocusMetadataObjectsListByChromosome(chromosomeInt)
|
|
if (err != nil) {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
|
|
newLocusMetadataObjectsList := slices.Concat(existingLocusMetadataObjectsList, locusMetadataObjectsToAddList)
|
|
|
|
newChromosomeFileBytes, err := json.MarshalIndent(newLocusMetadataObjectsList, "", "\t")
|
|
if (err != nil){
|
|
log.Println("ERROR: " + err.Error())
|
|
return
|
|
}
|
|
|
|
currentChromosomeString := helpers.ConvertIntToString(chromosomeInt)
|
|
|
|
locusMetadataFilepath := "../../resources/geneticReferences/locusMetadata/"
|
|
|
|
err = localFilesystem.CreateOrOverwriteFile(newChromosomeFileBytes, locusMetadataFilepath, "LocusMetadata_Chromosome" + currentChromosomeString + ".json")
|
|
if (err != nil){
|
|
log.Println("ERROR: " + err.Error())
|
|
return
|
|
}
|
|
}
|
|
|
|
totalLociToAdd := len(lociToAddList)
|
|
totalLociToAddString := helpers.ConvertIntToString(totalLociToAdd)
|
|
|
|
numberOfImportedLociString := helpers.ConvertIntToString(numberOfImportedLoci)
|
|
|
|
log.Println("Successfully imported " + numberOfImportedLociString + "/" + totalLociToAddString + " locus metadatas!")
|
|
|
|
numberOfMissingLoci := len(missingLociList)
|
|
|
|
numberOfMissingLociString := helpers.ConvertIntToString(numberOfMissingLoci)
|
|
|
|
log.Println(numberOfMissingLociString + " loci contained no metadata in the 23andMe genome file.")
|
|
|
|
if (len(missingLociList) == 0){
|
|
return
|
|
}
|
|
|
|
missingLociStringsList := make([]string, 0, len(missingLociList))
|
|
|
|
for _, rsID := range missingLociList{
|
|
|
|
rsIDString := helpers.ConvertInt64ToString(rsID)
|
|
|
|
missingLociStringsList = append(missingLociStringsList, rsIDString)
|
|
}
|
|
|
|
missingLociListFormatted := strings.Join(missingLociStringsList, ", ")
|
|
|
|
log.Println("Missing loci list: " + missingLociListFormatted)
|
|
}
|
|
|
|
|