// readRawGenomes provides functions to read raw genome files from companies like 23andMe and AncestryDNA. // These are used to create genetic analyses. package readRawGenomes import "seekia/resources/geneticReferences/locusMetadata" import "seekia/internal/helpers" import "bufio" import "strings" import "errors" import "io" import "time" // Returns the import version performed by the current version of Seekia // Seekia will continue to improve the ability to scan raw genome files // This includes building a database of the rs identifiers for company-specific snp identifiers // The raw genome metadata within myGenomes.go must be updated whenever the user updates Seekia and a newer import version exists func GetCurrentCompanyImportVersion(companyName string)(int, error){ if (companyName == "23andMe"){ return 1, nil } if (companyName == "AncestryDNA"){ return 1, nil } return 0, errors.New("GetCurrentCompanyImportVersion called with invalid company name: " + companyName) } type RawGenomeLocusValue struct{ // This is one of the following: "C", "A", "T", "G", "I", "D" Allele1 string // This stores if a second base exists // Base2 will not exist for some chromosomes (Example: Y chromosome) Allele2Exists bool // This is one of the following: "C", "A", "T", "G", "I", "D" Allele2 string } // This function will always adjust all locus values to represent the Plus strand // We only import genes into the genomeMap which are used for the analysis // // TODO: The ability to process genomes with a different strand will be added // A database to know which SNPs are Plus/Minus for different HG reference genomes needs to be added //Outputs: // -string: Company name ("23andMe", "AncestryDNA") // -int: ImportVersion (This will help when Seekia has a new way of importing the files. Each company has its own importVersion) // -int64: Time file was generated // -int64: Total number of loci (we will only read a tiny fraction of these into the genome map) // -This does not count loci which have no value (Example: "--") // -bool: IsPhased (allele order corresponds to haplotype) // -map[int64]RawGenomeLocusValue: RSID -> Locus allele value(s) // -error (file not readable) func ReadRawGenomeFile(fileReader io.Reader) (string, int, int64, int64, bool, map[int64]RawGenomeLocusValue, error) { validBasesList := []string{"C", "A", "T", "G", "I", "D"} verifyBase := func(inputBase string)bool{ for _, baseString := range validBasesList{ if (inputBase == baseString){ return true } } return false } //Outputs: // -bool: Able to read // -int64: RSID int64 readRSIDString := func(inputRSID string)(bool, int64){ stringWithoutPrefix, prefixExists := strings.CutPrefix(inputRSID, "rs") if (prefixExists == false){ return false, 0 } result, err := helpers.ConvertStringToInt64(stringWithoutPrefix) if (err != nil){ return false, 0 } return true, result } fileBufioReader := bufio.NewReader(fileReader) firstLine, err := fileBufioReader.ReadString('\n') if (err != nil){ // File does not have another line return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.") } fileIsAncestryDNA := strings.HasPrefix(firstLine, "#AncestryDNA raw data download") if (fileIsAncestryDNA == true){ // AncestryDNA represents all locus values on the Plus strand. (allegedly) secondLine, err := fileBufioReader.ReadString('\n') if (err != nil){ // File does not have another line return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.") } // Second line contains time of creation // Example: // "#This file was generated by AncestryDNA at: 02/22/2022 10:10:10 UTC" // Time is Month -> Day -> Year timeOfCreationString := strings.TrimPrefix(secondLine, "#This file was generated by AncestryDNA at: ") monthString, remainingString, found := strings.Cut(timeOfCreationString, "/") if (found == false){ return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Missing Month") } dayString, remainingString, found := strings.Cut(remainingString, "/") if (found == false){ return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Missing Day") } yearString, _, found := strings.Cut(remainingString, " ") if (found == false){ return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Missing Year") } getMonthObject := func()(time.Month, error){ switch monthString{ case "01":{ return time.January, nil } case "02":{ return time.February, nil } case "03":{ return time.March, nil } case "04":{ return time.April, nil } case "05":{ return time.May, nil } case "06":{ return time.June, nil } case "07":{ return time.July, nil } case "08":{ return time.August, nil } case "09":{ return time.September, nil } case "10":{ return time.October, nil } case "11":{ return time.November, nil } case "12":{ return time.December, nil } } return time.January, errors.New("Malformed AncestryDNA genome file: Invalid month: " + monthString) } monthObject, err := getMonthObject() if (err != nil) { return "", 0, 0, 0, false, nil, err } yearInt, err := helpers.ConvertStringToInt(yearString) if (err != nil) { return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid year: " + yearString) } dayInt, err := helpers.ConvertStringToInt(dayString) if (err != nil) { return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid day: " + dayString) } fileTimeObject := time.Date(yearInt, monthObject, dayInt, 0, 0, 0, 0, time.UTC) fileTimeUnix := fileTimeObject.Unix() // Now we advance bufio reader to the SNP rows for { fileLineString, err := fileBufioReader.ReadString('\n') if (err != nil){ // File does not have another line return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.") } // All SNP rows comes after this line: // "rsid chromosome position allele1 allele2" lineReached := strings.HasPrefix(fileLineString, "rsid") if (lineReached == true){ break } } numberOfLoci := int64(0) genomeMap := make(map[int64]RawGenomeLocusValue) for { fileLineString, err := fileBufioReader.ReadString('\n') if (err != nil){ // File does not have another line break } if (fileLineString == "\n"){ // This is the final line break } numberOfLoci += 1 fileLineWithoutNewline := strings.TrimSuffix(fileLineString, "\n") // Example row (tab delimited) // "rs9777703 1 928836 T T" rowColumnsSlice := strings.Split(fileLineWithoutNewline, "\t") if (len(rowColumnsSlice) != 5){ return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid row detected: " + fileLineString) } snpIdentifier := rowColumnsSlice[0] isValid, locusRSID := readRSIDString(snpIdentifier) if (isValid == false){ // This must be a custom identifier, not an RSID // TODO: Create database to convert them to RSIDs, like the one we have for 23andMe continue } // We check to see if this rsID is used in any part of the genetic analysis, and skip it if it is not. locusFound, _, err := locusMetadata.GetLocusMetadata(locusRSID) if (err != nil) { return "", 0, 0, 0, false, nil, err } if (locusFound == false){ // This locus is not used for any part of the genetic analysis // We don't need to include it in our genome map continue } alleleABase := rowColumnsSlice[3] alleleBBaseRaw := rowColumnsSlice[4] // Allele B has a control character suffix // We will remove it alleleBBase := string(alleleBBaseRaw[0]) if (alleleABase == "0" || alleleBBase == "0"){ // The data was not successfully recorded // We will skip this line continue } aIsValid := verifyBase(alleleABase) if (aIsValid == false){ return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid base detected: " + alleleABase) } bIsValid := verifyBase(alleleBBase) if (bIsValid == false){ return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid base detected: " + alleleBBase) } locusValueObject := RawGenomeLocusValue{ Allele1: alleleABase, Allele2Exists: true, Allele2: alleleBBase, } genomeMap[locusRSID] = locusValueObject } // Each file usually has 500,000+ SNPs if (numberOfLoci < 1000){ return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Not enough bases.") } return "AncestryDNA", 1, fileTimeUnix, numberOfLoci, false, genomeMap, nil } // 23andMe files have the following first line format: // "# This data file generated by 23andMe at: Mon Jan 01 01:00:00 2022" fileIs23andMe := strings.HasPrefix(firstLine, "# This data file generated by 23andMe at:") if (fileIs23andMe == true){ // 23andMe represents all locus values on the Plus strand. timeOfGenerationString := strings.TrimPrefix(firstLine, "# This data file generated by 23andMe at: ") timeSlice := strings.Split(timeOfGenerationString, " ") if (len(timeSlice) != 5 && len(timeSlice) != 6){ return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome file: Invalid time: " + timeOfGenerationString) } monthString := timeSlice[1] getDayAndYearString := func()(string, string){ if (len(timeSlice) == 5){ dayString := timeSlice[2] yearString := timeSlice[4] return dayString, yearString } // len(timeSlice) == 6 dayString := timeSlice[3] yearString := timeSlice[5] return dayString, yearString } dayString, yearString := getDayAndYearString() getMonthObject := func()(time.Month, error){ switch monthString{ case "Jan":{ return time.January, nil } case "Feb":{ return time.February, nil } case "Mar":{ return time.March, nil } case "Apr":{ return time.April, nil } case "May":{ return time.May, nil } case "Jun":{ return time.June, nil } case "Jul":{ return time.July, nil } case "Aug":{ return time.August, nil } case "Sep":{ return time.September, nil } case "Oct":{ return time.October, nil } case "Nov":{ return time.November, nil } case "Dec":{ return time.December, nil } } return time.January, errors.New("Malformed 23andMe genome file: Invalid month: " + monthString) } monthObject, err := getMonthObject() if (err != nil) { return "", 0, 0, 0, false, nil, err } // We have to remove control character and newline suffix yearExtractedString := string(yearString[:4]) yearInt, err := helpers.ConvertStringToInt(yearExtractedString) if (err != nil) { return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome file: Invalid year: " + yearExtractedString) } dayInt, err := helpers.ConvertStringToInt(dayString) if (err != nil) { return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome file: Invalid day: " + dayString) } fileTimeObject := time.Date(yearInt, monthObject, dayInt, 0, 0, 0, 0, time.UTC) fileTimeUnix := fileTimeObject.Unix() // Now we advance bufio reader to the snp rows for { fileLineString, err := fileBufioReader.ReadString('\n') if (err != nil){ // File does not have another line return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.") } // All SNP rows come after this line: // "# rsid chromosome position genotype" lineReached := strings.HasPrefix(fileLineString, "# rsid") if (lineReached == true){ break } } numberOfLoci := int64(0) genomeMap := make(map[int64]RawGenomeLocusValue) for { fileLineString, err := fileBufioReader.ReadString('\n') if (err != nil){ // File does not have another line break } if (fileLineString == "\n"){ // This is the final line break } fileLineWithoutNewline := strings.TrimSuffix(fileLineString, "\n") // Rows look like this // "rs4477212 1 82154 GG" // "rs571313759 1 1181945 --" (-- means no entry) // "i3001920 MT 16470 G" (one base is possible) rowSlice := strings.Split(fileLineWithoutNewline, "\t") if (len(rowSlice) != 4){ return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome data: Invalid SNP row: " + fileLineString) } snpIdentifier := rowSlice[0] snpValueRaw := rowSlice[3] if (len(snpValueRaw) < 2){ return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome data: Invalid SNP row snp value: " + fileLineString) } if (snpValueRaw[0] == '-'){ // Locus value is "--" // Locus value does not exist continue } numberOfLoci += 1 //Outputs: // -bool: rsID found // -int64: rsID value // -error getRSIDIdentifier := func()(bool, int64, error){ isRSID, rsidInt64 := readRSIDString(snpIdentifier) if (isRSID == true){ return true, rsidInt64, nil } // We see if it is a custom locus alias aliasFound, rsidInt64, err := locusMetadata.GetCompanyAliasRSID("23andMe", snpIdentifier) if (err != nil) { return false, 0, err } if (aliasFound == true){ return true, rsidInt64, nil } return false, 0, nil } rsidFound, locusRSID, err := getRSIDIdentifier() if (err != nil) { return "", 0, 0, 0, false, nil, err } if (rsidFound == false){ // RSID is unknown. // It is probably a custom identifier that represents a locus we don't use for the analysis. continue } locusFound, _, err := locusMetadata.GetLocusMetadata(locusRSID) if (err != nil) { return "", 0, 0, 0, false, nil, err } if (locusFound == false){ // This locus is not used for any part of the genetic analysis // We don't need to include it in our genome map continue } getLocusBasesList := func()([]rune, error){ locusBasesList := make([]rune, 0) finalIndex := len(snpValueRaw) - 1 for index, character := range snpValueRaw{ baseIsValid := verifyBase(string(character)) if (baseIsValid == false){ if (index == finalIndex){ // The final index of snpValueRaw is sometimes a control character return locusBasesList, nil } return nil, errors.New("Malformed 23andMe genome file: Invalid SNP base: " + string(character)) } locusBasesList = append(locusBasesList, character) } return locusBasesList, nil } locusBasesList, err := getLocusBasesList() if (err != nil){ return "", 0, 0, 0, false, nil, err } allele1 := string(locusBasesList[0]) locusValueObject := RawGenomeLocusValue{ Allele1: allele1, } if (len(locusBasesList) > 1){ allele2 := string(locusBasesList[1]) locusValueObject.Allele2Exists = true locusValueObject.Allele2 = allele2 } genomeMap[locusRSID] = locusValueObject } return "23andMe", 1, fileTimeUnix, numberOfLoci, false, genomeMap, nil } return "", 0, 0, 0, false, nil, errors.New("Cannot read genome file: File format not known.") }