seekia/internal/genetics/readRawGenomes/readRawGenomes.go

547 lines
15 KiB
Go
Raw Normal View History

// readRawGenomes provides functions to read raw genome files from companies like 23andMe and AncestryDNA.
// These are used to create genetic analyses.
package readRawGenomes
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/internal/helpers"
import "bufio"
import "strings"
import "errors"
import "io"
import "time"
// Returns the import version performed by the current version of Seekia
// Seekia will continue to improve the ability to scan raw genome files
// This includes building a database of the rs identifiers for company-specific snp identifiers
// The raw genome metadata within myGenomes.go must be updated whenever the user updates Seekia and a newer import version exists
func GetCurrentCompanyImportVersion(companyName string)(int, error){
if (companyName == "23andMe"){
return 1, nil
}
if (companyName == "AncestryDNA"){
return 1, nil
}
return 0, errors.New("GetCurrentCompanyImportVersion called with invalid company name: " + companyName)
}
type RawGenomeLocusValue struct{
// This is one of the following: "C", "A", "T", "G", "I", "D"
Allele1 string
// This stores if a second base exists
// Base2 will not exist for some chromosomes (Example: Y chromosome)
Allele2Exists bool
// This is one of the following: "C", "A", "T", "G", "I", "D"
Allele2 string
}
// This function will always adjust all locus values to represent the Plus strand
// We only import genes into the genomeMap which are used for the analysis
//
// TODO: The ability to process genomes with a different strand will be added
// A database to know which SNPs are Plus/Minus for different HG reference genomes needs to be added
//Outputs:
// -string: Company name ("23andMe", "AncestryDNA")
// -int: ImportVersion (This will help when Seekia has a new way of importing the files. Each company has its own importVersion)
// -int64: Time file was generated
// -int64: Total number of loci (we will only read a tiny fraction of these into the genome map)
// -This does not count loci which have no value (Example: "--")
// -bool: IsPhased (allele order corresponds to haplotype)
// -map[int64]RawGenomeLocusValue: RSID -> Locus allele value(s)
// -error (file not readable)
func ReadRawGenomeFile(fileReader io.Reader) (string, int, int64, int64, bool, map[int64]RawGenomeLocusValue, error) {
validBasesList := []string{"C", "A", "T", "G", "I", "D"}
verifyBase := func(inputBase string)bool{
for _, baseString := range validBasesList{
if (inputBase == baseString){
return true
}
}
return false
}
//Outputs:
// -bool: Able to read
// -int64: RSID int64
readRSIDString := func(inputRSID string)(bool, int64){
stringWithoutPrefix, prefixExists := strings.CutPrefix(inputRSID, "rs")
if (prefixExists == false){
return false, 0
}
result, err := helpers.ConvertStringToInt64(stringWithoutPrefix)
if (err != nil){
return false, 0
}
return true, result
}
fileBufioReader := bufio.NewReader(fileReader)
firstLine, err := fileBufioReader.ReadString('\n')
if (err != nil){
// File does not have another line
return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.")
}
fileIsAncestryDNA := strings.HasPrefix(firstLine, "#AncestryDNA raw data download")
if (fileIsAncestryDNA == true){
// AncestryDNA represents all locus values on the Plus strand. (allegedly)
secondLine, err := fileBufioReader.ReadString('\n')
if (err != nil){
// File does not have another line
return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.")
}
// Second line contains time of creation
// Example:
// "#This file was generated by AncestryDNA at: 02/22/2022 10:10:10 UTC"
// Time is Month -> Day -> Year
timeOfCreationString := strings.TrimPrefix(secondLine, "#This file was generated by AncestryDNA at: ")
monthString, remainingString, found := strings.Cut(timeOfCreationString, "/")
if (found == false){
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Missing Month")
}
dayString, remainingString, found := strings.Cut(remainingString, "/")
if (found == false){
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Missing Day")
}
yearString, _, found := strings.Cut(remainingString, " ")
if (found == false){
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Missing Year")
}
getMonthObject := func()(time.Month, error){
switch monthString{
case "01":{
return time.January, nil
}
case "02":{
return time.February, nil
}
case "03":{
return time.March, nil
}
case "04":{
return time.April, nil
}
case "05":{
return time.May, nil
}
case "06":{
return time.June, nil
}
case "07":{
return time.July, nil
}
case "08":{
return time.August, nil
}
case "09":{
return time.September, nil
}
case "10":{
return time.October, nil
}
case "11":{
return time.November, nil
}
case "12":{
return time.December, nil
}
}
return time.January, errors.New("Malformed AncestryDNA genome file: Invalid month: " + monthString)
}
monthObject, err := getMonthObject()
if (err != nil) { return "", 0, 0, 0, false, nil, err }
yearInt, err := helpers.ConvertStringToInt(yearString)
if (err != nil) {
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid year: " + yearString)
}
dayInt, err := helpers.ConvertStringToInt(dayString)
if (err != nil) {
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid day: " + dayString)
}
fileTimeObject := time.Date(yearInt, monthObject, dayInt, 0, 0, 0, 0, time.UTC)
fileTimeUnix := fileTimeObject.Unix()
// Now we advance bufio reader to the SNP rows
for {
fileLineString, err := fileBufioReader.ReadString('\n')
if (err != nil){
// File does not have another line
return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.")
}
// All SNP rows comes after this line:
// "rsid chromosome position allele1 allele2"
lineReached := strings.HasPrefix(fileLineString, "rsid")
if (lineReached == true){
break
}
}
numberOfLoci := int64(0)
genomeMap := make(map[int64]RawGenomeLocusValue)
for {
fileLineString, err := fileBufioReader.ReadString('\n')
if (err != nil){
// File does not have another line
break
}
if (fileLineString == "\n"){
// This is the final line
break
}
numberOfLoci += 1
fileLineWithoutNewline := strings.TrimSuffix(fileLineString, "\n")
// Example row (tab delimited)
// "rs9777703 1 928836 T T"
rowColumnsSlice := strings.Split(fileLineWithoutNewline, "\t")
if (len(rowColumnsSlice) != 5){
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid row detected: " + fileLineString)
}
snpIdentifier := rowColumnsSlice[0]
isValid, locusRSID := readRSIDString(snpIdentifier)
if (isValid == false){
// This must be a custom identifier, not an RSID
// TODO: Create database to convert them to RSIDs, like the one we have for 23andMe
continue
}
// We check to see if this rsID is used in any part of the genetic analysis, and skip it if it is not.
locusFound, _, err := locusMetadata.GetLocusMetadata(locusRSID)
if (err != nil) { return "", 0, 0, 0, false, nil, err }
if (locusFound == false){
// This locus is not used for any part of the genetic analysis
// We don't need to include it in our genome map
continue
}
alleleABase := rowColumnsSlice[3]
alleleBBaseRaw := rowColumnsSlice[4]
// Allele B has a control character suffix
// We will remove it
alleleBBase := string(alleleBBaseRaw[0])
if (alleleABase == "0" || alleleBBase == "0"){
// The data was not successfully recorded
// We will skip this line
continue
}
aIsValid := verifyBase(alleleABase)
if (aIsValid == false){
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid base detected: " + alleleABase)
}
bIsValid := verifyBase(alleleBBase)
if (bIsValid == false){
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Invalid base detected: " + alleleBBase)
}
locusValueObject := RawGenomeLocusValue{
Allele1: alleleABase,
Allele2Exists: true,
Allele2: alleleBBase,
}
genomeMap[locusRSID] = locusValueObject
}
// Each file usually has 500,000+ SNPs
if (numberOfLoci < 1000){
return "", 0, 0, 0, false, nil, errors.New("Malformed AncestryDNA genome file: Not enough bases.")
}
return "AncestryDNA", 1, fileTimeUnix, numberOfLoci, false, genomeMap, nil
}
// 23andMe files have the following first line format:
// "# This data file generated by 23andMe at: Mon Jan 01 01:00:00 2022"
fileIs23andMe := strings.HasPrefix(firstLine, "# This data file generated by 23andMe at:")
if (fileIs23andMe == true){
// 23andMe represents all locus values on the Plus strand.
timeOfGenerationString := strings.TrimPrefix(firstLine, "# This data file generated by 23andMe at: ")
timeSlice := strings.Split(timeOfGenerationString, " ")
if (len(timeSlice) != 5 && len(timeSlice) != 6){
return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome file: Invalid time: " + timeOfGenerationString)
}
monthString := timeSlice[1]
getDayAndYearString := func()(string, string){
if (len(timeSlice) == 5){
dayString := timeSlice[2]
yearString := timeSlice[4]
return dayString, yearString
}
// len(timeSlice) == 6
dayString := timeSlice[3]
yearString := timeSlice[5]
return dayString, yearString
}
dayString, yearString := getDayAndYearString()
getMonthObject := func()(time.Month, error){
switch monthString{
case "Jan":{
return time.January, nil
}
case "Feb":{
return time.February, nil
}
case "Mar":{
return time.March, nil
}
case "Apr":{
return time.April, nil
}
case "May":{
return time.May, nil
}
case "Jun":{
return time.June, nil
}
case "Jul":{
return time.July, nil
}
case "Aug":{
return time.August, nil
}
case "Sep":{
return time.September, nil
}
case "Oct":{
return time.October, nil
}
case "Nov":{
return time.November, nil
}
case "Dec":{
return time.December, nil
}
}
return time.January, errors.New("Malformed 23andMe genome file: Invalid month: " + monthString)
}
monthObject, err := getMonthObject()
if (err != nil) { return "", 0, 0, 0, false, nil, err }
// We have to remove control character and newline suffix
yearExtractedString := string(yearString[:4])
yearInt, err := helpers.ConvertStringToInt(yearExtractedString)
if (err != nil) {
return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome file: Invalid year: " + yearExtractedString)
}
dayInt, err := helpers.ConvertStringToInt(dayString)
if (err != nil) {
return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome file: Invalid day: " + dayString)
}
fileTimeObject := time.Date(yearInt, monthObject, dayInt, 0, 0, 0, 0, time.UTC)
fileTimeUnix := fileTimeObject.Unix()
// Now we advance bufio reader to the snp rows
for {
fileLineString, err := fileBufioReader.ReadString('\n')
if (err != nil){
// File does not have another line
return "", 0, 0, 0, false, nil, errors.New("Malformed genome file: Too short.")
}
// All SNP rows come after this line:
// "# rsid chromosome position genotype"
lineReached := strings.HasPrefix(fileLineString, "# rsid")
if (lineReached == true){
break
}
}
numberOfLoci := int64(0)
genomeMap := make(map[int64]RawGenomeLocusValue)
for {
fileLineString, err := fileBufioReader.ReadString('\n')
if (err != nil){
// File does not have another line
break
}
if (fileLineString == "\n"){
// This is the final line
break
}
fileLineWithoutNewline := strings.TrimSuffix(fileLineString, "\n")
// Rows look like this
// "rs4477212 1 82154 GG"
// "rs571313759 1 1181945 --" (-- means no entry)
// "i3001920 MT 16470 G" (one base is possible)
rowSlice := strings.Split(fileLineWithoutNewline, "\t")
if (len(rowSlice) != 4){
return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome data: Invalid SNP row: " + fileLineString)
}
snpIdentifier := rowSlice[0]
snpValueRaw := rowSlice[3]
if (len(snpValueRaw) < 2){
return "", 0, 0, 0, false, nil, errors.New("Malformed 23andMe genome data: Invalid SNP row snp value: " + fileLineString)
}
if (snpValueRaw[0] == '-'){
// Locus value is "--"
// Locus value does not exist
continue
}
numberOfLoci += 1
//Outputs:
// -bool: rsID found
// -int64: rsID value
// -error
getRSIDIdentifier := func()(bool, int64, error){
isRSID, rsidInt64 := readRSIDString(snpIdentifier)
if (isRSID == true){
return true, rsidInt64, nil
}
// We see if it is a custom locus alias
aliasFound, rsidInt64, err := locusMetadata.GetCompanyAliasRSID("23andMe", snpIdentifier)
if (err != nil) { return false, 0, err }
if (aliasFound == true){
return true, rsidInt64, nil
}
return false, 0, nil
}
rsidFound, locusRSID, err := getRSIDIdentifier()
if (err != nil) { return "", 0, 0, 0, false, nil, err }
if (rsidFound == false){
// RSID is unknown.
// It is probably a custom identifier that represents a locus we don't use for the analysis.
continue
}
locusFound, _, err := locusMetadata.GetLocusMetadata(locusRSID)
if (err != nil) { return "", 0, 0, 0, false, nil, err }
if (locusFound == false){
// This locus is not used for any part of the genetic analysis
// We don't need to include it in our genome map
continue
}
getLocusBasesList := func()([]rune, error){
locusBasesList := make([]rune, 0)
finalIndex := len(snpValueRaw) - 1
for index, character := range snpValueRaw{
baseIsValid := verifyBase(string(character))
if (baseIsValid == false){
if (index == finalIndex){
// The final index of snpValueRaw is sometimes a control character
return locusBasesList, nil
}
return nil, errors.New("Malformed 23andMe genome file: Invalid SNP base: " + string(character))
}
locusBasesList = append(locusBasesList, character)
}
return locusBasesList, nil
}
locusBasesList, err := getLocusBasesList()
if (err != nil){ return "", 0, 0, 0, false, nil, err }
allele1 := string(locusBasesList[0])
locusValueObject := RawGenomeLocusValue{
Allele1: allele1,
}
if (len(locusBasesList) > 1){
allele2 := string(locusBasesList[1])
locusValueObject.Allele2Exists = true
locusValueObject.Allele2 = allele2
}
genomeMap[locusRSID] = locusValueObject
}
return "23andMe", 1, fileTimeUnix, numberOfLoci, false, genomeMap, nil
}
return "", 0, 0, 0, false, nil, errors.New("Cannot read genome file: File format not known.")
}