seekia/internal/genetics/createRawGenomes/createRawGenomes.go

360 lines
12 KiB
Go

// Package createRawGenomes provides functions to create fake raw genome files.
// This package's functions are only used to test the readRawGenomes and createGeneticAnalysis packages.
package createRawGenomes
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/internal/genetics/readRawGenomes"
import "seekia/internal/helpers"
import "seekia/internal/unixTime"
import "time"
import "errors"
import "strings"
// CreateFakeRawGenome_23andMe assembles a randomized raw genome file laid out like a 23andMe export.
// Only use this function for tests
// Outputs:
// -string: Fake raw genome file string
// -int64: Time of fake file generation (midnight UTC on the calendar date written into the file header)
// -int64: Number of loci
// -This does not include loci which have no base pair value in the file (Denoted by: "--")
// -map[int64]readRawGenomes.RawGenomeLocusValue: Raw genome map (rsID -> Locus base pair value)
// -error
func CreateFakeRawGenome_23andMe() (string, int64, int64, map[int64]readRawGenomes.RawGenomeLocusValue, error) {

	initErr := locusMetadata.InitializeLocusMetadataVariables()
	if initErr != nil {
		return "", 0, 0, nil, errors.New("InitializeLocusMetadataVariables failed: " + initErr.Error())
	}

	// Pick a random moment within the last 20 years to act as the file's generation date
	secondsPerYear := unixTime.GetYearUnix()
	latestUnix := time.Now().Unix()
	earliestUnix := latestUnix - (secondsPerYear * 20)

	generatedAt := time.Unix(helpers.GetRandomInt64WithinRange(earliestUnix, latestUnix), 0)
	genYear, genMonth, genDay := generatedAt.Date()

	// Only the calendar date is reported back to the caller, so the time of day is zeroed
	creationUnix := time.Date(genYear, genMonth, genDay, 0, 0, 0, 0, time.UTC).Unix()

	// Header date layout: 3-letter weekday, 3-letter month, 2-digit day, fixed clock time, year
	// Example: "Mon Jan 02 12:34:56 2006"
	headerDate := generatedAt.Format("Mon Jan 02") + " 12:34:56 " + generatedAt.Format("2006")

	// The whole file is accumulated in this builder
	var fileBuilder strings.Builder

	headerText := `# This data file generated by 23andMe at: ` + headerDate + `
#
# This file contains raw genotype data, including data that is not used in 23andMe reports.
# This data has undergone a general quality review however only a subset of markers have been
# individually validated for accuracy. As such, this data is suitable only for research,
# educational, and informational use and not for medical or other use.
#
# Below is a text version of your data. Fields are TAB-separated
# Each line corresponds to a single SNP. For each SNP, we provide its identifier
# (an rsid or an internal id), its location on the reference human genome, and the
# genotype call oriented with respect to the plus strand on the human reference sequence.
# We are using reference human assembly build 37 (also known as Annotation Release 104).
# Note that it is possible that data downloaded at different times may be different due to ongoing
# improvements in our ability to call genotypes. More information about these changes can be found at:
# https://you.23andme.com/p/<IDENTIFIER>/tools/data/download/
#
# More information on reference human assembly builds:
# https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/
#
# rsid chromosome position genotype
`

	if _, writeErr := fileBuilder.WriteString(headerText); writeErr != nil {
		return "", 0, 0, nil, errors.New("Failed to WriteString to string builder: " + writeErr.Error())
	}

	targetLociCount := helpers.GetRandomInt64WithinRange(500000, 600000)
	validLociCount := int64(0)

	// Set of every rsID already written to the file (including loci with no genotype)
	usedRSIDs := make(map[int64]struct{})

	// Returned to the caller so a reader's output can be compared against what was written
	// Map Structure: rsID -> Locus value object
	expectedLoci := make(map[int64]readRawGenomes.RawGenomeLocusValue)

	// Set of every position already written to the file
	usedPositions := make(map[int]struct{})

	alleleChoices := []string{"G", "C", "A", "T", "I", "D"}

	for validLociCount < targetLociCount {

		rsID := helpers.GetRandomInt64WithinRange(1, 10000000)
		if _, taken := usedRSIDs[rsID]; taken {
			// Duplicate rsID, draw again
			continue
		}

		chromosome := helpers.GetRandomIntWithinRange(1, 26)
		position := helpers.GetRandomIntWithinRange(1, 10000000)
		if _, taken := usedPositions[position]; taken {
			// Duplicate position, draw again
			continue
		}

		linePrefix := "rs" + helpers.ConvertInt64ToString(rsID) + "\t" + helpers.ConvertIntToString(chromosome) + "\t" + helpers.ConvertIntToString(position) + "\t"

		// ~1/1000 loci will be unknown, written to the file as "--"
		genotypeKnown := helpers.GetRandomIntWithinRange(1, 1000) != 1

		genotype := "--"
		firstAllele := ""
		secondAllele := ""

		if genotypeKnown {
			var pickErr error

			firstAllele, pickErr = helpers.GetRandomItemFromList(alleleChoices)
			if pickErr != nil {
				return "", 0, 0, nil, errors.New("getBasePair failed: " + pickErr.Error())
			}
			secondAllele, pickErr = helpers.GetRandomItemFromList(alleleChoices)
			if pickErr != nil {
				return "", 0, 0, nil, errors.New("getBasePair failed: " + pickErr.Error())
			}

			genotype = firstAllele + secondAllele
		}

		// Each data line is terminated with a carriage return (byte 13) followed by a newline
		if _, writeErr := fileBuilder.WriteString(linePrefix + genotype + "\r\n"); writeErr != nil {
			return "", 0, 0, nil, errors.New("Failed to WriteString to string builder: " + writeErr.Error())
		}

		usedRSIDs[rsID] = struct{}{}
		usedPositions[position] = struct{}{}

		if genotypeKnown {
			// Only loci with a genotype count towards the total and appear in the returned map
			validLociCount++
			expectedLoci[rsID] = readRawGenomes.RawGenomeLocusValue{
				Allele1:       firstAllele,
				Allele2Exists: true,
				Allele2:       secondAllele,
			}
		}
	}

	return fileBuilder.String(), creationUnix, validLociCount, expectedLoci, nil
}
// Only use this function for tests
// Outputs:
// -string: Fake raw genome file string
// -int64: File creation time
// -int64: Number of loci in file
// -map[int64]readRawGenomes.RawGenomeLocusValue: Raw genome map (rsID -> Locus base pair value)
// -error
func CreateFakeRawGenome_AncestryDNA()(string, int64, int64, map[int64]readRawGenomes.RawGenomeLocusValue, error){
err := locusMetadata.InitializeLocusMetadataVariables()
if (err != nil){
return "", 0, 0, nil, errors.New("InitializeLocusMetadataVariables failed: " + err.Error())
}
yearUnix := unixTime.GetYearUnix()
maximumTime := time.Now().Unix()
minimumTime := maximumTime - (yearUnix*20)
randomUnixTime := helpers.GetRandomInt64WithinRange(minimumTime, maximumTime)
randomTimeObject := time.Unix(randomUnixTime, 0)
timeMonthInt := randomTimeObject.Month()
timeDayInt := randomTimeObject.Day()
timeYearInt := randomTimeObject.Year()
fileCreationTimeObject := time.Date(timeYearInt, timeMonthInt, timeDayInt, 0, 0, 0, 0, time.UTC)
fileCreationTimeUnix := fileCreationTimeObject.Unix()
timeDayString := helpers.ConvertIntToString(timeDayInt)
timeYearString := helpers.ConvertIntToString(timeYearInt)
getTimeMonthFormatted := func()string{
timeMonthString := helpers.ConvertIntToString(int(timeMonthInt))
if (len(timeMonthString) == 2){
return timeMonthString
}
// We have to add 0 prefix
result := "0" + timeMonthString
return result
}
timeMonthFormatted := getTimeMonthFormatted()
fileTimeString := timeMonthFormatted + "/" + timeDayString + "/" + timeYearString
// We use this builder to create the file string
var fileContentsBuilder strings.Builder
fileHeader := `#AncestryDNA raw data download
#This file was generated by AncestryDNA at: ` + fileTimeString + ` 10:00:00 UTC
#Data was collected using AncestryDNA array version: V2.0
#Data is formatted using AncestryDNA converter version: V1.0
#Below is a text version of your DNA file from Ancestry.com DNA, LLC. THIS
#INFORMATION IS FOR YOUR PERSONAL USE AND IS INTENDED FOR GENEALOGICAL RESEARCH
#ONLY. IT IS NOT INTENDED FOR MEDICAL, DIAGNOSTIC, OR HEALTH PURPOSES. THE EXPORTED DATA IS
#SUBJECT TO THE AncestryDNA TERMS AND CONDITIONS, BUT PLEASE BE AWARE THAT THE
#DOWNLOADED DATA WILL NO LONGER BE PROTECTED BY OUR SECURITY MEASURES.
#WHEN YOU DOWNLOAD YOUR RAW DNA DATA, YOU ASSUME ALL RISK OF STORING,
#SECURING AND PROTECTING YOUR DATA. FOR MORE INFORMATION, SEE ANCESTRYDNA FAQS.
#
#Genetic data is provided below as five TAB delimited columns. Each line
#corresponds to a SNP. Column one provides the SNP identifier (rsID where
#possible). Columns two and three contain the chromosome and basepair position
#of the SNP using human reference build 37.1 coordinates. Columns four and five
#contain the two alleles observed at this SNP (genotype). The genotype is reported
#on the forward (+) strand with respect to the human reference.
rsid chromosome position allele1 allele2
`
_, err = fileContentsBuilder.WriteString(fileHeader)
if (err != nil){
return "", 0, 0, nil, errors.New("Failed to WriteString to string builder: " + err.Error())
}
numberOfLociToAdd := helpers.GetRandomInt64WithinRange(500000, 600000)
numberOfAddedLoci := int64(0)
// We use this map to avoid adding duplicate rsIDs and to verify results of file read
// Map Structure: rsID -> Base pair (Example: "G,G", "I,D")
fileRSIDsMap := make(map[int64]readRawGenomes.RawGenomeLocusValue)
// We use this map to avoid adding duplicate positions
addedPositionsMap := make(map[int]struct{})
allelePossibilities := []string{"0", "G", "C", "A", "T", "I", "D"}
for numberOfAddedLoci < numberOfLociToAdd{
locusRSID := helpers.GetRandomInt64WithinRange(1, 10000000)
_, exists := fileRSIDsMap[locusRSID]
if (exists == true){
// We try again to get a unique rsid
continue
}
locusChromosome := helpers.GetRandomIntWithinRange(1, 26)
locusPosition := helpers.GetRandomIntWithinRange(1, 10000000)
_, exists = addedPositionsMap[locusPosition]
if (exists == true){
// We try again to get a unique position
continue
}
locusRSIDString := helpers.ConvertInt64ToString(locusRSID)
locusChromosomeString := helpers.ConvertIntToString(locusChromosome)
locusPositionString := helpers.ConvertIntToString(locusPosition)
alleleA, err := helpers.GetRandomItemFromList(allelePossibilities)
if (err != nil){
return "", 0, 0, nil, errors.New("GetRandomItemFromList failed: " + err.Error())
}
alleleB, err := helpers.GetRandomItemFromList(allelePossibilities)
if (err != nil){
return "", 0, 0, nil, errors.New("GetRandomItemFromList failed: " + err.Error())
}
newLine := "rs" + locusRSIDString + "\t" + locusChromosomeString + "\t" + locusPositionString + "\t" + alleleA + "\t" + alleleB + "\n"
_, err = fileContentsBuilder.WriteString(newLine)
if (err != nil){
return "", 0, 0, nil, errors.New("Failed to WriteString to string builder: " + err.Error())
}
locusValueObject := readRawGenomes.RawGenomeLocusValue{
Allele1: alleleA,
Allele2Exists: true,
Allele2: alleleB,
}
fileRSIDsMap[locusRSID] = locusValueObject
addedPositionsMap[locusPosition] = struct{}{}
numberOfAddedLoci += 1
}
fileString := fileContentsBuilder.String()
return fileString, fileCreationTimeUnix, numberOfAddedLoci, fileRSIDsMap, nil
}