// extractGiantLoci.go provides a function to extract rsIDs from a Genome-Wide Association Study (GWAS) created by the GIANT consortium // These studies are released as files on this website: // https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files // The files are a tab-delimeted file of rsIDs and their effect on a particular trait // The output file is a .gob encoded []int64 of the top 1000 most impactful loci on the trait. // These files are then saved into the following folders: // -Height -> /resources/geneticReferences/traits/rsIDs // -Obesity -> /resources/geneticReferences/polygenicDiseases/rsIDs // The loci metadata for loci from these files is also imported into the locusMetadata package to enable them to be used in Seekia package main // Here are the files I used to extract causal rsIDs // Trait: Height // Download Link: // https://portals.broadinstitute.org/collaboration/giant/images/4/4e/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz // SHA-256 Checksum: // db18859724675f2f9ba86eff28cb4dacac0629c0b25c9806a6cf2eed6bb8b71e // Trait: Obesity (Waist-to-hip-ratio) // Download Link: // https://portals.broadinstitute.org/collaboration/giant/images/0/09/PublicRelease.WHRadjBMI.C.All.Add.txt.gz // SHA-256 Checksum: // 2a863b0357037ae5c34853342052ed3c59735d6440da0fd15d1cab34b7d49daf import "seekia/resources/geneticReferences/locusMetadata" import "seekia/resources/geneticReferences/modifyLocusMetadata" import "seekia/internal/helpers" import "seekia/internal/localFilesystem" import "log" import "bufio" import "os" import "io" import "math" import "bytes" import "strings" import "encoding/gob" import "errors" import "slices" func main(){ extractGiantLoci := func()error{ // heightOrObesity := "Height" heightOrObesity := "Obesity" filepath := "./Giant" + heightOrObesity + "Study.txt" fileBytes, err := os.ReadFile(filepath) if (err != nil){ return errors.New("Could not open " + filepath + ": " + err.Error()) } fileReader := bytes.NewReader(fileBytes) bufioReader := bufio.NewReader(fileReader) // We first read the header line //These are the columns of the Height file: // Filename: GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_*.gz // - SNPID // -represented as CHR:POS:REF:ALT // - RSID // -RS NUMBER, WHEN AVAILABLE // - CHR // -CHROMOSOME // - The chromosome on which the SNP is located // - POS // -GENOMIC POSITION (BASE PAIR) - hg19/hg37 BUILD // - EFFECT_ALLELE // -The allele that is associated with the effect being studied (Example: the allele associated with increased height) // - OTHER_ALLELE // - EFFECT_ALLELE_FREQ // -(3 significant figures) // -ChatGPT says: The frequency of the effect allele in the study population, reported to 3 significant figures. // - BETA // -(6 significant figures) // -ChatGPT says: The effect size or regression coefficient for the association between // the SNP and the trait of interest, reported to 6 significant figures // - SE // -(3 significant figures) // -Standard error of the effect size, reported to 3 significant figures // - P // -P-VALUE MARGINAL EFFECT // -ChatGPT says: The p-value for the marginal effect of the SNP on the trait of interest // - N // -Total sample size used in the GWAS analysis // These are the columns of the Obesity (WHR) File: // Filename: PublicRelease.WHRadjBMI.C.All.Add.txt.gz // // -1. snpname - dbSNP rsID // -2. chr - chromosome // -3. pos - position // -4. markername - chr:pos // -5. ref - reference allele (hg19 + strand) // -6. alt - alternate allele (hg19 + strand) // -7. beta - beta // -8. se - standard error // -9. pvalue - P value // -10. n - sample size // -11. gmaf/eur_maf - alternate allele frequency in 1000 Genome Combined/European Ancestries // -12. exac_maf/exac_nfe_maf -alternate allele frequency in ExAC Combined/Non-Finnish European Ancestries _, err = bufioReader.ReadString('\n') if (err != nil) { return err } type LocusInfo struct{ Chromosome int Position int Effect float64 } lociInfoMap := make(map[int64]LocusInfo) for { rsidInfoLine, err := bufioReader.ReadString('\n') if (err != nil) { if (err == io.EOF){ // We have reached the end of the file break } // File is corrupt return errors.New("Error reading file: " + err.Error()) } lineElementsSlice := strings.Split(string(rsidInfoLine), "\t") //Outputs: // -bool: Locus information is available // -int64: Locus rsID // -int: Locus Chromosome // -int: Locus Position // -float64: Locus effect // -error getLocusInfo := func()(bool, int64, int, int, float64, error){ if (heightOrObesity == "Height"){ rsidString := lineElementsSlice[1] locusChromosomeString := lineElementsSlice[2] locusPositionString := lineElementsSlice[3] locusEffectString := lineElementsSlice[7] rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs") if (prefixFound == false){ // Some of the rsIDs are not formatted in the "rs123456" format // We skip those // log.Println("rs prefix not found in rsID: " + rsidString) return false, 0, 0, 0, 0, nil } rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix) if (err != nil){ return false, 0, 0, 0, 0, errors.New("RSID is invalid: " + err.Error()) } locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString) if (err != nil){ return false, 0, 0, 0, 0, errors.New("Locus Chromosome is invalid: " + err.Error()) } locusPosition, err := helpers.ConvertStringToInt(locusPositionString) if (err != nil){ return false, 0, 0, 0, 0, errors.New("Locus Position is invalid: " + err.Error()) } locusEffectRaw, err := helpers.ConvertStringToFloat64(locusEffectString) if (err != nil) { if (locusEffectString == ""){ // The database has at least 1 entry with no effect provided return false, 0, 0, 0, 0, nil } return false, 0, 0, 0, 0, errors.New("RSID effect is invalid: " + err.Error()) } return true, rsID, locusChromosome, locusPosition, locusEffectRaw, nil } rsidString := lineElementsSlice[0] locusChromosomeString := lineElementsSlice[1] locusPositionString := lineElementsSlice[2] locusEffectString := lineElementsSlice[6] if (rsidString == "-" || rsidString == ""){ return false, 0, 0, 0, 0, nil } rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs") if (prefixFound == false){ return false, 0, 0, 0, 0, errors.New("Obesity GWAS file contains invalid rsID: " + rsidString) } rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix) if (err != nil){ return false, 0, 0, 0, 0, errors.New("RSID is invalid: " + err.Error()) } locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString) if (err != nil){ if (locusChromosomeString == "X"){ // TODO: Add the ability to read these chromosomes return false, 0, 0, 0, 0, nil } return false, 0, 0, 0, 0, errors.New("Locus Chromosome is invalid: " + err.Error()) } locusPosition, err := helpers.ConvertStringToInt(locusPositionString) if (err != nil){ hasSuffix := strings.HasSuffix(locusPositionString, "+08") if (hasSuffix == true){ // This is an invalid entry in the file return false, 0, 0, 0, 0, nil } return false, 0, 0, 0, 0, errors.New("Locus Position is invalid: " + err.Error()) } locusEffectRaw, err := helpers.ConvertStringToFloat64(locusEffectString) if (err != nil) { return false, 0, 0, 0, 0, errors.New("RSID effect is invalid: " + err.Error()) } return true, rsID, locusChromosome, locusPosition, locusEffectRaw, nil } locusInfoExists, locusRSID, locusChromosome, locusPosition, locusEffectRaw, err := getLocusInfo() if (err != nil) { return err } if (locusInfoExists == false){ continue } // Effect can be negative, we make it positive locusEffect := math.Abs(locusEffectRaw) existingLocusValue, exists := lociInfoMap[locusRSID] if (exists == false){ newLocusInfo := LocusInfo{ Chromosome: locusChromosome, Position: locusPosition, Effect: locusEffect, } lociInfoMap[locusRSID] = newLocusInfo } else { // We see if the effect of this allele is greater // If it is, we update the effect to match the higher effect allele // We do this because we want the most causal rsIDs, not the most causal alleles // When we feed the locus into the neural network, both alleles will be eligible to be trained upon existingChromosome := existingLocusValue.Chromosome existingPosition := existingLocusValue.Position existingEffect := existingLocusValue.Effect if (existingChromosome != locusChromosome){ return errors.New("GIANT gwas contains two rsIDs with conflicting chromosomes.") } if (existingPosition != locusPosition){ return errors.New("GIANT gwas contains two rsIDs with conflicting positions.") } if (existingEffect < locusEffect){ // We update the value with the new effect existingLocusValue.Effect = locusEffect lociInfoMap[locusRSID] = existingLocusValue } } } // We find the top 10,000 rsIDs with the greatest effect rsidsList := helpers.GetListOfMapKeys(lociInfoMap) compareFunction := func(rsid1 int64, rsid2 int64)int{ if (rsid1 == rsid2){ panic("Identical rsIDs found during sort.") } rsid1Info, exists := lociInfoMap[rsid1] if (exists == false){ panic("rsid1 is missing from lociInfoMap.") } rsid2Info, exists := lociInfoMap[rsid2] if (exists == false){ panic("rsid2 is missing from lociInfoMap.") } rsid1Effect := rsid1Info.Effect rsid2Effect := rsid2Info.Effect if (rsid1Effect == rsid2Effect){ return 0 } if (rsid1Effect < rsid2Effect){ return 1 } return -1 } slices.SortFunc(rsidsList, compareFunction) // We take the top 1000 most impactful loci mostImpactfulLoci := rsidsList[:1000] // We add these rsIDs to the locus metadata locusMetadatasToAddList := make([]locusMetadata.LocusMetadata, 0) for _, rsID := range mostImpactfulLoci{ locusInfo, exists := lociInfoMap[rsID] if (exists == false){ return errors.New("lociInfoMap missing rsID.") } locusChromosome := locusInfo.Chromosome locusPosition := locusInfo.Position locusReferencesMap := make(map[string]string) locusReferencesMap[heightOrObesity + " Genome-Wide Association Study (GWAS) created by the GIANT consortium"] = "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files" newLocusMetadata := locusMetadata.LocusMetadata{ RSIDsList: []int64{rsID}, Chromosome: locusChromosome, Position: locusPosition, GeneInfoIsKnown: false, GeneExists: false, GeneNamesList: make([]string, 0), CompanyAliases: make(map[locusMetadata.GeneticsCompany][]string), References: locusReferencesMap, } locusMetadatasToAddList = append(locusMetadatasToAddList, newLocusMetadata) } // We add the locus metadatas _, newLocusMetadataFileBytes, err := modifyLocusMetadata.AddLocusMetadata(locusMetadatasToAddList) if (err != nil) { return err } err = localFilesystem.CreateOrOverwriteFile(newLocusMetadataFileBytes, "./", "NewLocusMetadata.gob") if (err != nil){ return err } // We create the rsIDs list file buffer := new(bytes.Buffer) gobEncoder := gob.NewEncoder(buffer) err = gobEncoder.Encode(mostImpactfulLoci) if (err != nil) { return err } encodedBytes := buffer.Bytes() filename := "Giant" + heightOrObesity + "StudyLoci.gob" err = localFilesystem.CreateOrOverwriteFile(encodedBytes, "./", filename) if (err != nil){ return err } return nil } err := extractGiantLoci() if (err != nil){ log.Println("Extraction failed: " + err.Error()) return } log.Println("Extraction successful!") }