seekia/utilities/extractGiantLoci/extractGiantLoci.go
2024-08-14 03:37:18 +00:00

393 lines
12 KiB
Go

// extractGiantLoci.go provides a function to extract rsIDs from a Genome-Wide Association Study (GWAS) created by the GIANT consortium
// These studies are released as files on this website:
// https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files
// The files are a tab-delimeted file of rsIDs and their effect on a particular trait
// The output file is a .gob encoded []int64 of the top 1000 most impactful loci on the trait.
// These files are then saved into the following folders:
// -Height -> /resources/geneticReferences/traits/rsIDs
// -Obesity -> /resources/geneticReferences/polygenicDiseases/rsIDs
// The loci metadata for loci from these files is also imported into the locusMetadata package to enable them to be used in Seekia
package main
// Here are the files I used to extract causal rsIDs
// Trait: Height
// Download Link:
// https://portals.broadinstitute.org/collaboration/giant/images/4/4e/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz
// SHA-256 Checksum:
// db18859724675f2f9ba86eff28cb4dacac0629c0b25c9806a6cf2eed6bb8b71e
// Trait: Obesity (Waist-to-hip-ratio)
// Download Link:
// https://portals.broadinstitute.org/collaboration/giant/images/0/09/PublicRelease.WHRadjBMI.C.All.Add.txt.gz
// SHA-256 Checksum:
// 2a863b0357037ae5c34853342052ed3c59735d6440da0fd15d1cab34b7d49daf
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/resources/geneticReferences/modifyLocusMetadata"
import "seekia/internal/helpers"
import "seekia/internal/localFilesystem"
import "log"
import "bufio"
import "os"
import "io"
import "math"
import "bytes"
import "strings"
import "encoding/gob"
import "errors"
import "slices"
func main(){
extractGiantLoci := func()error{
// heightOrObesity := "Height"
heightOrObesity := "Obesity"
filepath := "./Giant" + heightOrObesity + "Study.txt"
fileBytes, err := os.ReadFile(filepath)
if (err != nil){
return errors.New("Could not open " + filepath + ": " + err.Error())
}
fileReader := bytes.NewReader(fileBytes)
bufioReader := bufio.NewReader(fileReader)
// We first read the header line
//These are the columns of the Height file:
// Filename: GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_*.gz
// - SNPID
// -represented as CHR:POS:REF:ALT
// - RSID
// -RS NUMBER, WHEN AVAILABLE
// - CHR
// -CHROMOSOME
// - The chromosome on which the SNP is located
// - POS
// -GENOMIC POSITION (BASE PAIR) - hg19/hg37 BUILD
// - EFFECT_ALLELE
// -The allele that is associated with the effect being studied (Example: the allele associated with increased height)
// - OTHER_ALLELE
// - EFFECT_ALLELE_FREQ
// -(3 significant figures)
// -ChatGPT says: The frequency of the effect allele in the study population, reported to 3 significant figures.
// - BETA
// -(6 significant figures)
// -ChatGPT says: The effect size or regression coefficient for the association between
// the SNP and the trait of interest, reported to 6 significant figures
// - SE
// -(3 significant figures)
// -Standard error of the effect size, reported to 3 significant figures
// - P
// -P-VALUE MARGINAL EFFECT
// -ChatGPT says: The p-value for the marginal effect of the SNP on the trait of interest
// - N
// -Total sample size used in the GWAS analysis
// These are the columns of the Obesity (WHR) File:
// Filename: PublicRelease.WHRadjBMI.C.All.Add.txt.gz
//
// -1. snpname - dbSNP rsID
// -2. chr - chromosome
// -3. pos - position
// -4. markername - chr:pos
// -5. ref - reference allele (hg19 + strand)
// -6. alt - alternate allele (hg19 + strand)
// -7. beta - beta
// -8. se - standard error
// -9. pvalue - P value
// -10. n - sample size
// -11. gmaf/eur_maf - alternate allele frequency in 1000 Genome Combined/European Ancestries
// -12. exac_maf/exac_nfe_maf -alternate allele frequency in ExAC Combined/Non-Finnish European Ancestries
_, err = bufioReader.ReadString('\n')
if (err != nil) { return err }
type LocusInfo struct{
Chromosome int
Position int
Effect float64
}
lociInfoMap := make(map[int64]LocusInfo)
for {
rsidInfoLine, err := bufioReader.ReadString('\n')
if (err != nil) {
if (err == io.EOF){
// We have reached the end of the file
break
}
// File is corrupt
return errors.New("Error reading file: " + err.Error())
}
lineElementsSlice := strings.Split(string(rsidInfoLine), "\t")
//Outputs:
// -bool: Locus information is available
// -int64: Locus rsID
// -int: Locus Chromosome
// -int: Locus Position
// -float64: Locus effect
// -error
getLocusInfo := func()(bool, int64, int, int, float64, error){
if (heightOrObesity == "Height"){
rsidString := lineElementsSlice[1]
locusChromosomeString := lineElementsSlice[2]
locusPositionString := lineElementsSlice[3]
locusEffectString := lineElementsSlice[7]
rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs")
if (prefixFound == false){
// Some of the rsIDs are not formatted in the "rs123456" format
// We skip those
// log.Println("rs prefix not found in rsID: " + rsidString)
return false, 0, 0, 0, 0, nil
}
rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix)
if (err != nil){
return false, 0, 0, 0, 0, errors.New("RSID is invalid: " + err.Error())
}
locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString)
if (err != nil){
return false, 0, 0, 0, 0, errors.New("Locus Chromosome is invalid: " + err.Error())
}
locusPosition, err := helpers.ConvertStringToInt(locusPositionString)
if (err != nil){
return false, 0, 0, 0, 0, errors.New("Locus Position is invalid: " + err.Error())
}
locusEffectRaw, err := helpers.ConvertStringToFloat64(locusEffectString)
if (err != nil) {
if (locusEffectString == ""){
// The database has at least 1 entry with no effect provided
return false, 0, 0, 0, 0, nil
}
return false, 0, 0, 0, 0, errors.New("RSID effect is invalid: " + err.Error())
}
return true, rsID, locusChromosome, locusPosition, locusEffectRaw, nil
}
rsidString := lineElementsSlice[0]
locusChromosomeString := lineElementsSlice[1]
locusPositionString := lineElementsSlice[2]
locusEffectString := lineElementsSlice[6]
if (rsidString == "-" || rsidString == ""){
return false, 0, 0, 0, 0, nil
}
rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs")
if (prefixFound == false){
return false, 0, 0, 0, 0, errors.New("Obesity GWAS file contains invalid rsID: " + rsidString)
}
rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix)
if (err != nil){
return false, 0, 0, 0, 0, errors.New("RSID is invalid: " + err.Error())
}
locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString)
if (err != nil){
if (locusChromosomeString == "X"){
// TODO: Add the ability to read these chromosomes
return false, 0, 0, 0, 0, nil
}
return false, 0, 0, 0, 0, errors.New("Locus Chromosome is invalid: " + err.Error())
}
locusPosition, err := helpers.ConvertStringToInt(locusPositionString)
if (err != nil){
hasSuffix := strings.HasSuffix(locusPositionString, "+08")
if (hasSuffix == true){
// This is an invalid entry in the file
return false, 0, 0, 0, 0, nil
}
return false, 0, 0, 0, 0, errors.New("Locus Position is invalid: " + err.Error())
}
locusEffectRaw, err := helpers.ConvertStringToFloat64(locusEffectString)
if (err != nil) {
return false, 0, 0, 0, 0, errors.New("RSID effect is invalid: " + err.Error())
}
return true, rsID, locusChromosome, locusPosition, locusEffectRaw, nil
}
locusInfoExists, locusRSID, locusChromosome, locusPosition, locusEffectRaw, err := getLocusInfo()
if (err != nil) { return err }
if (locusInfoExists == false){
continue
}
// Effect can be negative, we make it positive
locusEffect := math.Abs(locusEffectRaw)
existingLocusValue, exists := lociInfoMap[locusRSID]
if (exists == false){
newLocusInfo := LocusInfo{
Chromosome: locusChromosome,
Position: locusPosition,
Effect: locusEffect,
}
lociInfoMap[locusRSID] = newLocusInfo
} else {
// We see if the effect of this allele is greater
// If it is, we update the effect to match the higher effect allele
// We do this because we want the most causal rsIDs, not the most causal alleles
// When we feed the locus into the neural network, both alleles will be eligible to be trained upon
existingChromosome := existingLocusValue.Chromosome
existingPosition := existingLocusValue.Position
existingEffect := existingLocusValue.Effect
if (existingChromosome != locusChromosome){
return errors.New("GIANT gwas contains two rsIDs with conflicting chromosomes.")
}
if (existingPosition != locusPosition){
return errors.New("GIANT gwas contains two rsIDs with conflicting positions.")
}
if (existingEffect < locusEffect){
// We update the value with the new effect
existingLocusValue.Effect = locusEffect
lociInfoMap[locusRSID] = existingLocusValue
}
}
}
// We find the top 10,000 rsIDs with the greatest effect
rsidsList := helpers.GetListOfMapKeys(lociInfoMap)
compareFunction := func(rsid1 int64, rsid2 int64)int{
if (rsid1 == rsid2){
panic("Identical rsIDs found during sort.")
}
rsid1Info, exists := lociInfoMap[rsid1]
if (exists == false){
panic("rsid1 is missing from lociInfoMap.")
}
rsid2Info, exists := lociInfoMap[rsid2]
if (exists == false){
panic("rsid2 is missing from lociInfoMap.")
}
rsid1Effect := rsid1Info.Effect
rsid2Effect := rsid2Info.Effect
if (rsid1Effect == rsid2Effect){
return 0
}
if (rsid1Effect < rsid2Effect){
return 1
}
return -1
}
slices.SortFunc(rsidsList, compareFunction)
// We take the top 1000 most impactful loci
mostImpactfulLoci := rsidsList[:1000]
// We add these rsIDs to the locus metadata
locusMetadatasToAddList := make([]locusMetadata.LocusMetadata, 0)
for _, rsID := range mostImpactfulLoci{
locusInfo, exists := lociInfoMap[rsID]
if (exists == false){
return errors.New("lociInfoMap missing rsID.")
}
locusChromosome := locusInfo.Chromosome
locusPosition := locusInfo.Position
locusReferencesMap := make(map[string]string)
locusReferencesMap[heightOrObesity + " Genome-Wide Association Study (GWAS) created by the GIANT consortium"] = "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files"
newLocusMetadata := locusMetadata.LocusMetadata{
RSIDsList: []int64{rsID},
Chromosome: locusChromosome,
Position: locusPosition,
GeneInfoIsKnown: false,
GeneExists: false,
GeneNamesList: make([]string, 0),
CompanyAliases: make(map[locusMetadata.GeneticsCompany][]string),
References: locusReferencesMap,
}
locusMetadatasToAddList = append(locusMetadatasToAddList, newLocusMetadata)
}
// We add the locus metadatas
_, newLocusMetadataFileBytes, err := modifyLocusMetadata.AddLocusMetadata(locusMetadatasToAddList)
if (err != nil) { return err }
err = localFilesystem.CreateOrOverwriteFile(newLocusMetadataFileBytes, "./", "NewLocusMetadata.gob")
if (err != nil){ return err }
// We create the rsIDs list file
buffer := new(bytes.Buffer)
gobEncoder := gob.NewEncoder(buffer)
err = gobEncoder.Encode(mostImpactfulLoci)
if (err != nil) { return err }
encodedBytes := buffer.Bytes()
filename := "Giant" + heightOrObesity + "StudyLoci.gob"
err = localFilesystem.CreateOrOverwriteFile(encodedBytes, "./", filename)
if (err != nil){ return err }
return nil
}
err := extractGiantLoci()
if (err != nil){
log.Println("Extraction failed: " + err.Error())
return
}
log.Println("Extraction successful!")
}