seekia/utilities/extractGiantLoci/extractGiantLoci.go

283 lines
8.2 KiB
Go

// extractGiantLoci.go provides a function to extract rsIDs from a Genome-Wide Association Study (GWAS) created by the GIANT consortium
// These studies are released as files on this website:
// https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files
// The files are a tab-delimeted file of rsIDs and their effect on a particular trait
// The output file is a .gob encoded []int64 of the top 1000 most impactful loci on the trait.
// These files are then saved into /resources/geneticReferences/traits/rsIDs
// The loci metadata for loci from these files is also imported into the locusMetadata package to enable them to be used in Seekia
package main
// Here is the file I used to extract causal rsIDs for height
// Download link:
// https://portals.broadinstitute.org/collaboration/giant/images/4/4e/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz
//SHA-256 Checksum:
// db18859724675f2f9ba86eff28cb4dacac0629c0b25c9806a6cf2eed6bb8b71e
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/resources/geneticReferences/modifyLocusMetadata"
import "seekia/internal/helpers"
import "seekia/internal/localFilesystem"
import "log"
import "bufio"
import "os"
import "io"
import "math"
import "bytes"
import "strings"
import "encoding/gob"
import "errors"
import "slices"
func main(){
extractGiantLoci := func()error{
fileBytes, err := os.ReadFile("./GiantHeightStudy.txt")
if (err != nil) {
return errors.New("Could not open GiantHeightStudy.txt file: " + err.Error())
}
fileReader := bytes.NewReader(fileBytes)
bufioReader := bufio.NewReader(fileReader)
// We first read the header line
//These are the columns of the file:
// COLUMN DESCRIPTION FOR FILE NAMED GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_*.gz
// - SNPID
// -represented as CHR:POS:REF:ALT)
// - RSID
// -RS NUMBER, WHEN AVAILABLE
// - CHR
// -CHROMOSOME
// - The chromosome on which the SNP is located
// - POS
// -GENOMIC POSITION (BASE PAIR) - hg19/hg37 BUILD
// - EFFECT_ALLELE
// -The allele that is associated with the effect being studied (Example: the allele associated with increased height)
// - OTHER_ALLELE
// - EFFECT_ALLELE_FREQ
// -(3 significant figures)
// -ChatGPT says: The frequency of the effect allele in the study population, reported to 3 significant figures.
// - BETA
// -(6 significant figures)
// -ChatGPT says: The effect size or regression coefficient for the association between
// the SNP and the trait of interest, reported to 6 significant figures
// - SE
// -(3 significant figures)
// -Standard error of the effect size, reported to 3 significant figures
// - P
// -P-VALUE MARGINAL EFFECT
// -ChatGPT says: The p-value for the marginal effect of the SNP on the trait of interest
// - N
// -Total sample size used in the GWAS analysis
_, err = bufioReader.ReadString('\n')
if (err != nil) { return err }
type LocusInfo struct{
Chromosome int
Position int
Effect float64
}
rsidsInfoMap := make(map[int64]LocusInfo)
for {
rsidInfoLine, err := bufioReader.ReadString('\n')
if (err != nil) {
if (err == io.EOF){
// We have reached the end of the file
break
}
// File is corrupt
return errors.New("Error reading file: " + err.Error())
}
lineElementsSlice := strings.Split(string(rsidInfoLine), "\t")
rsidString := lineElementsSlice[1]
rsidChromosomeString := lineElementsSlice[2]
rsidPositionString := lineElementsSlice[3]
rsidEffectString := lineElementsSlice[7]
rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs")
if (prefixFound == false){
// Some of the rsIDs are not formatted in the "rs123456" format
// We skip those
//log.Println("rs prefix not found in rsID: " + rsIDString)
continue
}
rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix)
if (err != nil){
return errors.New("RSID is invalid: " + err.Error())
}
rsidChromosome, err := helpers.ConvertStringToInt(rsidChromosomeString)
if (err != nil){ return err }
rsidPosition, err := helpers.ConvertStringToInt(rsidPositionString)
if (err != nil){ return err }
rsidEffectRaw, err := helpers.ConvertStringToFloat64(rsidEffectString)
if (err != nil) {
if (rsidEffectString == ""){
// The database has at least 1 entry with no effect provided
continue
}
return err
}
// Effect can be negative, we make it positive
rsidEffect := math.Abs(rsidEffectRaw)
existingLocusValue, exists := rsidsInfoMap[rsID]
if (exists == false){
newLocusInfo := LocusInfo{
Chromosome: rsidChromosome,
Position: rsidPosition,
Effect: rsidEffect,
}
rsidsInfoMap[rsID] = newLocusInfo
} else {
// We see if the effect of this allele is greater
// If it is, we update the effect to match the higher effect allele
// We do this because we want the most causal rsIDs, not the most causal alleles
// When we feed the locus into the neural network, both alleles will be eligible to be trained upon
existingChromosome := existingLocusValue.Chromosome
existingPosition := existingLocusValue.Position
existingEffect := existingLocusValue.Effect
if (existingChromosome != rsidChromosome){
return errors.New("GIANT gwas contains two rsIDs with conflicting chromosomes.")
}
if (existingPosition != rsidPosition){
return errors.New("GIANT gwas contains two rsIDs with conflicting positions.")
}
if (existingEffect < rsidEffect){
// We update the value with the new effect
existingLocusValue.Effect = rsidEffect
rsidsInfoMap[rsID] = existingLocusValue
}
}
}
// We find the top 10,000 rsIDs with the greatest effect
rsidsList := helpers.GetListOfMapKeys(rsidsInfoMap)
compareFunction := func(rsid1 int64, rsid2 int64)int{
if (rsid1 == rsid2){
panic("Identical rsIDs found during sort.")
}
rsid1Info, exists := rsidsInfoMap[rsid1]
if (exists == false){
panic("rsid1 is missing from rsidsInfoMap.")
}
rsid2Info, exists := rsidsInfoMap[rsid2]
if (exists == false){
panic("rsid2 is missing from rsidsInfoMap.")
}
rsid1Effect := rsid1Info.Effect
rsid2Effect := rsid2Info.Effect
if (rsid1Effect == rsid2Effect){
return 0
}
if (rsid1Effect < rsid2Effect){
return 1
}
return -1
}
slices.SortFunc(rsidsList, compareFunction)
// We take the top 1000 most impactful loci
mostImpactfulLoci := rsidsList[:1000]
// We add these rsIDs to the locus metadata
locusMetadatasToAddList := make([]locusMetadata.LocusMetadata, 0)
for _, rsID := range mostImpactfulLoci{
locusInfo, exists := rsidsInfoMap[rsID]
if (exists == false){
return errors.New("rsidsInfoMap missing rsID.")
}
locusChromosome := locusInfo.Chromosome
locusPosition := locusInfo.Position
locusReferencesMap := map[string]string{
"Height Genome-Wide Association Study (GWAS) created by the GIANT consortium": "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files",
}
newLocusMetadata := locusMetadata.LocusMetadata{
RSIDsList: []int64{rsID},
Chromosome: locusChromosome,
Position: locusPosition,
GeneInfoIsKnown: false,
GeneExists: false,
GeneNamesList: make([]string, 0),
CompanyAliases: make(map[locusMetadata.GeneticsCompany][]string),
References: locusReferencesMap,
}
locusMetadatasToAddList = append(locusMetadatasToAddList, newLocusMetadata)
}
_, newLocusMetadataFileBytes, err := modifyLocusMetadata.AddLocusMetadata(locusMetadatasToAddList)
if (err != nil) { return err }
err = localFilesystem.CreateOrOverwriteFile(newLocusMetadataFileBytes, "./", "NewLocusMetadata.gob")
if (err != nil){ return err }
buffer := new(bytes.Buffer)
gobEncoder := gob.NewEncoder(buffer)
err = gobEncoder.Encode(mostImpactfulLoci)
if (err != nil) { return err }
encodedBytes := buffer.Bytes()
err = localFilesystem.CreateOrOverwriteFile(encodedBytes, "./", "GiantHeightStudyLoci.gob")
if (err != nil){ return err }
return nil
}
err := extractGiantLoci()
if (err != nil){
log.Println("Extraction failed: " + err.Error())
return
}
log.Println("Extraction successful!")
}