283 lines
8.2 KiB
Go
283 lines
8.2 KiB
Go
|
|
// extractGiantLoci.go provides a function to extract rsIDs from a Genome-Wide Association Study (GWAS) created by the GIANT consortium
|
|
// These studies are released as files on this website:
|
|
// https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files
|
|
// The files are a tab-delimeted file of rsIDs and their effect on a particular trait
|
|
|
|
// The output file is a .gob encoded []int64 of the top 1000 most impactful loci on the trait.
|
|
// These files are then saved into /resources/geneticReferences/traits/rsIDs
|
|
// The loci metadata for loci from these files is also imported into the locusMetadata package to enable them to be used in Seekia
|
|
|
|
package main
|
|
|
|
// Here is the file I used to extract causal rsIDs for height
|
|
// Download link:
|
|
// https://portals.broadinstitute.org/collaboration/giant/images/4/4e/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz
|
|
|
|
//SHA-256 Checksum:
|
|
// db18859724675f2f9ba86eff28cb4dacac0629c0b25c9806a6cf2eed6bb8b71e
|
|
|
|
import "seekia/resources/geneticReferences/locusMetadata"
|
|
import "seekia/resources/geneticReferences/modifyLocusMetadata"
|
|
|
|
import "seekia/internal/helpers"
|
|
import "seekia/internal/localFilesystem"
|
|
|
|
|
|
import "log"
|
|
import "bufio"
|
|
import "os"
|
|
import "io"
|
|
import "math"
|
|
import "bytes"
|
|
import "strings"
|
|
import "encoding/gob"
|
|
import "errors"
|
|
import "slices"
|
|
|
|
|
|
func main(){
|
|
|
|
extractGiantLoci := func()error{
|
|
|
|
fileBytes, err := os.ReadFile("./GiantHeightStudy.txt")
|
|
if (err != nil) {
|
|
return errors.New("Could not open GiantHeightStudy.txt file: " + err.Error())
|
|
}
|
|
|
|
fileReader := bytes.NewReader(fileBytes)
|
|
|
|
bufioReader := bufio.NewReader(fileReader)
|
|
|
|
// We first read the header line
|
|
|
|
//These are the columns of the file:
|
|
|
|
// COLUMN DESCRIPTION FOR FILE NAMED GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_*.gz
|
|
// - SNPID
|
|
// -represented as CHR:POS:REF:ALT)
|
|
// - RSID
|
|
// -RS NUMBER, WHEN AVAILABLE
|
|
// - CHR
|
|
// -CHROMOSOME
|
|
// - The chromosome on which the SNP is located
|
|
// - POS
|
|
// -GENOMIC POSITION (BASE PAIR) - hg19/hg37 BUILD
|
|
// - EFFECT_ALLELE
|
|
// -The allele that is associated with the effect being studied (Example: the allele associated with increased height)
|
|
// - OTHER_ALLELE
|
|
// - EFFECT_ALLELE_FREQ
|
|
// -(3 significant figures)
|
|
// -ChatGPT says: The frequency of the effect allele in the study population, reported to 3 significant figures.
|
|
// - BETA
|
|
// -(6 significant figures)
|
|
// -ChatGPT says: The effect size or regression coefficient for the association between
|
|
// the SNP and the trait of interest, reported to 6 significant figures
|
|
// - SE
|
|
// -(3 significant figures)
|
|
// -Standard error of the effect size, reported to 3 significant figures
|
|
// - P
|
|
// -P-VALUE MARGINAL EFFECT
|
|
// -ChatGPT says: The p-value for the marginal effect of the SNP on the trait of interest
|
|
// - N
|
|
// -Total sample size used in the GWAS analysis
|
|
|
|
_, err = bufioReader.ReadString('\n')
|
|
if (err != nil) { return err }
|
|
|
|
type LocusInfo struct{
|
|
Chromosome int
|
|
Position int
|
|
Effect float64
|
|
}
|
|
|
|
rsidsInfoMap := make(map[int64]LocusInfo)
|
|
|
|
for {
|
|
|
|
rsidInfoLine, err := bufioReader.ReadString('\n')
|
|
if (err != nil) {
|
|
|
|
if (err == io.EOF){
|
|
// We have reached the end of the file
|
|
break
|
|
}
|
|
// File is corrupt
|
|
return errors.New("Error reading file: " + err.Error())
|
|
}
|
|
|
|
lineElementsSlice := strings.Split(string(rsidInfoLine), "\t")
|
|
|
|
rsidString := lineElementsSlice[1]
|
|
rsidChromosomeString := lineElementsSlice[2]
|
|
rsidPositionString := lineElementsSlice[3]
|
|
rsidEffectString := lineElementsSlice[7]
|
|
|
|
rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs")
|
|
if (prefixFound == false){
|
|
// Some of the rsIDs are not formatted in the "rs123456" format
|
|
// We skip those
|
|
//log.Println("rs prefix not found in rsID: " + rsIDString)
|
|
continue
|
|
}
|
|
|
|
rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix)
|
|
if (err != nil){
|
|
return errors.New("RSID is invalid: " + err.Error())
|
|
}
|
|
|
|
rsidChromosome, err := helpers.ConvertStringToInt(rsidChromosomeString)
|
|
if (err != nil){ return err }
|
|
|
|
rsidPosition, err := helpers.ConvertStringToInt(rsidPositionString)
|
|
if (err != nil){ return err }
|
|
|
|
rsidEffectRaw, err := helpers.ConvertStringToFloat64(rsidEffectString)
|
|
if (err != nil) {
|
|
if (rsidEffectString == ""){
|
|
// The database has at least 1 entry with no effect provided
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
|
|
// Effect can be negative, we make it positive
|
|
rsidEffect := math.Abs(rsidEffectRaw)
|
|
|
|
existingLocusValue, exists := rsidsInfoMap[rsID]
|
|
if (exists == false){
|
|
|
|
newLocusInfo := LocusInfo{
|
|
Chromosome: rsidChromosome,
|
|
Position: rsidPosition,
|
|
Effect: rsidEffect,
|
|
}
|
|
|
|
rsidsInfoMap[rsID] = newLocusInfo
|
|
} else {
|
|
|
|
// We see if the effect of this allele is greater
|
|
// If it is, we update the effect to match the higher effect allele
|
|
// We do this because we want the most causal rsIDs, not the most causal alleles
|
|
// When we feed the locus into the neural network, both alleles will be eligible to be trained upon
|
|
|
|
existingChromosome := existingLocusValue.Chromosome
|
|
existingPosition := existingLocusValue.Position
|
|
existingEffect := existingLocusValue.Effect
|
|
|
|
if (existingChromosome != rsidChromosome){
|
|
return errors.New("GIANT gwas contains two rsIDs with conflicting chromosomes.")
|
|
}
|
|
if (existingPosition != rsidPosition){
|
|
return errors.New("GIANT gwas contains two rsIDs with conflicting positions.")
|
|
}
|
|
if (existingEffect < rsidEffect){
|
|
|
|
// We update the value with the new effect
|
|
existingLocusValue.Effect = rsidEffect
|
|
rsidsInfoMap[rsID] = existingLocusValue
|
|
}
|
|
}
|
|
}
|
|
|
|
// We find the top 10,000 rsIDs with the greatest effect
|
|
|
|
rsidsList := helpers.GetListOfMapKeys(rsidsInfoMap)
|
|
|
|
compareFunction := func(rsid1 int64, rsid2 int64)int{
|
|
|
|
if (rsid1 == rsid2){
|
|
panic("Identical rsIDs found during sort.")
|
|
}
|
|
|
|
rsid1Info, exists := rsidsInfoMap[rsid1]
|
|
if (exists == false){
|
|
panic("rsid1 is missing from rsidsInfoMap.")
|
|
}
|
|
|
|
rsid2Info, exists := rsidsInfoMap[rsid2]
|
|
if (exists == false){
|
|
panic("rsid2 is missing from rsidsInfoMap.")
|
|
}
|
|
|
|
rsid1Effect := rsid1Info.Effect
|
|
rsid2Effect := rsid2Info.Effect
|
|
|
|
if (rsid1Effect == rsid2Effect){
|
|
return 0
|
|
}
|
|
if (rsid1Effect < rsid2Effect){
|
|
return 1
|
|
}
|
|
|
|
return -1
|
|
}
|
|
|
|
slices.SortFunc(rsidsList, compareFunction)
|
|
|
|
// We take the top 1000 most impactful loci
|
|
|
|
mostImpactfulLoci := rsidsList[:1000]
|
|
|
|
// We add these rsIDs to the locus metadata
|
|
|
|
locusMetadatasToAddList := make([]locusMetadata.LocusMetadata, 0)
|
|
|
|
for _, rsID := range mostImpactfulLoci{
|
|
|
|
locusInfo, exists := rsidsInfoMap[rsID]
|
|
if (exists == false){
|
|
return errors.New("rsidsInfoMap missing rsID.")
|
|
}
|
|
|
|
locusChromosome := locusInfo.Chromosome
|
|
locusPosition := locusInfo.Position
|
|
|
|
locusReferencesMap := map[string]string{
|
|
"Height Genome-Wide Association Study (GWAS) created by the GIANT consortium": "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files",
|
|
}
|
|
|
|
newLocusMetadata := locusMetadata.LocusMetadata{
|
|
RSIDsList: []int64{rsID},
|
|
Chromosome: locusChromosome,
|
|
Position: locusPosition,
|
|
GeneInfoIsKnown: false,
|
|
GeneExists: false,
|
|
GeneNamesList: make([]string, 0),
|
|
CompanyAliases: make(map[locusMetadata.GeneticsCompany][]string),
|
|
References: locusReferencesMap,
|
|
}
|
|
|
|
locusMetadatasToAddList = append(locusMetadatasToAddList, newLocusMetadata)
|
|
}
|
|
|
|
_, newLocusMetadataFileBytes, err := modifyLocusMetadata.AddLocusMetadata(locusMetadatasToAddList)
|
|
if (err != nil) { return err }
|
|
|
|
err = localFilesystem.CreateOrOverwriteFile(newLocusMetadataFileBytes, "./", "NewLocusMetadata.gob")
|
|
if (err != nil){ return err }
|
|
|
|
buffer := new(bytes.Buffer)
|
|
|
|
gobEncoder := gob.NewEncoder(buffer)
|
|
|
|
err = gobEncoder.Encode(mostImpactfulLoci)
|
|
if (err != nil) { return err }
|
|
|
|
encodedBytes := buffer.Bytes()
|
|
|
|
err = localFilesystem.CreateOrOverwriteFile(encodedBytes, "./", "GiantHeightStudyLoci.gob")
|
|
if (err != nil){ return err }
|
|
|
|
return nil
|
|
}
|
|
|
|
err := extractGiantLoci()
|
|
if (err != nil){
|
|
log.Println("Extraction failed: " + err.Error())
|
|
return
|
|
}
|
|
|
|
log.Println("Extraction successful!")
|
|
}
|
|
|