393 lines
12 KiB
Go
393 lines
12 KiB
Go
|
|
// extractGiantLoci.go provides a function to extract rsIDs from a Genome-Wide Association Study (GWAS) created by the GIANT consortium
|
|
// These studies are released as files on this website:
|
|
// https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files
|
|
// The files are a tab-delimeted file of rsIDs and their effect on a particular trait
|
|
|
|
// The output file is a .gob encoded []int64 of the top 1000 most impactful loci on the trait.
|
|
// These files are then saved into the following folders:
|
|
// -Height -> /resources/geneticReferences/traits/rsIDs
|
|
// -Obesity -> /resources/geneticReferences/polygenicDiseases/rsIDs
|
|
// The loci metadata for loci from these files is also imported into the locusMetadata package to enable them to be used in Seekia
|
|
|
|
package main
|
|
|
|
// Here are the files I used to extract causal rsIDs
|
|
|
|
// Trait: Height
|
|
// Download Link:
|
|
// https://portals.broadinstitute.org/collaboration/giant/images/4/4e/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz
|
|
// SHA-256 Checksum:
|
|
// db18859724675f2f9ba86eff28cb4dacac0629c0b25c9806a6cf2eed6bb8b71e
|
|
|
|
// Trait: Obesity (Waist-to-hip-ratio)
|
|
// Download Link:
|
|
// https://portals.broadinstitute.org/collaboration/giant/images/0/09/PublicRelease.WHRadjBMI.C.All.Add.txt.gz
|
|
// SHA-256 Checksum:
|
|
// 2a863b0357037ae5c34853342052ed3c59735d6440da0fd15d1cab34b7d49daf
|
|
|
|
|
|
import "seekia/resources/geneticReferences/locusMetadata"
|
|
import "seekia/resources/geneticReferences/modifyLocusMetadata"
|
|
|
|
import "seekia/internal/helpers"
|
|
import "seekia/internal/localFilesystem"
|
|
|
|
|
|
import "log"
|
|
import "bufio"
|
|
import "os"
|
|
import "io"
|
|
import "math"
|
|
import "bytes"
|
|
import "strings"
|
|
import "encoding/gob"
|
|
import "errors"
|
|
import "slices"
|
|
|
|
|
|
func main(){
|
|
|
|
extractGiantLoci := func()error{
|
|
|
|
// heightOrObesity := "Height"
|
|
heightOrObesity := "Obesity"
|
|
|
|
filepath := "./Giant" + heightOrObesity + "Study.txt"
|
|
|
|
fileBytes, err := os.ReadFile(filepath)
|
|
if (err != nil){
|
|
return errors.New("Could not open " + filepath + ": " + err.Error())
|
|
}
|
|
|
|
fileReader := bytes.NewReader(fileBytes)
|
|
|
|
bufioReader := bufio.NewReader(fileReader)
|
|
|
|
// We first read the header line
|
|
|
|
//These are the columns of the Height file:
|
|
|
|
// Filename: GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_*.gz
|
|
// - SNPID
|
|
// -represented as CHR:POS:REF:ALT
|
|
// - RSID
|
|
// -RS NUMBER, WHEN AVAILABLE
|
|
// - CHR
|
|
// -CHROMOSOME
|
|
// - The chromosome on which the SNP is located
|
|
// - POS
|
|
// -GENOMIC POSITION (BASE PAIR) - hg19/hg37 BUILD
|
|
// - EFFECT_ALLELE
|
|
// -The allele that is associated with the effect being studied (Example: the allele associated with increased height)
|
|
// - OTHER_ALLELE
|
|
// - EFFECT_ALLELE_FREQ
|
|
// -(3 significant figures)
|
|
// -ChatGPT says: The frequency of the effect allele in the study population, reported to 3 significant figures.
|
|
// - BETA
|
|
// -(6 significant figures)
|
|
// -ChatGPT says: The effect size or regression coefficient for the association between
|
|
// the SNP and the trait of interest, reported to 6 significant figures
|
|
// - SE
|
|
// -(3 significant figures)
|
|
// -Standard error of the effect size, reported to 3 significant figures
|
|
// - P
|
|
// -P-VALUE MARGINAL EFFECT
|
|
// -ChatGPT says: The p-value for the marginal effect of the SNP on the trait of interest
|
|
// - N
|
|
// -Total sample size used in the GWAS analysis
|
|
|
|
// These are the columns of the Obesity (WHR) File:
|
|
// Filename: PublicRelease.WHRadjBMI.C.All.Add.txt.gz
|
|
//
|
|
// -1. snpname - dbSNP rsID
|
|
// -2. chr - chromosome
|
|
// -3. pos - position
|
|
// -4. markername - chr:pos
|
|
// -5. ref - reference allele (hg19 + strand)
|
|
// -6. alt - alternate allele (hg19 + strand)
|
|
// -7. beta - beta
|
|
// -8. se - standard error
|
|
// -9. pvalue - P value
|
|
// -10. n - sample size
|
|
// -11. gmaf/eur_maf - alternate allele frequency in 1000 Genome Combined/European Ancestries
|
|
// -12. exac_maf/exac_nfe_maf -alternate allele frequency in ExAC Combined/Non-Finnish European Ancestries
|
|
|
|
_, err = bufioReader.ReadString('\n')
|
|
if (err != nil) { return err }
|
|
|
|
type LocusInfo struct{
|
|
Chromosome int
|
|
Position int
|
|
Effect float64
|
|
}
|
|
|
|
lociInfoMap := make(map[int64]LocusInfo)
|
|
|
|
for {
|
|
|
|
rsidInfoLine, err := bufioReader.ReadString('\n')
|
|
if (err != nil) {
|
|
|
|
if (err == io.EOF){
|
|
// We have reached the end of the file
|
|
break
|
|
}
|
|
|
|
// File is corrupt
|
|
return errors.New("Error reading file: " + err.Error())
|
|
}
|
|
|
|
lineElementsSlice := strings.Split(string(rsidInfoLine), "\t")
|
|
|
|
//Outputs:
|
|
// -bool: Locus information is available
|
|
// -int64: Locus rsID
|
|
// -int: Locus Chromosome
|
|
// -int: Locus Position
|
|
// -float64: Locus effect
|
|
// -error
|
|
getLocusInfo := func()(bool, int64, int, int, float64, error){
|
|
|
|
if (heightOrObesity == "Height"){
|
|
|
|
rsidString := lineElementsSlice[1]
|
|
locusChromosomeString := lineElementsSlice[2]
|
|
locusPositionString := lineElementsSlice[3]
|
|
locusEffectString := lineElementsSlice[7]
|
|
|
|
rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs")
|
|
if (prefixFound == false){
|
|
// Some of the rsIDs are not formatted in the "rs123456" format
|
|
// We skip those
|
|
// log.Println("rs prefix not found in rsID: " + rsidString)
|
|
return false, 0, 0, 0, 0, nil
|
|
}
|
|
|
|
rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix)
|
|
if (err != nil){
|
|
return false, 0, 0, 0, 0, errors.New("RSID is invalid: " + err.Error())
|
|
}
|
|
|
|
locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString)
|
|
if (err != nil){
|
|
return false, 0, 0, 0, 0, errors.New("Locus Chromosome is invalid: " + err.Error())
|
|
}
|
|
|
|
locusPosition, err := helpers.ConvertStringToInt(locusPositionString)
|
|
if (err != nil){
|
|
return false, 0, 0, 0, 0, errors.New("Locus Position is invalid: " + err.Error())
|
|
}
|
|
|
|
locusEffectRaw, err := helpers.ConvertStringToFloat64(locusEffectString)
|
|
if (err != nil) {
|
|
if (locusEffectString == ""){
|
|
// The database has at least 1 entry with no effect provided
|
|
return false, 0, 0, 0, 0, nil
|
|
}
|
|
return false, 0, 0, 0, 0, errors.New("RSID effect is invalid: " + err.Error())
|
|
}
|
|
|
|
return true, rsID, locusChromosome, locusPosition, locusEffectRaw, nil
|
|
}
|
|
|
|
rsidString := lineElementsSlice[0]
|
|
locusChromosomeString := lineElementsSlice[1]
|
|
locusPositionString := lineElementsSlice[2]
|
|
locusEffectString := lineElementsSlice[6]
|
|
|
|
if (rsidString == "-" || rsidString == ""){
|
|
return false, 0, 0, 0, 0, nil
|
|
}
|
|
|
|
rsidWithoutPrefix, prefixFound := strings.CutPrefix(rsidString, "rs")
|
|
if (prefixFound == false){
|
|
return false, 0, 0, 0, 0, errors.New("Obesity GWAS file contains invalid rsID: " + rsidString)
|
|
}
|
|
|
|
rsID, err := helpers.ConvertStringToInt64(rsidWithoutPrefix)
|
|
if (err != nil){
|
|
return false, 0, 0, 0, 0, errors.New("RSID is invalid: " + err.Error())
|
|
}
|
|
|
|
locusChromosome, err := helpers.ConvertStringToInt(locusChromosomeString)
|
|
if (err != nil){
|
|
|
|
if (locusChromosomeString == "X"){
|
|
// TODO: Add the ability to read these chromosomes
|
|
return false, 0, 0, 0, 0, nil
|
|
}
|
|
|
|
return false, 0, 0, 0, 0, errors.New("Locus Chromosome is invalid: " + err.Error())
|
|
}
|
|
|
|
locusPosition, err := helpers.ConvertStringToInt(locusPositionString)
|
|
if (err != nil){
|
|
|
|
hasSuffix := strings.HasSuffix(locusPositionString, "+08")
|
|
if (hasSuffix == true){
|
|
// This is an invalid entry in the file
|
|
return false, 0, 0, 0, 0, nil
|
|
}
|
|
return false, 0, 0, 0, 0, errors.New("Locus Position is invalid: " + err.Error())
|
|
}
|
|
|
|
locusEffectRaw, err := helpers.ConvertStringToFloat64(locusEffectString)
|
|
if (err != nil) {
|
|
return false, 0, 0, 0, 0, errors.New("RSID effect is invalid: " + err.Error())
|
|
}
|
|
|
|
return true, rsID, locusChromosome, locusPosition, locusEffectRaw, nil
|
|
}
|
|
|
|
locusInfoExists, locusRSID, locusChromosome, locusPosition, locusEffectRaw, err := getLocusInfo()
|
|
if (err != nil) { return err }
|
|
if (locusInfoExists == false){
|
|
continue
|
|
}
|
|
|
|
// Effect can be negative, we make it positive
|
|
locusEffect := math.Abs(locusEffectRaw)
|
|
|
|
existingLocusValue, exists := lociInfoMap[locusRSID]
|
|
if (exists == false){
|
|
|
|
newLocusInfo := LocusInfo{
|
|
Chromosome: locusChromosome,
|
|
Position: locusPosition,
|
|
Effect: locusEffect,
|
|
}
|
|
|
|
lociInfoMap[locusRSID] = newLocusInfo
|
|
} else {
|
|
|
|
// We see if the effect of this allele is greater
|
|
// If it is, we update the effect to match the higher effect allele
|
|
// We do this because we want the most causal rsIDs, not the most causal alleles
|
|
// When we feed the locus into the neural network, both alleles will be eligible to be trained upon
|
|
|
|
existingChromosome := existingLocusValue.Chromosome
|
|
existingPosition := existingLocusValue.Position
|
|
existingEffect := existingLocusValue.Effect
|
|
|
|
if (existingChromosome != locusChromosome){
|
|
return errors.New("GIANT gwas contains two rsIDs with conflicting chromosomes.")
|
|
}
|
|
if (existingPosition != locusPosition){
|
|
return errors.New("GIANT gwas contains two rsIDs with conflicting positions.")
|
|
}
|
|
if (existingEffect < locusEffect){
|
|
|
|
// We update the value with the new effect
|
|
existingLocusValue.Effect = locusEffect
|
|
lociInfoMap[locusRSID] = existingLocusValue
|
|
}
|
|
}
|
|
}
|
|
|
|
// We find the top 10,000 rsIDs with the greatest effect
|
|
|
|
rsidsList := helpers.GetListOfMapKeys(lociInfoMap)
|
|
|
|
compareFunction := func(rsid1 int64, rsid2 int64)int{
|
|
|
|
if (rsid1 == rsid2){
|
|
panic("Identical rsIDs found during sort.")
|
|
}
|
|
|
|
rsid1Info, exists := lociInfoMap[rsid1]
|
|
if (exists == false){
|
|
panic("rsid1 is missing from lociInfoMap.")
|
|
}
|
|
|
|
rsid2Info, exists := lociInfoMap[rsid2]
|
|
if (exists == false){
|
|
panic("rsid2 is missing from lociInfoMap.")
|
|
}
|
|
|
|
rsid1Effect := rsid1Info.Effect
|
|
rsid2Effect := rsid2Info.Effect
|
|
|
|
if (rsid1Effect == rsid2Effect){
|
|
return 0
|
|
}
|
|
if (rsid1Effect < rsid2Effect){
|
|
return 1
|
|
}
|
|
|
|
return -1
|
|
}
|
|
|
|
slices.SortFunc(rsidsList, compareFunction)
|
|
|
|
// We take the top 1000 most impactful loci
|
|
|
|
mostImpactfulLoci := rsidsList[:1000]
|
|
|
|
// We add these rsIDs to the locus metadata
|
|
|
|
locusMetadatasToAddList := make([]locusMetadata.LocusMetadata, 0)
|
|
|
|
for _, rsID := range mostImpactfulLoci{
|
|
|
|
locusInfo, exists := lociInfoMap[rsID]
|
|
if (exists == false){
|
|
return errors.New("lociInfoMap missing rsID.")
|
|
}
|
|
|
|
locusChromosome := locusInfo.Chromosome
|
|
locusPosition := locusInfo.Position
|
|
|
|
locusReferencesMap := make(map[string]string)
|
|
locusReferencesMap[heightOrObesity + " Genome-Wide Association Study (GWAS) created by the GIANT consortium"] = "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files"
|
|
|
|
newLocusMetadata := locusMetadata.LocusMetadata{
|
|
RSIDsList: []int64{rsID},
|
|
Chromosome: locusChromosome,
|
|
Position: locusPosition,
|
|
GeneInfoIsKnown: false,
|
|
GeneExists: false,
|
|
GeneNamesList: make([]string, 0),
|
|
CompanyAliases: make(map[locusMetadata.GeneticsCompany][]string),
|
|
References: locusReferencesMap,
|
|
}
|
|
|
|
locusMetadatasToAddList = append(locusMetadatasToAddList, newLocusMetadata)
|
|
}
|
|
|
|
// We add the locus metadatas
|
|
|
|
_, newLocusMetadataFileBytes, err := modifyLocusMetadata.AddLocusMetadata(locusMetadatasToAddList)
|
|
if (err != nil) { return err }
|
|
|
|
err = localFilesystem.CreateOrOverwriteFile(newLocusMetadataFileBytes, "./", "NewLocusMetadata.gob")
|
|
if (err != nil){ return err }
|
|
|
|
// We create the rsIDs list file
|
|
|
|
buffer := new(bytes.Buffer)
|
|
|
|
gobEncoder := gob.NewEncoder(buffer)
|
|
|
|
err = gobEncoder.Encode(mostImpactfulLoci)
|
|
if (err != nil) { return err }
|
|
|
|
encodedBytes := buffer.Bytes()
|
|
|
|
filename := "Giant" + heightOrObesity + "StudyLoci.gob"
|
|
|
|
err = localFilesystem.CreateOrOverwriteFile(encodedBytes, "./", filename)
|
|
if (err != nil){ return err }
|
|
|
|
return nil
|
|
}
|
|
|
|
err := extractGiantLoci()
|
|
if (err != nil){
|
|
log.Println("Extraction failed: " + err.Error())
|
|
return
|
|
}
|
|
|
|
log.Println("Extraction successful!")
|
|
}
|
|
|