2024-08-05 09:11:10 +02:00
// extractGiantLoci.go provides a function to extract rsIDs from a Genome-Wide Association Study (GWAS) created by the GIANT consortium
// These studies are released as files on this website:
// https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files
// The files are a tab-delimeted file of rsIDs and their effect on a particular trait
// The output file is a .gob encoded []int64 of the top 1000 most impactful loci on the trait.
2024-08-14 05:37:18 +02:00
// These files are then saved into the following folders:
// -Height -> /resources/geneticReferences/traits/rsIDs
// -Obesity -> /resources/geneticReferences/polygenicDiseases/rsIDs
2024-08-05 09:11:10 +02:00
// The loci metadata for loci from these files is also imported into the locusMetadata package to enable them to be used in Seekia
package main
2024-08-14 05:37:18 +02:00
// Here are the files I used to extract causal rsIDs
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
// Trait: Height
// Download Link:
// https://portals.broadinstitute.org/collaboration/giant/images/4/4e/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz
// SHA-256 Checksum:
2024-08-05 09:11:10 +02:00
// db18859724675f2f9ba86eff28cb4dacac0629c0b25c9806a6cf2eed6bb8b71e
2024-08-14 05:37:18 +02:00
// Trait: Obesity (Waist-to-hip-ratio)
// Download Link:
// https://portals.broadinstitute.org/collaboration/giant/images/0/09/PublicRelease.WHRadjBMI.C.All.Add.txt.gz
// SHA-256 Checksum:
// 2a863b0357037ae5c34853342052ed3c59735d6440da0fd15d1cab34b7d49daf
2024-08-05 09:11:10 +02:00
import "seekia/resources/geneticReferences/locusMetadata"
import "seekia/resources/geneticReferences/modifyLocusMetadata"
import "seekia/internal/helpers"
import "seekia/internal/localFilesystem"
import "log"
import "bufio"
import "os"
import "io"
import "math"
import "bytes"
import "strings"
import "encoding/gob"
import "errors"
import "slices"
func main ( ) {
extractGiantLoci := func ( ) error {
2024-08-14 05:37:18 +02:00
// heightOrObesity := "Height"
heightOrObesity := "Obesity"
filepath := "./Giant" + heightOrObesity + "Study.txt"
fileBytes , err := os . ReadFile ( filepath )
if ( err != nil ) {
return errors . New ( "Could not open " + filepath + ": " + err . Error ( ) )
2024-08-05 09:11:10 +02:00
}
fileReader := bytes . NewReader ( fileBytes )
bufioReader := bufio . NewReader ( fileReader )
// We first read the header line
2024-08-14 05:37:18 +02:00
//These are the columns of the Height file:
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
// Filename: GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_*.gz
2024-08-05 09:11:10 +02:00
// - SNPID
2024-08-14 05:37:18 +02:00
// -represented as CHR:POS:REF:ALT
2024-08-05 09:11:10 +02:00
// - RSID
// -RS NUMBER, WHEN AVAILABLE
// - CHR
// -CHROMOSOME
// - The chromosome on which the SNP is located
// - POS
// -GENOMIC POSITION (BASE PAIR) - hg19/hg37 BUILD
// - EFFECT_ALLELE
// -The allele that is associated with the effect being studied (Example: the allele associated with increased height)
// - OTHER_ALLELE
// - EFFECT_ALLELE_FREQ
// -(3 significant figures)
// -ChatGPT says: The frequency of the effect allele in the study population, reported to 3 significant figures.
// - BETA
// -(6 significant figures)
// -ChatGPT says: The effect size or regression coefficient for the association between
// the SNP and the trait of interest, reported to 6 significant figures
// - SE
// -(3 significant figures)
// -Standard error of the effect size, reported to 3 significant figures
// - P
// -P-VALUE MARGINAL EFFECT
// -ChatGPT says: The p-value for the marginal effect of the SNP on the trait of interest
// - N
// -Total sample size used in the GWAS analysis
2024-08-14 05:37:18 +02:00
// These are the columns of the Obesity (WHR) File:
// Filename: PublicRelease.WHRadjBMI.C.All.Add.txt.gz
//
// -1. snpname - dbSNP rsID
// -2. chr - chromosome
// -3. pos - position
// -4. markername - chr:pos
// -5. ref - reference allele (hg19 + strand)
// -6. alt - alternate allele (hg19 + strand)
// -7. beta - beta
// -8. se - standard error
// -9. pvalue - P value
// -10. n - sample size
// -11. gmaf/eur_maf - alternate allele frequency in 1000 Genome Combined/European Ancestries
// -12. exac_maf/exac_nfe_maf -alternate allele frequency in ExAC Combined/Non-Finnish European Ancestries
2024-08-05 09:11:10 +02:00
_ , err = bufioReader . ReadString ( '\n' )
if ( err != nil ) { return err }
type LocusInfo struct {
Chromosome int
Position int
Effect float64
}
2024-08-14 05:37:18 +02:00
lociInfoMap := make ( map [ int64 ] LocusInfo )
2024-08-05 09:11:10 +02:00
for {
rsidInfoLine , err := bufioReader . ReadString ( '\n' )
if ( err != nil ) {
if ( err == io . EOF ) {
// We have reached the end of the file
break
}
2024-08-14 05:37:18 +02:00
2024-08-05 09:11:10 +02:00
// File is corrupt
return errors . New ( "Error reading file: " + err . Error ( ) )
}
lineElementsSlice := strings . Split ( string ( rsidInfoLine ) , "\t" )
2024-08-14 05:37:18 +02:00
//Outputs:
// -bool: Locus information is available
// -int64: Locus rsID
// -int: Locus Chromosome
// -int: Locus Position
// -float64: Locus effect
// -error
getLocusInfo := func ( ) ( bool , int64 , int , int , float64 , error ) {
if ( heightOrObesity == "Height" ) {
rsidString := lineElementsSlice [ 1 ]
locusChromosomeString := lineElementsSlice [ 2 ]
locusPositionString := lineElementsSlice [ 3 ]
locusEffectString := lineElementsSlice [ 7 ]
rsidWithoutPrefix , prefixFound := strings . CutPrefix ( rsidString , "rs" )
if ( prefixFound == false ) {
// Some of the rsIDs are not formatted in the "rs123456" format
// We skip those
// log.Println("rs prefix not found in rsID: " + rsidString)
return false , 0 , 0 , 0 , 0 , nil
}
rsID , err := helpers . ConvertStringToInt64 ( rsidWithoutPrefix )
if ( err != nil ) {
return false , 0 , 0 , 0 , 0 , errors . New ( "RSID is invalid: " + err . Error ( ) )
}
locusChromosome , err := helpers . ConvertStringToInt ( locusChromosomeString )
if ( err != nil ) {
return false , 0 , 0 , 0 , 0 , errors . New ( "Locus Chromosome is invalid: " + err . Error ( ) )
}
locusPosition , err := helpers . ConvertStringToInt ( locusPositionString )
if ( err != nil ) {
return false , 0 , 0 , 0 , 0 , errors . New ( "Locus Position is invalid: " + err . Error ( ) )
}
locusEffectRaw , err := helpers . ConvertStringToFloat64 ( locusEffectString )
if ( err != nil ) {
if ( locusEffectString == "" ) {
// The database has at least 1 entry with no effect provided
return false , 0 , 0 , 0 , 0 , nil
}
return false , 0 , 0 , 0 , 0 , errors . New ( "RSID effect is invalid: " + err . Error ( ) )
}
return true , rsID , locusChromosome , locusPosition , locusEffectRaw , nil
}
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
rsidString := lineElementsSlice [ 0 ]
locusChromosomeString := lineElementsSlice [ 1 ]
locusPositionString := lineElementsSlice [ 2 ]
locusEffectString := lineElementsSlice [ 6 ]
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
if ( rsidString == "-" || rsidString == "" ) {
return false , 0 , 0 , 0 , 0 , nil
}
rsidWithoutPrefix , prefixFound := strings . CutPrefix ( rsidString , "rs" )
if ( prefixFound == false ) {
return false , 0 , 0 , 0 , 0 , errors . New ( "Obesity GWAS file contains invalid rsID: " + rsidString )
}
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
rsID , err := helpers . ConvertStringToInt64 ( rsidWithoutPrefix )
if ( err != nil ) {
return false , 0 , 0 , 0 , 0 , errors . New ( "RSID is invalid: " + err . Error ( ) )
}
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
locusChromosome , err := helpers . ConvertStringToInt ( locusChromosomeString )
if ( err != nil ) {
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
if ( locusChromosomeString == "X" ) {
// TODO: Add the ability to read these chromosomes
return false , 0 , 0 , 0 , 0 , nil
}
return false , 0 , 0 , 0 , 0 , errors . New ( "Locus Chromosome is invalid: " + err . Error ( ) )
}
locusPosition , err := helpers . ConvertStringToInt ( locusPositionString )
if ( err != nil ) {
hasSuffix := strings . HasSuffix ( locusPositionString , "+08" )
if ( hasSuffix == true ) {
// This is an invalid entry in the file
return false , 0 , 0 , 0 , 0 , nil
}
return false , 0 , 0 , 0 , 0 , errors . New ( "Locus Position is invalid: " + err . Error ( ) )
}
locusEffectRaw , err := helpers . ConvertStringToFloat64 ( locusEffectString )
if ( err != nil ) {
return false , 0 , 0 , 0 , 0 , errors . New ( "RSID effect is invalid: " + err . Error ( ) )
2024-08-05 09:11:10 +02:00
}
2024-08-14 05:37:18 +02:00
return true , rsID , locusChromosome , locusPosition , locusEffectRaw , nil
}
locusInfoExists , locusRSID , locusChromosome , locusPosition , locusEffectRaw , err := getLocusInfo ( )
if ( err != nil ) { return err }
if ( locusInfoExists == false ) {
continue
2024-08-05 09:11:10 +02:00
}
// Effect can be negative, we make it positive
2024-08-14 05:37:18 +02:00
locusEffect := math . Abs ( locusEffectRaw )
2024-08-05 09:11:10 +02:00
2024-08-14 05:37:18 +02:00
existingLocusValue , exists := lociInfoMap [ locusRSID ]
2024-08-05 09:11:10 +02:00
if ( exists == false ) {
newLocusInfo := LocusInfo {
2024-08-14 05:37:18 +02:00
Chromosome : locusChromosome ,
Position : locusPosition ,
Effect : locusEffect ,
2024-08-05 09:11:10 +02:00
}
2024-08-14 05:37:18 +02:00
lociInfoMap [ locusRSID ] = newLocusInfo
2024-08-05 09:11:10 +02:00
} else {
// We see if the effect of this allele is greater
// If it is, we update the effect to match the higher effect allele
// We do this because we want the most causal rsIDs, not the most causal alleles
// When we feed the locus into the neural network, both alleles will be eligible to be trained upon
existingChromosome := existingLocusValue . Chromosome
existingPosition := existingLocusValue . Position
existingEffect := existingLocusValue . Effect
2024-08-14 05:37:18 +02:00
if ( existingChromosome != locusChromosome ) {
2024-08-05 09:11:10 +02:00
return errors . New ( "GIANT gwas contains two rsIDs with conflicting chromosomes." )
}
2024-08-14 05:37:18 +02:00
if ( existingPosition != locusPosition ) {
2024-08-05 09:11:10 +02:00
return errors . New ( "GIANT gwas contains two rsIDs with conflicting positions." )
}
2024-08-14 05:37:18 +02:00
if ( existingEffect < locusEffect ) {
2024-08-05 09:11:10 +02:00
// We update the value with the new effect
2024-08-14 05:37:18 +02:00
existingLocusValue . Effect = locusEffect
lociInfoMap [ locusRSID ] = existingLocusValue
2024-08-05 09:11:10 +02:00
}
}
}
// We find the top 10,000 rsIDs with the greatest effect
2024-08-14 05:37:18 +02:00
rsidsList := helpers . GetListOfMapKeys ( lociInfoMap )
2024-08-05 09:11:10 +02:00
compareFunction := func ( rsid1 int64 , rsid2 int64 ) int {
if ( rsid1 == rsid2 ) {
panic ( "Identical rsIDs found during sort." )
}
2024-08-14 05:37:18 +02:00
rsid1Info , exists := lociInfoMap [ rsid1 ]
2024-08-05 09:11:10 +02:00
if ( exists == false ) {
2024-08-14 05:37:18 +02:00
panic ( "rsid1 is missing from lociInfoMap." )
2024-08-05 09:11:10 +02:00
}
2024-08-14 05:37:18 +02:00
rsid2Info , exists := lociInfoMap [ rsid2 ]
2024-08-05 09:11:10 +02:00
if ( exists == false ) {
2024-08-14 05:37:18 +02:00
panic ( "rsid2 is missing from lociInfoMap." )
2024-08-05 09:11:10 +02:00
}
rsid1Effect := rsid1Info . Effect
rsid2Effect := rsid2Info . Effect
if ( rsid1Effect == rsid2Effect ) {
return 0
}
if ( rsid1Effect < rsid2Effect ) {
return 1
}
return - 1
}
slices . SortFunc ( rsidsList , compareFunction )
// We take the top 1000 most impactful loci
mostImpactfulLoci := rsidsList [ : 1000 ]
// We add these rsIDs to the locus metadata
locusMetadatasToAddList := make ( [ ] locusMetadata . LocusMetadata , 0 )
for _ , rsID := range mostImpactfulLoci {
2024-08-14 05:37:18 +02:00
locusInfo , exists := lociInfoMap [ rsID ]
2024-08-05 09:11:10 +02:00
if ( exists == false ) {
2024-08-14 05:37:18 +02:00
return errors . New ( "lociInfoMap missing rsID." )
2024-08-05 09:11:10 +02:00
}
locusChromosome := locusInfo . Chromosome
locusPosition := locusInfo . Position
2024-08-14 05:37:18 +02:00
locusReferencesMap := make ( map [ string ] string )
locusReferencesMap [ heightOrObesity + " Genome-Wide Association Study (GWAS) created by the GIANT consortium" ] = "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files"
2024-08-05 09:11:10 +02:00
newLocusMetadata := locusMetadata . LocusMetadata {
RSIDsList : [ ] int64 { rsID } ,
Chromosome : locusChromosome ,
Position : locusPosition ,
GeneInfoIsKnown : false ,
GeneExists : false ,
GeneNamesList : make ( [ ] string , 0 ) ,
CompanyAliases : make ( map [ locusMetadata . GeneticsCompany ] [ ] string ) ,
References : locusReferencesMap ,
}
locusMetadatasToAddList = append ( locusMetadatasToAddList , newLocusMetadata )
}
2024-08-14 05:37:18 +02:00
// We add the locus metadatas
2024-08-05 09:11:10 +02:00
_ , newLocusMetadataFileBytes , err := modifyLocusMetadata . AddLocusMetadata ( locusMetadatasToAddList )
if ( err != nil ) { return err }
err = localFilesystem . CreateOrOverwriteFile ( newLocusMetadataFileBytes , "./" , "NewLocusMetadata.gob" )
if ( err != nil ) { return err }
2024-08-14 05:37:18 +02:00
// We create the rsIDs list file
2024-08-05 09:11:10 +02:00
buffer := new ( bytes . Buffer )
gobEncoder := gob . NewEncoder ( buffer )
err = gobEncoder . Encode ( mostImpactfulLoci )
if ( err != nil ) { return err }
encodedBytes := buffer . Bytes ( )
2024-08-14 05:37:18 +02:00
filename := "Giant" + heightOrObesity + "StudyLoci.gob"
err = localFilesystem . CreateOrOverwriteFile ( encodedBytes , "./" , filename )
2024-08-05 09:11:10 +02:00
if ( err != nil ) { return err }
return nil
}
err := extractGiantLoci ( )
if ( err != nil ) {
log . Println ( "Extraction failed: " + err . Error ( ) )
return
}
log . Println ( "Extraction successful!" )
}