1631 lines
52 KiB
Go
1631 lines
52 KiB
Go
|
|
// geneticPrediction provides functions to train and query neural network models
|
|
// These models are used to predict attributes such as eye color and autism from user genome files
|
|
|
|
package geneticPrediction
|
|
|
|
// I am a novice in the ways of neural networks.
|
|
// Machine learning experts should chime in and offer improvements.
|
|
// We have to make sure that model inference remains very fast
|
|
// Sorting matches by offspring total polygenic disease score will require inference on dozens of models for each match
|
|
// We could create slower models that provide more accurate predictions
|
|
|
|
import "seekia/resources/geneticReferences/polygenicDiseases"
|
|
import "seekia/resources/geneticReferences/traits"
|
|
import "seekia/resources/geneticPredictionModels"
|
|
|
|
import "seekia/internal/genetics/locusValue"
|
|
import "seekia/internal/genetics/readBiobankData"
|
|
import "seekia/internal/helpers"
|
|
|
|
import "gorgonia.org/gorgonia"
|
|
import "gorgonia.org/tensor"
|
|
|
|
import mathRand "math/rand/v2"
|
|
import "math"
|
|
import "bytes"
|
|
import "encoding/gob"
|
|
import "slices"
|
|
import "errors"
|
|
|
|
|
|
type NeuralNetwork struct{
|
|
|
|
// ExprGraph is a data structure for a directed acyclic graph (of expressions).
|
|
graph *gorgonia.ExprGraph
|
|
|
|
// These are the weights for each layer of neurons
|
|
weights1 *gorgonia.Node
|
|
weights2 *gorgonia.Node
|
|
weights3 *gorgonia.Node
|
|
|
|
// This is the computed prediction
|
|
prediction *gorgonia.Node
|
|
}
|
|
|
|
// This struct stores a user's training data
|
|
// Each TrainingData represents a single data example
|
|
// For example, the InputLayer is a column of neurons representing a user's genetics,
|
|
// and the OutputLayer is a column representing their phenotype, such as eye color
|
|
type TrainingData struct{
|
|
|
|
// InputLayer stores relevant rsID values for each attribute from the user's genomes
|
|
// It also stores if each rsID is phased and if each rsID exists
|
|
InputLayer []float32
|
|
|
|
// OutputLayer stores user phenotype data as neurons
|
|
// Each neuron represents an outcome
|
|
// For example, for Eye Color, each neuron represents an eye color
|
|
OutputLayer []float32
|
|
}
|
|
|
|
|
|
func EncodeTrainingDataObjectToBytes(inputTrainingData TrainingData)([]byte, error){
|
|
|
|
buffer := new(bytes.Buffer)
|
|
|
|
encoder := gob.NewEncoder(buffer)
|
|
|
|
err := encoder.Encode(inputTrainingData)
|
|
if (err != nil) { return nil, err }
|
|
|
|
trainingDataBytes := buffer.Bytes()
|
|
|
|
return trainingDataBytes, nil
|
|
}
|
|
|
|
func DecodeBytesToTrainingDataObject(inputTrainingData []byte)(TrainingData, error){
|
|
|
|
if (inputTrainingData == nil){
|
|
return TrainingData{}, errors.New("DecodeBytesToTrainingDataObject called with nil inputTrainingData.")
|
|
}
|
|
|
|
buffer := bytes.NewBuffer(inputTrainingData)
|
|
|
|
decoder := gob.NewDecoder(buffer)
|
|
|
|
var newTrainingData TrainingData
|
|
|
|
err := decoder.Decode(&newTrainingData)
|
|
if (err != nil){ return TrainingData{}, err }
|
|
|
|
return newTrainingData, nil
|
|
}
|
|
|
|
// We use this to store a neural network's weights as a .gob file
|
|
type neuralNetworkForEncoding struct{
|
|
|
|
// These are the weights for each layer of neurons
|
|
Weights1 []float32
|
|
Weights2 []float32
|
|
Weights3 []float32
|
|
|
|
Weights1Rows int
|
|
Weights1Columns int
|
|
Weights2Rows int
|
|
Weights2Columns int
|
|
Weights3Rows int
|
|
Weights3Columns int
|
|
}
|
|
|
|
func EncodeNeuralNetworkObjectToBytes(inputNeuralNetwork NeuralNetwork)([]byte, error){
|
|
|
|
weights1 := inputNeuralNetwork.weights1
|
|
weights2 := inputNeuralNetwork.weights2
|
|
weights3 := inputNeuralNetwork.weights3
|
|
|
|
weights1Slice := weights1.Value().Data().([]float32)
|
|
weights2Slice := weights2.Value().Data().([]float32)
|
|
weights3Slice := weights3.Value().Data().([]float32)
|
|
|
|
weights1Rows := weights1.Shape()[0]
|
|
weights1Columns := weights1.Shape()[1]
|
|
weights2Rows := weights2.Shape()[0]
|
|
weights2Columns := weights2.Shape()[1]
|
|
weights3Rows := weights3.Shape()[0]
|
|
weights3Columns := weights3.Shape()[1]
|
|
|
|
newNeuralNetworkForEncoding := neuralNetworkForEncoding{
|
|
Weights1: weights1Slice,
|
|
Weights2: weights2Slice,
|
|
Weights3: weights3Slice,
|
|
|
|
Weights1Rows: weights1Rows,
|
|
Weights1Columns: weights1Columns,
|
|
Weights2Rows: weights2Rows,
|
|
Weights2Columns: weights2Columns,
|
|
Weights3Rows: weights3Rows,
|
|
Weights3Columns: weights3Columns,
|
|
}
|
|
|
|
buffer := new(bytes.Buffer)
|
|
|
|
encoder := gob.NewEncoder(buffer)
|
|
|
|
err := encoder.Encode(newNeuralNetworkForEncoding)
|
|
if (err != nil) { return nil, err }
|
|
|
|
neuralNetworkBytes := buffer.Bytes()
|
|
|
|
return neuralNetworkBytes, nil
|
|
}
|
|
|
|
func DecodeBytesToNeuralNetworkObject(inputNeuralNetwork []byte)(NeuralNetwork, error){
|
|
|
|
if (inputNeuralNetwork == nil){
|
|
return NeuralNetwork{}, errors.New("DecodeBytesToNeuralNetworkObject called with nil inputNeuralNetwork.")
|
|
}
|
|
|
|
buffer := bytes.NewBuffer(inputNeuralNetwork)
|
|
|
|
decoder := gob.NewDecoder(buffer)
|
|
|
|
var newNeuralNetworkForEncoding neuralNetworkForEncoding
|
|
|
|
err := decoder.Decode(&newNeuralNetworkForEncoding)
|
|
if (err != nil){ return NeuralNetwork{}, err }
|
|
|
|
weights1 := newNeuralNetworkForEncoding.Weights1
|
|
weights2 := newNeuralNetworkForEncoding.Weights2
|
|
weights3 := newNeuralNetworkForEncoding.Weights3
|
|
|
|
weights1Rows := newNeuralNetworkForEncoding.Weights1Rows
|
|
weights1Columns := newNeuralNetworkForEncoding.Weights1Columns
|
|
weights2Rows := newNeuralNetworkForEncoding.Weights2Rows
|
|
weights2Columns := newNeuralNetworkForEncoding.Weights2Columns
|
|
weights3Rows := newNeuralNetworkForEncoding.Weights3Rows
|
|
weights3Columns := newNeuralNetworkForEncoding.Weights3Columns
|
|
|
|
// This is the graph object we add each layer to
|
|
newGraph := gorgonia.NewGraph()
|
|
|
|
// A layer is a column of neurons
|
|
// Each neuron has an initial value between 0 and 1
|
|
getNewNeuralNetworkLayerWeights := func(layerName string, layerNeuronRows int, layerNeuronColumns int, layerWeightsList []float32)*gorgonia.Node{
|
|
|
|
layerNameObject := gorgonia.WithName(layerName)
|
|
|
|
layerBacking := tensor.WithBacking(layerWeightsList)
|
|
layerShape := tensor.WithShape(layerNeuronRows, layerNeuronColumns)
|
|
layerTensor := tensor.New(layerBacking, layerShape)
|
|
|
|
layerValueObject := gorgonia.WithValue(layerTensor)
|
|
|
|
layerObject := gorgonia.NewMatrix(newGraph, tensor.Float32, layerNameObject, layerValueObject)
|
|
|
|
return layerObject
|
|
}
|
|
|
|
layer1 := getNewNeuralNetworkLayerWeights("Weights1", weights1Rows, weights1Columns, weights1)
|
|
layer2 := getNewNeuralNetworkLayerWeights("Weights2", weights2Rows, weights2Columns, weights2)
|
|
layer3 := getNewNeuralNetworkLayerWeights("Weights3", weights3Rows, weights3Columns, weights3)
|
|
|
|
newNeuralNetworkObject := NeuralNetwork{
|
|
|
|
graph: newGraph,
|
|
|
|
weights1: layer1,
|
|
weights2: layer2,
|
|
weights3: layer3,
|
|
}
|
|
|
|
return newNeuralNetworkObject, nil
|
|
}
|
|
|
|
// This map is used to store information about how accurate genetic prediction models are for discrete traits
|
|
// Map Structure: Discrete Trait Outcome Info -> Discrete Trait Prediction Accuracy Info
|
|
type DiscreteTraitPredictionAccuracyInfoMap map[DiscreteTraitOutcomeInfo]DiscreteTraitPredictionAccuracyInfo
|
|
|
|
type DiscreteTraitOutcomeInfo struct{
|
|
|
|
// This is the outcome which was predicted
|
|
// Example: "Blue"
|
|
OutcomeName string
|
|
|
|
// This is a value between 0-100 which describes the percentage of the loci which were tested for the input for the prediction
|
|
PercentageOfLociTested int
|
|
|
|
// This is a value between 0-100 which describes the percentage of the tested loci which were phased for the input for the prediction
|
|
PercentageOfPhasedLoci int
|
|
}
|
|
|
|
type DiscreteTraitPredictionAccuracyInfo struct{
|
|
|
|
// This contains the quantity of examples for the outcome with the specified percentageOfLociTested and percentageOfPhasedLoci
|
|
QuantityOfExamples int
|
|
|
|
// This contains the quantity of predictions for the outcome with the specified percentageOfLociTested and percentageOfPhasedLoci
|
|
// Prediction = our model predicted this outcome
|
|
QuantityOfPredictions int
|
|
|
|
// This stores the probability (0-100) that our model will accurately predict this outcome for a genome which has
|
|
// the specified percentageOfLociTested and percentageOfPhasedLoci
|
|
// In other words: What is the probability that if you give Seekia a blue-eyed genome, it will give you a correct Blue prediction?
|
|
// This value is only accurate is QuantityOfExamples > 0
|
|
ProbabilityOfCorrectGenomePrediction int
|
|
|
|
// This stores the probability (0-100) that our model is correct if our model predicts that a genome
|
|
// with the specified percentageOfLociTested and percentageOfPhasedLoci has this outcome
|
|
// In other words: What is the probability that if Seekia says a genome will have blue eyes, it is correct?
|
|
// This value is only accurate is QuantityOfPredictions > 0
|
|
ProbabilityOfCorrectOutcomePrediction int
|
|
}
|
|
|
|
func EncodeDiscreteTraitPredictionAccuracyInfoMapToBytes(inputMap DiscreteTraitPredictionAccuracyInfoMap)([]byte, error){
|
|
|
|
buffer := new(bytes.Buffer)
|
|
|
|
encoder := gob.NewEncoder(buffer)
|
|
|
|
err := encoder.Encode(inputMap)
|
|
if (err != nil) { return nil, err }
|
|
|
|
inputMapBytes := buffer.Bytes()
|
|
|
|
return inputMapBytes, nil
|
|
}
|
|
|
|
func DecodeBytesToDiscreteTraitPredictionAccuracyInfoMap(inputBytes []byte)(DiscreteTraitPredictionAccuracyInfoMap, error){
|
|
|
|
if (inputBytes == nil){
|
|
return nil, errors.New("DecodeBytesToDiscreteTraitPredictionAccuracyInfoMap called with nil inputBytes.")
|
|
}
|
|
|
|
buffer := bytes.NewBuffer(inputBytes)
|
|
|
|
decoder := gob.NewDecoder(buffer)
|
|
|
|
var newDiscreteTraitPredictionAccuracyInfoMap DiscreteTraitPredictionAccuracyInfoMap
|
|
|
|
err := decoder.Decode(&newDiscreteTraitPredictionAccuracyInfoMap)
|
|
if (err != nil){ return nil, err }
|
|
|
|
return newDiscreteTraitPredictionAccuracyInfoMap, nil
|
|
}
|
|
|
|
type NumericAttributePredictionAccuracyInfoMap map[NumericAttributePredictionInfo]NumericAttributePredictionAccuracyRangesMap
|
|
|
|
type NumericAttributePredictionInfo struct{
|
|
|
|
// This is a value between 0-100 which describes the percentage of the loci which were tested for the input for the prediction
|
|
PercentageOfLociTested int
|
|
|
|
// This is a value between 0-100 which describes the percentage of the tested loci which were phased for the input for the prediction
|
|
PercentageOfPhasedLoci int
|
|
}
|
|
|
|
// Map Structure: Accuracy Percentage (AP) -> Amount needed to deviate from prediction for the value to be accurate (AP)% of the time
|
|
// For example, if the model predicted that someone was 150 centimeters tall, how many centimeters would we have to deviate in both directions
|
|
// in order for the true outcome to fall into the range 10% of the time, 20% of the time, 30% of the time, etc...
|
|
// Example:
|
|
// -90%+: 50 centimeters
|
|
// If you travel 50 centimeters in both directions from the prediction,
|
|
// the true height value will fall into this range 90% of the time.
|
|
// -50%+: 20 centimeters
|
|
// -10%+: 10 centimeters
|
|
type NumericAttributePredictionAccuracyRangesMap map[int]float64
|
|
|
|
|
|
func EncodeNumericAttributePredictionAccuracyInfoMapToBytes(inputMap NumericAttributePredictionAccuracyInfoMap)([]byte, error){
|
|
|
|
buffer := new(bytes.Buffer)
|
|
|
|
encoder := gob.NewEncoder(buffer)
|
|
|
|
err := encoder.Encode(inputMap)
|
|
if (err != nil) { return nil, err }
|
|
|
|
inputMapBytes := buffer.Bytes()
|
|
|
|
return inputMapBytes, nil
|
|
}
|
|
|
|
func DecodeBytesToNumericAttributePredictionAccuracyInfoMap(inputBytes []byte)(NumericAttributePredictionAccuracyInfoMap, error){
|
|
|
|
if (inputBytes == nil){
|
|
return nil, errors.New("DecodeBytesToNumericAttributePredictionAccuracyInfoMap called with nil inputBytes.")
|
|
}
|
|
|
|
buffer := bytes.NewBuffer(inputBytes)
|
|
|
|
decoder := gob.NewDecoder(buffer)
|
|
|
|
var newNumericAttributePredictionAccuracyInfoMap NumericAttributePredictionAccuracyInfoMap
|
|
|
|
err := decoder.Decode(&newNumericAttributePredictionAccuracyInfoMap)
|
|
if (err != nil){ return nil, err }
|
|
|
|
return newNumericAttributePredictionAccuracyInfoMap, nil
|
|
}
|
|
|
|
//Outputs:
|
|
// -bool: Neural network model exists for this trait (trait prediction is possible for this trait)
|
|
// -bool: Trait prediction is possible for this user (User has at least 1 known trait locus value)
|
|
// -string: Predicted trait outcome (Example: "Blue")
|
|
// -int: Confidence: Probability (0-100) that the prediction is accurate
|
|
// -int: Quantity of loci known
|
|
// -int: Quantity of phased loci
|
|
// -error
|
|
func GetNeuralNetworkDiscreteTraitPredictionFromGenomeMap(traitName string, genomeMap map[int64]locusValue.LocusValue)(bool, bool, string, int, int, int, error){
|
|
|
|
traitObject, err := traits.GetTraitObject(traitName)
|
|
if (err != nil) { return false, false, "", 0, 0, 0, err }
|
|
|
|
traitIsDiscreteOrNumeric := traitObject.DiscreteOrNumeric
|
|
if (traitIsDiscreteOrNumeric != "Discrete"){
|
|
return false, false, "", 0, 0, 0, errors.New("GetNeuralNetworkDiscreteTraitPredictionFromGenomeMap called with non-discrete trait: " + traitName)
|
|
}
|
|
|
|
// This is a map of rsIDs which influence this trait
|
|
traitRSIDsList := traitObject.LociList
|
|
|
|
if (len(traitRSIDsList) == 0){
|
|
// Neural network trait prediction is not possible for this trait
|
|
return false, false, "", 0, 0, 0, nil
|
|
}
|
|
|
|
predictionModelExists, predictionModelBytes := geneticPredictionModels.GetGeneticPredictionModelBytes(traitName)
|
|
if (predictionModelExists == false){
|
|
// Neural network trait prediction is not possible for this trait
|
|
return false, false, "", 0, 0, 0, nil
|
|
}
|
|
|
|
traitRSIDsListCopy := slices.Clone(traitRSIDsList)
|
|
slices.Sort(traitRSIDsListCopy)
|
|
|
|
neuralNetworkInput, quantityOfLociKnown, quantityOfPhasedLoci, err := createInputNeuralNetworkLayerFromGenomeMap(traitRSIDsListCopy, genomeMap)
|
|
if (err != nil) { return false, false, "", 0, 0, 0, err }
|
|
|
|
if (quantityOfLociKnown == 0){
|
|
// We can't predict anything about this trait for this genome
|
|
return true, false, "", 0, 0, 0, nil
|
|
}
|
|
|
|
neuralNetworkObject, err := DecodeBytesToNeuralNetworkObject(predictionModelBytes)
|
|
if (err != nil) { return false, false, "", 0, 0, 0, err }
|
|
|
|
outputLayer, err := GetNeuralNetworkRawPrediction(&neuralNetworkObject, false, neuralNetworkInput)
|
|
if (err != nil) { return false, false, "", 0, 0, 0, err }
|
|
|
|
predictedOutcomeName, err := GetDiscreteOutcomeNameFromOutputLayer(traitName, false, outputLayer)
|
|
if (err != nil) { return false, false, "", 0, 0, 0, err }
|
|
|
|
modelTraitAccuracyInfoFile, err := geneticPredictionModels.GetPredictionModelDiscreteTraitAccuracyInfoBytes(traitName)
|
|
if (err != nil) { return false, false, "", 0, 0, 0, err }
|
|
|
|
modelTraitAccuracyInfoMap, err := DecodeBytesToDiscreteTraitPredictionAccuracyInfoMap(modelTraitAccuracyInfoFile)
|
|
if (err != nil) { return false, false, "", 0, 0, 0, err }
|
|
|
|
// We find the model trait accuracy info object that is the most similar to our predicted outcome
|
|
|
|
getPredictionAccuracy := func()int{
|
|
|
|
totalNumberOfTraitLoci := len(traitRSIDsList)
|
|
|
|
proportionOfLociTested := float64(quantityOfLociKnown)/float64(totalNumberOfTraitLoci)
|
|
percentageOfLociTested := int(proportionOfLociTested * 100)
|
|
|
|
proportionOfPhasedLoci := float64(quantityOfPhasedLoci)/float64(totalNumberOfTraitLoci)
|
|
percentageOfPhasedLoci := int(proportionOfPhasedLoci * 100)
|
|
|
|
// This is a value between 0 and 100 that represents the most likely accuracy probability for this prediction
|
|
closestPredictionAccuracy := 0
|
|
|
|
// This is a value that represents the distance our closest prediction accuracy has from the current prediction
|
|
// Consider each prediction accuracy value on an (X,Y) coordinate plane
|
|
// X = Number of loci tested
|
|
// Y = Number of phased loci
|
|
closestPredictionAccuracyDistance := float64(0)
|
|
|
|
anyOutcomeAccuracyFound := false
|
|
|
|
for traitOutcomeInfo, traitPredictionAccuracyInfo := range modelTraitAccuracyInfoMap{
|
|
|
|
outcomeName := traitOutcomeInfo.OutcomeName
|
|
if (outcomeName != predictedOutcomeName){
|
|
continue
|
|
}
|
|
|
|
probabilityOfCorrectOutcomePrediction := traitPredictionAccuracyInfo.ProbabilityOfCorrectOutcomePrediction
|
|
|
|
currentPercentageOfLociTested := traitOutcomeInfo.PercentageOfLociTested
|
|
currentPercentageOfPhasedLoci := traitOutcomeInfo.PercentageOfPhasedLoci
|
|
|
|
// Distance Formula for 2 coordinates (x1, y1) and (x2, y2):
|
|
// distance = √((x2 - x1)^2 + (y2 - y1)^2)
|
|
|
|
differenceInX := float64(currentPercentageOfLociTested - percentageOfLociTested)
|
|
differenceInY := float64(currentPercentageOfPhasedLoci - percentageOfPhasedLoci)
|
|
|
|
distance := math.Sqrt(math.Pow(differenceInX, 2) + math.Pow(differenceInY, 2))
|
|
|
|
if (distance == 0){
|
|
// We found the exact prediction accuracy
|
|
return probabilityOfCorrectOutcomePrediction
|
|
}
|
|
|
|
if (anyOutcomeAccuracyFound == false){
|
|
closestPredictionAccuracyDistance = distance
|
|
closestPredictionAccuracy = probabilityOfCorrectOutcomePrediction
|
|
anyOutcomeAccuracyFound = true
|
|
continue
|
|
} else {
|
|
if (distance < closestPredictionAccuracyDistance){
|
|
closestPredictionAccuracyDistance = distance
|
|
closestPredictionAccuracy = probabilityOfCorrectOutcomePrediction
|
|
}
|
|
}
|
|
}
|
|
|
|
if (anyOutcomeAccuracyFound == false){
|
|
// This means that our model has never actually predicted this outcome
|
|
// This shouldn't happen unless our model is really bad, or our training set has very few people with this outcome.
|
|
// We return a 0% accuracy rating
|
|
return 0
|
|
}
|
|
|
|
return closestPredictionAccuracy
|
|
}
|
|
|
|
predictionAccuracy := getPredictionAccuracy()
|
|
|
|
return true, true, predictedOutcomeName, predictionAccuracy, quantityOfLociKnown, quantityOfPhasedLoci, nil
|
|
}
|
|
|
|
// This function is used to predict numeric traits and polygenic disease risk scores
|
|
//Outputs:
|
|
// -bool: Neural network model exists for this attribute (neural network prediction is possible for this attribute)
|
|
// -bool: Attribute prediction is possible for this user (User has at least 1 known attribute locus value)
|
|
// -float64: Predicted attribute outcome (Example: Height in centimeters)
|
|
// -map[int]float64: Accuracy ranges map
|
|
// -Map Structure: Probability prediction is accurate (X) -> Distance from prediction that must be travelled in both directions to
|
|
// create a range in which the true value will fall into, X% of the time
|
|
// -int: Quantity of loci known
|
|
// -int: Quantity of phased loci
|
|
// -error
|
|
func GetNeuralNetworkNumericAttributePredictionFromGenomeMap(attributeName string, attributeLociList []int64, genomeMap map[int64]locusValue.LocusValue)(bool, bool, float64, map[int]float64, int, int, error){
|
|
|
|
predictionModelExists, predictionModelBytes := geneticPredictionModels.GetGeneticPredictionModelBytes(attributeName)
|
|
if (predictionModelExists == false){
|
|
// Prediction is not possible for this attribute
|
|
return false, false, 0, nil, 0, 0, nil
|
|
}
|
|
|
|
if (len(attributeLociList) == 0){
|
|
return false, false, 0, nil, 0, 0, errors.New("GetNeuralNetworkNumericAttributePredictionFromGenomeMap called with empty attributeLociList for attribute with an existing neural network.")
|
|
}
|
|
|
|
attributeLociListCopy := slices.Clone(attributeLociList)
|
|
slices.Sort(attributeLociListCopy)
|
|
|
|
neuralNetworkInput, quantityOfLociKnown, quantityOfPhasedLoci, err := createInputNeuralNetworkLayerFromGenomeMap(attributeLociListCopy, genomeMap)
|
|
if (err != nil) { return false, false, 0, nil, 0, 0, err }
|
|
|
|
if (quantityOfLociKnown == 0){
|
|
// We can't predict anything about this attribute for this genome
|
|
return true, false, 0, nil, 0, 0, nil
|
|
}
|
|
|
|
neuralNetworkObject, err := DecodeBytesToNeuralNetworkObject(predictionModelBytes)
|
|
if (err != nil) { return false, false, 0, nil, 0, 0, err }
|
|
|
|
outputLayer, err := GetNeuralNetworkRawPrediction(&neuralNetworkObject, true, neuralNetworkInput)
|
|
if (err != nil) { return false, false, 0, nil, 0, 0, err }
|
|
|
|
predictedOutcomeValue, err := GetNumericOutcomeValueFromOutputLayer(attributeName, outputLayer)
|
|
if (err != nil) { return false, false, 0, nil, 0, 0, err }
|
|
|
|
modelAccuracyInfoFile, err := geneticPredictionModels.GetPredictionModelNumericAttributeAccuracyInfoBytes(attributeName)
|
|
if (err != nil) { return false, false, 0, nil, 0, 0, err }
|
|
|
|
modelAccuracyInfoMap, err := DecodeBytesToNumericAttributePredictionAccuracyInfoMap(modelAccuracyInfoFile)
|
|
if (err != nil) { return false, false, 0, nil, 0, 0, err }
|
|
|
|
// We create a prediction confidence ranges map for our prediction
|
|
|
|
getPredictionConfidenceRangesMap := func()map[int]float64{
|
|
|
|
totalNumberOfAttributeLoci := len(attributeLociListCopy)
|
|
|
|
proportionOfLociTested := float64(quantityOfLociKnown)/float64(totalNumberOfAttributeLoci)
|
|
percentageOfLociTested := int(proportionOfLociTested * 100)
|
|
|
|
proportionOfPhasedLoci := float64(quantityOfPhasedLoci)/float64(totalNumberOfAttributeLoci)
|
|
percentageOfPhasedLoci := int(proportionOfPhasedLoci * 100)
|
|
|
|
// This is a value between 0 and 100 that represents the most similar confidence ranges map for this prediction
|
|
var closestPredictionConfidenceRangesMap map[int]float64
|
|
|
|
// This is a value that represents the distance our closest prediction confidence ranges map has from the current prediction
|
|
// Consider each prediction accuracy value on an (X,Y) coordinate plane
|
|
// X = Number of loci tested
|
|
// Y = Number of phased loci
|
|
closestPredictionConfidenceRangesMapDistance := float64(0)
|
|
|
|
for attributeOutcomeInfo, attributePredictionConfidenceRangesMap := range modelAccuracyInfoMap{
|
|
|
|
currentPercentageOfLociTested := attributeOutcomeInfo.PercentageOfLociTested
|
|
currentPercentageOfPhasedLoci := attributeOutcomeInfo.PercentageOfPhasedLoci
|
|
|
|
// Distance Formula for 2 coordinates (x1, y1) and (x2, y2):
|
|
// distance = √((x2 - x1)^2 + (y2 - y1)^2)
|
|
|
|
differenceInX := float64(currentPercentageOfLociTested - percentageOfLociTested)
|
|
differenceInY := float64(currentPercentageOfPhasedLoci - percentageOfPhasedLoci)
|
|
|
|
distance := math.Sqrt(math.Pow(differenceInX, 2) + math.Pow(differenceInY, 2))
|
|
|
|
if (distance == 0){
|
|
// We found the exact prediction confidence ranges map
|
|
return attributePredictionConfidenceRangesMap
|
|
}
|
|
|
|
if (closestPredictionConfidenceRangesMap == nil || distance < closestPredictionConfidenceRangesMapDistance){
|
|
closestPredictionConfidenceRangesMapDistance = distance
|
|
closestPredictionConfidenceRangesMap = attributePredictionConfidenceRangesMap
|
|
}
|
|
}
|
|
|
|
return closestPredictionConfidenceRangesMap
|
|
}
|
|
|
|
predictionConfidenceRangesMap := getPredictionConfidenceRangesMap()
|
|
|
|
return true, true, predictedOutcomeValue, predictionConfidenceRangesMap, quantityOfLociKnown, quantityOfPhasedLoci, nil
|
|
}
|
|
|
|
|
|
//Outputs:
|
|
// -[]float32: Input layer for neural network
|
|
// -int: Quantity of known loci
|
|
// -int: Quantity of phased loci
|
|
// -error
|
|
func createInputNeuralNetworkLayerFromGenomeMap(rsidsList []int64, genomeMap map[int64]locusValue.LocusValue)([]float32, int, int, error){
|
|
|
|
// In the inputLayer, each locus value is represented by 3 neurons:
|
|
// 1. LocusExists/LocusIsPhased
|
|
// -0 = Locus value is unknown
|
|
// -0.5 = Locus Is known, phase is unknown
|
|
// -1 = Locus Is Known, phase is known
|
|
// 2. Allele1 Locus Value (Value between 0-1)
|
|
// -0 = Value is unknown
|
|
// 3. Allele2 Locus Value (Value between 0-1)
|
|
// -0 = Value is unknown
|
|
//
|
|
neuralNetworkInput := make([]float32, 0)
|
|
|
|
quantityOfLociKnown := 0
|
|
quantityOfPhasedLoci := 0
|
|
|
|
for _, rsID := range rsidsList{
|
|
|
|
userLocusValue, exists := genomeMap[rsID]
|
|
if (exists == false){
|
|
neuralNetworkInput = append(neuralNetworkInput, 0, 0, 0)
|
|
continue
|
|
}
|
|
|
|
quantityOfLociKnown += 1
|
|
|
|
locusAllele1 := userLocusValue.Base1Value
|
|
locusAllele2 := userLocusValue.Base2Value
|
|
locusIsPhased := userLocusValue.LocusIsPhased
|
|
|
|
getNeuron1 := func()float32{
|
|
if (locusIsPhased == false){
|
|
return 0.5
|
|
}
|
|
|
|
quantityOfPhasedLoci += 1
|
|
return 1
|
|
}
|
|
|
|
neuron1 := getNeuron1()
|
|
|
|
neuron2, err := convertAlleleToNeuron(locusAllele1)
|
|
if (err != nil) { return nil, 0, 0, err }
|
|
|
|
neuron3, err := convertAlleleToNeuron(locusAllele2)
|
|
if (err != nil) { return nil, 0, 0, err }
|
|
|
|
neuralNetworkInput = append(neuralNetworkInput, neuron1, neuron2, neuron3)
|
|
}
|
|
|
|
return neuralNetworkInput, quantityOfLociKnown, quantityOfLociKnown, nil
|
|
}
|
|
|
|
//Outputs:
|
|
// -int: Number of loci values that are known
|
|
// -int: Number of loci values that are known and phased
|
|
// -int: Number of loci
|
|
// -error
|
|
func GetLociInfoFromNetworkInputLayer(inputLayer []float32)(int, int, int, error){
|
|
|
|
// Each input layer has 3 neurons for each locus
|
|
// Each rsID (locus) is represented by 3 neurons: LocusExists/LocusIsPhased, Allele1 Value, Allele2 Value
|
|
// The LocusExists/LocusIsPhased neuron stores information like so:
|
|
// -0 = Locus value is unknown
|
|
// -0.5 = Locus Is known, phase is unknown
|
|
// -1 = Locus Is Known, phase is known
|
|
// Each rsID's neurons are concatenated together to form the inputLayer
|
|
|
|
inputLayerLength := len(inputLayer)
|
|
|
|
if (inputLayerLength%3 != 0){
|
|
return 0, 0, 0, errors.New("GetLociInfoFromNetworkInputLayer called with invalid length input layer: Not evenly divisible by 4.")
|
|
}
|
|
|
|
numberOfLoci := len(inputLayer)/3
|
|
|
|
numberOfLociValuesThatAreKnown := 0
|
|
numberOfLociValuesThatAreKnownAndPhased := 0
|
|
|
|
for index, neuronValue := range inputLayer{
|
|
|
|
indexRemainder := index%3
|
|
|
|
if (indexRemainder == 0){
|
|
|
|
if (neuronValue == 0){
|
|
continue
|
|
}
|
|
|
|
numberOfLociValuesThatAreKnown += 1
|
|
|
|
/// We use an inequality instead of ==1 because floats are imprecise
|
|
if (neuronValue > 0.99){
|
|
numberOfLociValuesThatAreKnown += 1
|
|
numberOfLociValuesThatAreKnownAndPhased += 1
|
|
}
|
|
}
|
|
}
|
|
|
|
if (numberOfLociValuesThatAreKnown == 0){
|
|
return 0, 0, 0, errors.New("GetLociInfoFromNetworkInputLayer called with input layer with no known loci values.")
|
|
}
|
|
|
|
return numberOfLociValuesThatAreKnown, numberOfLociValuesThatAreKnownAndPhased, numberOfLoci, nil
|
|
}
|
|
|
|
// This function returns which outcome is being described from a neural network's final output layer
|
|
// This is only used for discrete traits
|
|
// Outputs:
|
|
// -string: Output Name (Example: "Blue")
|
|
// -error
|
|
func GetDiscreteOutcomeNameFromOutputLayer(traitName string, verifyOutputLayer bool, outputLayer []float32)(string, error){
|
|
|
|
if (verifyOutputLayer == true){
|
|
|
|
// We make sure all neurons sum to 1
|
|
|
|
summedNeurons := float32(0)
|
|
|
|
for _, neuronValue := range outputLayer{
|
|
summedNeurons += neuronValue
|
|
}
|
|
|
|
// We allow a small amount of inaccuracy due to the imprecise nature of floats.
|
|
if (summedNeurons > 1.01 || summedNeurons < .99){
|
|
summedNeuronsString := helpers.ConvertFloat32ToString(summedNeurons)
|
|
return "", errors.New("GetOutcomeNameFromOutputLayer called with layer containing neuron values which don't sum to 1: " + summedNeuronsString)
|
|
}
|
|
}
|
|
|
|
getBiggestNeuronIndex := func()int{
|
|
|
|
biggestNeuronValue := float32(0)
|
|
biggestNeuronIndex := 0
|
|
|
|
for index, neuronValue := range outputLayer{
|
|
|
|
if (index == 0){
|
|
biggestNeuronValue = neuronValue
|
|
} else {
|
|
|
|
if (neuronValue > biggestNeuronValue){
|
|
biggestNeuronValue = neuronValue
|
|
biggestNeuronIndex = index
|
|
}
|
|
}
|
|
}
|
|
|
|
return biggestNeuronIndex
|
|
}
|
|
|
|
biggestNeuronIndex := getBiggestNeuronIndex()
|
|
|
|
switch traitName{
|
|
|
|
case "Eye Color":{
|
|
|
|
if (len(outputLayer) != 4){
|
|
return "", errors.New("GetOutcomeNameFromOutputLayer called with invalid length output layer.")
|
|
}
|
|
|
|
switch biggestNeuronIndex{
|
|
case 0:{
|
|
return "Blue", nil
|
|
}
|
|
case 1:{
|
|
return "Green", nil
|
|
}
|
|
case 2:{
|
|
return "Hazel", nil
|
|
}
|
|
case 3:{
|
|
return "Brown", nil
|
|
}
|
|
}
|
|
}
|
|
case "Lactose Tolerance":{
|
|
|
|
if (len(outputLayer) != 2){
|
|
return "", errors.New("GetOutcomeNameFromOutputLayer called with invalid length output layer.")
|
|
}
|
|
|
|
switch biggestNeuronIndex{
|
|
case 0:{
|
|
return "Tolerant", nil
|
|
}
|
|
case 1:{
|
|
return "Intolerant", nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return "", errors.New("GetOutcomeNameFromOutputLayer called with unknown traitName: " + traitName)
|
|
}
|
|
|
|
|
|
// This function returns which outcome is being described from a neural network's final output layer
|
|
// This is only used for numeric traits and polygenic diseases
|
|
// Outputs:
|
|
// -float64: Output Value (example: 150 centimeters)
|
|
// -error
|
|
func GetNumericOutcomeValueFromOutputLayer(attributeName string, outputLayer []float32)(float64, error){
|
|
|
|
if (len(outputLayer) != 1){
|
|
return 0, errors.New("GetNumericOutcomeValueFromOutputLayer called with output layer which is not length of 1")
|
|
}
|
|
|
|
outputNeuron := outputLayer[0]
|
|
|
|
if (outputNeuron < 0 || outputNeuron > 1){
|
|
return 0, errors.New("GetNumericOutcomeValueFromOutputLayer called with output layer contains out-of-bounds neuron")
|
|
}
|
|
|
|
getOutcomeMinAndMax := func()(float64, float64, error){
|
|
|
|
switch attributeName{
|
|
case "Height":{
|
|
// Shortest person of all time: 54 cm
|
|
// Tallest person of all time: 272 cm
|
|
return 54, 272, nil
|
|
}
|
|
case "Autism",
|
|
"Homosexualness",
|
|
"Obesity":{
|
|
return 0, 10, nil
|
|
}
|
|
}
|
|
|
|
return 0, 0, errors.New("GetNumericOutcomeValueFromOutputLayer called with unknown attributeName: " + attributeName)
|
|
}
|
|
|
|
outcomeMin, outcomeMax, err := getOutcomeMinAndMax()
|
|
if (err != nil) { return 0, err }
|
|
|
|
outcomeValue, err := helpers.ScaleFloat64Proportionally(true, float64(outputNeuron), 0, 1, outcomeMin, outcomeMax)
|
|
if (err != nil) { return 0, err }
|
|
|
|
return outcomeValue, nil
|
|
}
|
|
|
|
//Outputs:
|
|
// -int: Layer 1 neuron count (input layer)
|
|
// -int: Layer 2 neuron count
|
|
// -int: Layer 3 neuron count
|
|
// -int: Layer 4 neuron count (output layer)
|
|
// -error
|
|
func getNeuralNetworkLayerSizes(attributeName string)(int, int, int, int, error){
|
|
|
|
switch attributeName{
|
|
|
|
case "Eye Color":{
|
|
|
|
// There are 282 input neurons
|
|
// There are 4 output neurons, each representing a color
|
|
// There are 4 colors: Blue, Green, Brown, Hazel
|
|
|
|
return 282, 100, 50, 4, nil
|
|
}
|
|
case "Lactose Tolerance":{
|
|
|
|
// There are 6 input neurons
|
|
// There are 2 output neurons, each representing a tolerance: Tolerant, Intolerant
|
|
return 6, 4, 3, 2, nil
|
|
}
|
|
case "Height":{
|
|
// There are 3000 input neurons
|
|
// There is 1 output neuron, representing a height value
|
|
return 3000, 3, 2, 1, nil
|
|
}
|
|
case "Autism":{
|
|
// There are 1428 input neurons
|
|
// There is 1 output neuron, representing an autism value
|
|
return 1428, 3, 2, 1, nil
|
|
}
|
|
case "Homosexualness":{
|
|
// There are 12 input neurons
|
|
// There is 1 output neuron, representing a homosexualness value
|
|
return 12, 10, 5, 1, nil
|
|
}
|
|
case "Obesity":{
|
|
// There are 3000 input neurons
|
|
// There is 1 output neuron, representing an obesity value
|
|
return 3000, 3, 2, 1, nil
|
|
}
|
|
}
|
|
|
|
return 0, 0, 0, 0, errors.New("getNeuralNetworkLayerSizes called with unknown attributeName: " + attributeName)
|
|
}
|
|
|
|
//This function converts a genome allele to a neuron to use in a tensor
|
|
// A value of 0 means that the allele is unknown
|
|
func convertAlleleToNeuron(allele string)(float32, error){
|
|
|
|
switch allele{
|
|
|
|
case "C":{
|
|
|
|
return 0.16, nil
|
|
}
|
|
case "A":{
|
|
|
|
return 0.32, nil
|
|
}
|
|
case "T":{
|
|
|
|
return 0.48, nil
|
|
}
|
|
case "G":{
|
|
|
|
return 0.64, nil
|
|
}
|
|
case "I":{
|
|
|
|
return 0.8, nil
|
|
}
|
|
case "D":{
|
|
|
|
return 1, nil
|
|
}
|
|
}
|
|
|
|
return 0, errors.New("convertAlleleToNeuron called with invalid allele: " + allele)
|
|
}
|
|
|
|
|
|
// This function returns training data to use to train each neural network prediction model
|
|
// Outputs:
|
|
// -bool: User has phenotype data and enough loci to train model
|
|
// -[]TrainingData: List of TrainingData for the user which we will use to train the model
|
|
// -error
|
|
func CreateGeneticPredictionTrainingData_OpenSNP(
|
|
attributeName string,
|
|
userPhenotypeDataObject readBiobankData.PhenotypeData_OpenSNP,
|
|
userLocusValuesMap map[int64]locusValue.LocusValue)(bool, []TrainingData, error){
|
|
|
|
//Outputs:
|
|
// -[]int64: Attribute rsIDs list
|
|
// -error
|
|
getAttributeLociList := func()([]int64, error){
|
|
|
|
switch attributeName{
|
|
|
|
case "Eye Color",
|
|
"Lactose Tolerance",
|
|
"Height",
|
|
"Homosexualness":{
|
|
|
|
traitObject, err := traits.GetTraitObject(attributeName)
|
|
if (err != nil) { return nil, err }
|
|
|
|
// This is a list of rsIDs which influence this trait
|
|
traitLociList := traitObject.LociList
|
|
|
|
return traitLociList, nil
|
|
}
|
|
case "Autism",
|
|
"Obesity":{
|
|
|
|
diseaseObject, err := polygenicDiseases.GetPolygenicDiseaseObject(attributeName)
|
|
if (err != nil) { return nil, err }
|
|
|
|
// This is a list of rsIDs which influence this disease
|
|
diseaseLociList := diseaseObject.LociList
|
|
|
|
return diseaseLociList, nil
|
|
}
|
|
}
|
|
|
|
return nil, errors.New("CreateGeneticPredictionTrainingData_OpenSNP called with unknown attributeName: " + attributeName)
|
|
}
|
|
|
|
attributeLociList, err := getAttributeLociList()
|
|
if (err != nil) { return false, nil, err }
|
|
|
|
if (len(attributeLociList) == 0){
|
|
return false, nil, errors.New("getAttributeLociList returning empty attributeLociList.")
|
|
}
|
|
|
|
// Each layer is represented as a []float32
|
|
// Each float is a value between 0 and 1
|
|
//
|
|
// Each TrainingData holds a variation of the user's genome rsID values
|
|
// We add many rows with withheld data to improve training data
|
|
|
|
numberOfInputLayerRows, _, _, numberOfOutputLayerRows, err := getNeuralNetworkLayerSizes(attributeName)
|
|
if (err != nil) { return false, nil, err }
|
|
|
|
// Each rsID is represented by 3 neurons: LocusExists/LocusIsPhased, Allele1 Value, Allele2 Value
|
|
// The LocusExists/LocusIsPhased neuron stores information like so:
|
|
// -0 = Locus value is unknown
|
|
// -0.5 = Locus Is known, phase is unknown
|
|
// -1 = Locus Is Known, phase is known
|
|
expectedNumberOfInputLayerRows := len(attributeLociList) * 3
|
|
|
|
if (numberOfInputLayerRows != expectedNumberOfInputLayerRows){
|
|
|
|
expectedNumberOfInputLayerRowsString := helpers.ConvertIntToString(expectedNumberOfInputLayerRows)
|
|
|
|
return false, nil, errors.New("numberOfInputLayerRows is not expected: " + expectedNumberOfInputLayerRowsString)
|
|
}
|
|
|
|
checkIfAnyAttributeLocusValuesExist := func()bool{
|
|
|
|
for _, rsID := range attributeLociList{
|
|
|
|
_, exists := userLocusValuesMap[rsID]
|
|
if (exists == true){
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
anyAttributeLocusValuesExist := checkIfAnyAttributeLocusValuesExist()
|
|
if (anyAttributeLocusValuesExist == false){
|
|
// The user's genome does not contain any of this attribute's locus values
|
|
// We will not train on their data
|
|
return false, nil, nil
|
|
}
|
|
|
|
// We sort rsIDs in ascending order
|
|
|
|
attributeLociListCopy := slices.Clone(attributeLociList)
|
|
slices.Sort(attributeLociListCopy)
|
|
|
|
// This function returns the outputLayer for all trainingDatas for this user
|
|
// Each outputLayer represents the user's attribute value (Example: "Blue" for Eye Color)
|
|
// Each outputLayer is identical, because each TrainingData example belongs to the same user
|
|
//
|
|
// Outputs:
|
|
// -bool: User attribute value is known
|
|
// -[]float32: Neuron values for layer
|
|
// -error
|
|
getUserAttributeValueNeurons := func()(bool, []float32, error){
|
|
|
|
switch attributeName{
|
|
|
|
case "Eye Color":{
|
|
|
|
userEyeColorIsKnown := userPhenotypeDataObject.EyeColorIsKnown
|
|
if (userEyeColorIsKnown == false){
|
|
return false, nil, nil
|
|
}
|
|
|
|
userEyeColor := userPhenotypeDataObject.EyeColor
|
|
|
|
if (userEyeColor == "Blue"){
|
|
|
|
return true, []float32{1, 0, 0, 0}, nil
|
|
|
|
} else if (userEyeColor == "Green"){
|
|
|
|
return true, []float32{0, 1, 0, 0}, nil
|
|
|
|
} else if (userEyeColor == "Hazel"){
|
|
|
|
return true, []float32{0, 0, 1, 0}, nil
|
|
|
|
} else if (userEyeColor == "Brown"){
|
|
|
|
return true, []float32{0, 0, 0, 1}, nil
|
|
}
|
|
|
|
return false, nil, errors.New("Malformed userPhenotypeDataObject: Invalid eyeColor: " + userEyeColor)
|
|
}
|
|
case "Lactose Tolerance":{
|
|
|
|
userLactoseToleranceIsKnown := userPhenotypeDataObject.LactoseToleranceIsKnown
|
|
if (userLactoseToleranceIsKnown == false){
|
|
return false, nil, nil
|
|
}
|
|
|
|
userLactoseTolerance := userPhenotypeDataObject.LactoseTolerance
|
|
|
|
if (userLactoseTolerance == true){
|
|
|
|
return true, []float32{1, 0}, nil
|
|
}
|
|
|
|
return true, []float32{0, 1}, nil
|
|
}
|
|
case "Height":{
|
|
|
|
userHeightIsKnown := userPhenotypeDataObject.HeightIsKnown
|
|
if (userHeightIsKnown == false){
|
|
return false, nil, nil
|
|
}
|
|
|
|
userHeight := userPhenotypeDataObject.Height
|
|
|
|
// Shortest person of all time: 54 cm
|
|
// Tallest person of all time: 272 cm
|
|
|
|
outputValue, err := helpers.ScaleFloat64Proportionally(true, userHeight, 54, 272, 0, 1)
|
|
if (err != nil) { return false, nil, err }
|
|
|
|
outputValueFloat32 := float32(outputValue)
|
|
|
|
outputLayer := []float32{outputValueFloat32}
|
|
|
|
return true, outputLayer, nil
|
|
}
|
|
case "Autism":{
|
|
|
|
userAutismIsKnown := userPhenotypeDataObject.AutismIsKnown
|
|
if (userAutismIsKnown == false){
|
|
return false, nil, nil
|
|
}
|
|
|
|
userAutism := userPhenotypeDataObject.Autism
|
|
|
|
outputValueFloat32 := float32(userAutism)
|
|
|
|
outputLayer := []float32{outputValueFloat32}
|
|
|
|
return true, outputLayer, nil
|
|
}
|
|
case "Homosexualness":{
|
|
|
|
userHomosexualnessIsKnown := userPhenotypeDataObject.HomosexualnessIsKnown
|
|
if (userHomosexualnessIsKnown == false){
|
|
return false, nil, nil
|
|
}
|
|
|
|
userHomosexualness := userPhenotypeDataObject.Homosexualness
|
|
|
|
outputValueFloat32 := float32(userHomosexualness)
|
|
|
|
outputLayer := []float32{outputValueFloat32}
|
|
|
|
return true, outputLayer, nil
|
|
}
|
|
case "Obesity":{
|
|
|
|
userObesityIsKnown := userPhenotypeDataObject.ObesityIsKnown
|
|
if (userObesityIsKnown == false){
|
|
return false, nil, nil
|
|
}
|
|
|
|
userObesity := userPhenotypeDataObject.Obesity
|
|
|
|
outputValueFloat32 := float32(userObesity)
|
|
|
|
outputLayer := []float32{outputValueFloat32}
|
|
|
|
return true, outputLayer, nil
|
|
}
|
|
}
|
|
|
|
return false, nil, errors.New("Unknown attributeName: " + attributeName)
|
|
}
|
|
|
|
userAttributeValueExists, userAttributeValueNeurons, err := getUserAttributeValueNeurons()
|
|
if (err != nil) { return false, nil, err }
|
|
if (userAttributeValueExists == false){
|
|
// User cannot be used to train the model.
|
|
// They do not have a value for this attribute.
|
|
return false, nil, nil
|
|
}
|
|
|
|
if (len(userAttributeValueNeurons) != numberOfOutputLayerRows){
|
|
return false, nil, errors.New("getUserAttributeValueNeurons returning invalid length layer slice.")
|
|
}
|
|
|
|
// We want the initial training data to be the same for each call of this function that has the same input parameters
|
|
// This is a necessary step so our neural network models will be reproducable
|
|
// Reproducable means that other people can run the code and produce the same models, byte-for-byte
|
|
|
|
pseudorandomNumberGenerator := mathRand.New(mathRand.NewPCG(1, 2))
|
|
|
|
// We create 110 examples per user.
|
|
// We randomize allele order whenever phase for the locus is unknown
|
|
// 50% of the time we randomize allele order even when phase is known to train the model on unphased data
|
|
// Unphased data is data where the order each allele (Example: G;A) has no meaning, because phase data was not captured
|
|
// We randomize allele order to simulate unphased data
|
|
// For example, if a user's genome is phased, we will randomize the base pair order and set the LocusIsPhased neuron to false
|
|
//
|
|
// Examples 0-10: 100% of the user's loci are used
|
|
// Examples 11-30: 90% of the user's loci are used
|
|
// Examples 31-50: 70% of the user's loci are used
|
|
// Examples 51-70: 50% of the user's loci are used
|
|
// Examples 71-90: 30% of the user's loci are used
|
|
// Examples 91-110: 10% of the user's loci are used
|
|
|
|
// We now add this user's data to the trainingDataList
|
|
|
|
trainingDataList := make([]TrainingData, 0, 110)
|
|
|
|
for i:=0; i < 110; i++{
|
|
|
|
getRandomizePhaseBool := func()bool{
|
|
|
|
if (i%2 == 0){
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
randomizePhaseBool := getRandomizePhaseBool()
|
|
|
|
getProbabilityOfUsingLoci := func()float64{
|
|
|
|
if (i <= 10){
|
|
return 1
|
|
|
|
} else if (i <= 30){
|
|
return 0.9
|
|
|
|
} else if (i <= 50){
|
|
return 0.7
|
|
|
|
} else if (i <= 70){
|
|
return 0.5
|
|
|
|
} else if (i <= 90){
|
|
return 0.3
|
|
}
|
|
|
|
return 0.1
|
|
}
|
|
|
|
probabilityOfUsingLoci := getProbabilityOfUsingLoci()
|
|
|
|
// In the inputLayer, each locus value is represented by 3 neurons:
|
|
// 1. LocusExists/LocusIsPhased
|
|
// -0 = Locus value is unknown
|
|
// -0.5 = Locus Is known, phase is unknown
|
|
// -1 = Locus Is Known, phase is known
|
|
// 2. Allele1 Locus Value (Value between 0-1)
|
|
// -0 = Value is unknown
|
|
// 3. Allele2 Locus Value (Value between 0-1)
|
|
// -0 = Value is unknown
|
|
|
|
anyLocusExists := false
|
|
|
|
inputLayerLength := len(attributeLociListCopy) * 3
|
|
|
|
inputLayer := make([]float32, 0, inputLayerLength)
|
|
|
|
for _, rsID := range attributeLociListCopy{
|
|
|
|
randomFloat := pseudorandomNumberGenerator.Float64()
|
|
if (randomFloat > probabilityOfUsingLoci){
|
|
// This if statement has a !probabilityOfUsingLoci chance of being true.
|
|
// We are skipping this locus
|
|
inputLayer = append(inputLayer, 0, 0, 0)
|
|
continue
|
|
}
|
|
|
|
userLocusValue, exists := userLocusValuesMap[rsID]
|
|
if (exists == false){
|
|
// This user's locus value is unknown
|
|
inputLayer = append(inputLayer, 0, 0, 0)
|
|
continue
|
|
}
|
|
|
|
anyLocusExists = true
|
|
|
|
//Outputs:
|
|
// -float32: Final neuron value
|
|
// -0 == Value is unknown
|
|
// -0.5 == Value is known, phase is unknown
|
|
// -1 == Value is known, phase is known
|
|
// -string: Allele 1 value
|
|
// -string: Allele 2 value
|
|
getFirstNeuronAndLocusAlleles := func()(float32, string, string){
|
|
|
|
locusAllele1 := userLocusValue.Base1Value
|
|
locusAllele2 := userLocusValue.Base2Value
|
|
|
|
if (locusAllele1 == locusAllele2){
|
|
// Locus phase is unimportant
|
|
return 1, locusAllele1, locusAllele2
|
|
}
|
|
|
|
locusIsPhased := userLocusValue.LocusIsPhased
|
|
|
|
if (randomizePhaseBool == false && locusIsPhased == true){
|
|
return 1, locusAllele1, locusAllele2
|
|
}
|
|
|
|
// We randomize the phase of the locus
|
|
// We always do this if the locus is not phased, because the genome data might actually be partially/fully phased,
|
|
// even if it is not advertised as being phased
|
|
|
|
randomNumber := pseudorandomNumberGenerator.IntN(2)
|
|
if (randomNumber == 1){
|
|
// This has a 50% chance of being true.
|
|
return 0.5, locusAllele1, locusAllele2
|
|
}
|
|
|
|
return 0.5, locusAllele2, locusAllele1
|
|
}
|
|
|
|
locusIsKnownAndPhasedNeuronValue, locusAllele1, locusAllele2 := getFirstNeuronAndLocusAlleles()
|
|
|
|
locusAllele1NeuronValue, err := convertAlleleToNeuron(locusAllele1)
|
|
if (err != nil){ return false, nil, err }
|
|
locusAllele2NeuronValue, err := convertAlleleToNeuron(locusAllele2)
|
|
if (err != nil) { return false, nil, err }
|
|
|
|
inputLayer = append(inputLayer, locusIsKnownAndPhasedNeuronValue, locusAllele1NeuronValue, locusAllele2NeuronValue)
|
|
}
|
|
|
|
if (anyLocusExists == false){
|
|
// We have 0 known loci for this training example.
|
|
// We won't add it to the training data.
|
|
continue
|
|
}
|
|
|
|
userAttributeValueNeuronsCopy := slices.Clone(userAttributeValueNeurons)
|
|
|
|
newTrainingData := TrainingData{
|
|
InputLayer: inputLayer,
|
|
OutputLayer: userAttributeValueNeuronsCopy,
|
|
}
|
|
|
|
trainingDataList = append(trainingDataList, newTrainingData)
|
|
}
|
|
|
|
return true, trainingDataList, nil
|
|
}
|
|
|
|
func GetNewUntrainedNeuralNetworkObject(attributeName string)(*NeuralNetwork, error){
|
|
|
|
layer1NeuronCount, layer2NeuronCount, layer3NeuronCount, layer4NeuronCount, err := getNeuralNetworkLayerSizes(attributeName)
|
|
if (err != nil) { return nil, err }
|
|
|
|
// This is the graph object we add each layer to
|
|
newGraph := gorgonia.NewGraph()
|
|
|
|
// We want the initial weights to be the same for each call of this function that has the same input parameters
|
|
// This is a necessary step so our neural network models will be reproducable
|
|
// Reproducable means that other people can run the code and produce the same models, byte-for-byte
|
|
|
|
pseudorandomNumberGenerator := mathRand.New(mathRand.NewPCG(1, 2))
|
|
|
|
// A layer is a column of neurons
|
|
// Each neuron has an initial value between 0 and 1
|
|
getNewNeuralNetworkLayerWeights := func(layerName string, layerNeuronRows int, layerNeuronColumns int)*gorgonia.Node{
|
|
|
|
layerNameObject := gorgonia.WithName(layerName)
|
|
|
|
totalNumberOfNeurons := layerNeuronRows * layerNeuronColumns
|
|
|
|
layerInitialWeightsList := make([]float32, 0, totalNumberOfNeurons)
|
|
|
|
for i:=0; i < totalNumberOfNeurons; i++{
|
|
|
|
// We initialize the weights with He initialization
|
|
// He initialization = (0 +/- sqrt(2/n) where n is the number of nodes in the prior layer)
|
|
|
|
// pseudorandomNumberGenerator.Float32() returns a pseudo-random number between 0 and 1
|
|
|
|
newWeight := ((pseudorandomNumberGenerator.Float32()-0.5)*2) * float32(math.Sqrt(float64(2)/float64(layerNeuronRows)))
|
|
|
|
layerInitialWeightsList = append(layerInitialWeightsList, newWeight)
|
|
}
|
|
|
|
layerBacking := tensor.WithBacking(layerInitialWeightsList)
|
|
|
|
layerShape := tensor.WithShape(layerNeuronRows, layerNeuronColumns)
|
|
|
|
layerTensor := tensor.New(layerBacking, layerShape)
|
|
|
|
layerValueObject := gorgonia.WithValue(layerTensor)
|
|
|
|
layerObject := gorgonia.NewMatrix(newGraph, tensor.Float32, layerNameObject, layerValueObject)
|
|
|
|
return layerObject
|
|
}
|
|
|
|
layer1 := getNewNeuralNetworkLayerWeights("Weights1", layer1NeuronCount, layer2NeuronCount)
|
|
layer2 := getNewNeuralNetworkLayerWeights("Weights2", layer2NeuronCount, layer3NeuronCount)
|
|
layer3 := getNewNeuralNetworkLayerWeights("Weights3", layer3NeuronCount, layer4NeuronCount)
|
|
|
|
newNeuralNetworkObject := NeuralNetwork{
|
|
|
|
graph: newGraph,
|
|
|
|
weights1: layer1,
|
|
weights2: layer2,
|
|
weights3: layer3,
|
|
}
|
|
|
|
return &newNeuralNetworkObject, nil
|
|
}
|
|
|
|
// This function returns the weights of the neural network
|
|
// We need this for training
|
|
func (inputNetwork *NeuralNetwork)getLearnables()gorgonia.Nodes{
|
|
|
|
weights1 := inputNetwork.weights1
|
|
weights2 := inputNetwork.weights2
|
|
weights3 := inputNetwork.weights3
|
|
|
|
result := gorgonia.Nodes{weights1, weights2, weights3}
|
|
|
|
return result
|
|
}
|
|
|
|
|
|
// This function will train the neural network
|
|
// The function is passed a batch of TrainingData examples to train on
|
|
// Inputs:
|
|
// -string: Attribute Name
|
|
// -bool: Attribute is Numeric
|
|
// -An example of a numeric attribute is Height
|
|
// -An example of a discrete attribute is Eye Color, which has discrete outcomes (colors)
|
|
// -*NeuralNetwork
|
|
// -func()(bool, bool, TrainingData, error): Function to get the next training data.
|
|
// -Outputs:
|
|
// -bool: User stopped the training run
|
|
// -bool: Another training data exists
|
|
// -TrainingData: The next training data example
|
|
// -error
|
|
// Outputs:
|
|
// -bool: Process completed (was not stopped mid-way)
|
|
// -error
|
|
func TrainNeuralNetwork(attributeName string, attributeIsNumeric bool, neuralNetworkObject *NeuralNetwork, getNextTrainingData func()(bool, bool, TrainingData, error))(bool, error){
|
|
|
|
layer1NeuronCount, _, _, layer4NeuronCount, err := getNeuralNetworkLayerSizes(attributeName)
|
|
if (err != nil) { return false, err }
|
|
|
|
neuralNetworkGraph := neuralNetworkObject.graph
|
|
|
|
// We first create the input and output nodes
|
|
// They don't have any values yet.
|
|
|
|
trainingDataInputNode := gorgonia.NewMatrix(neuralNetworkGraph,
|
|
tensor.Float32,
|
|
gorgonia.WithName("Input"),
|
|
gorgonia.WithShape(1, layer1NeuronCount),
|
|
)
|
|
|
|
trainingDataExpectedOutputNode := gorgonia.NewMatrix(neuralNetworkGraph,
|
|
tensor.Float32,
|
|
gorgonia.WithName("ExpectedOutput"),
|
|
gorgonia.WithShape(1, layer4NeuronCount),
|
|
)
|
|
|
|
err = neuralNetworkObject.buildNeuralNetwork(trainingDataInputNode, attributeIsNumeric)
|
|
if (err != nil) { return false, err }
|
|
|
|
// This computes the loss (how accurate was our prediction)
|
|
losses, err := gorgonia.Sub(trainingDataExpectedOutputNode, neuralNetworkObject.prediction)
|
|
if (err != nil) { return false, err }
|
|
|
|
squareOfLosses, err := gorgonia.Square(losses)
|
|
if (err != nil) { return false, err }
|
|
|
|
// Cost is an average of the square of losses
|
|
cost, err := gorgonia.Mean(squareOfLosses)
|
|
if (err != nil) { return false, err }
|
|
|
|
neuralNetworkLearnables := neuralNetworkObject.getLearnables()
|
|
|
|
// Grad takes a scalar cost node and a list of with-regards-to, and returns the gradient
|
|
_, err = gorgonia.Grad(cost, neuralNetworkLearnables...)
|
|
if (err != nil) { return false, err }
|
|
|
|
bindDualValues := gorgonia.BindDualValues(neuralNetworkLearnables...)
|
|
|
|
// NewTapeMachine creates a Virtual Machine that compiles a graph into a prog.
|
|
virtualMachine := gorgonia.NewTapeMachine(neuralNetworkGraph, bindDualValues)
|
|
|
|
// This is the learn rate or step size for the solver.
|
|
learningRate := gorgonia.WithLearnRate(.01)
|
|
|
|
// This clips the gradient if it gets too crazy
|
|
// gradientClip := gorgonia.WithClip(.05)
|
|
|
|
solver := gorgonia.NewVanillaSolver(learningRate)
|
|
// solver := gorgonia.NewVanillaSolver(learningRate, gradientClip)
|
|
defer virtualMachine.Close()
|
|
|
|
for {
|
|
|
|
userStoppedTraining, nextDataExists, trainingDataObject, err := getNextTrainingData()
|
|
if (err != nil) { return false, err }
|
|
if (userStoppedTraining == true){
|
|
// User manually stopped the training run
|
|
return false, nil
|
|
}
|
|
if (nextDataExists == false){
|
|
// We are done training
|
|
break
|
|
}
|
|
|
|
// We convert our input training data slices to the type *Dense and assign them to our nodes
|
|
|
|
// This inputLayer contains the allele values for this training example
|
|
trainingDataInputLayer := trainingDataObject.InputLayer
|
|
|
|
// This outputLayer contains the phenotype for this training example (example: Eye color of Blue)
|
|
trainingDataOutputLayer := trainingDataObject.OutputLayer
|
|
|
|
inputTensorShapeObject := tensor.WithShape(1, layer1NeuronCount)
|
|
outputTensorShapeObject := tensor.WithShape(1, layer4NeuronCount)
|
|
|
|
inputTensorBacking := tensor.WithBacking(trainingDataInputLayer)
|
|
outputTensorBacking := tensor.WithBacking(trainingDataOutputLayer)
|
|
|
|
inputTensor := tensor.New(inputTensorShapeObject, inputTensorBacking)
|
|
outputTensor := tensor.New(outputTensorShapeObject, outputTensorBacking)
|
|
|
|
err = gorgonia.Let(trainingDataInputNode, inputTensor)
|
|
if (err != nil) { return false, err }
|
|
|
|
err = gorgonia.Let(trainingDataExpectedOutputNode, outputTensor)
|
|
if (err != nil) { return false, err }
|
|
|
|
err = virtualMachine.RunAll()
|
|
if (err != nil) { return false, err }
|
|
|
|
// NodesToValueGrads is a utility function that converts a Nodes to a slice of ValueGrad for the solver
|
|
valueGrads := gorgonia.NodesToValueGrads(neuralNetworkLearnables)
|
|
|
|
err = solver.Step(valueGrads)
|
|
if (err != nil) { return false, err }
|
|
|
|
virtualMachine.Reset()
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
|
|
// This function computes a raw prediction from the neural network
|
|
// Outputs:
|
|
// -[]float32: Output neurons
|
|
// -error
|
|
func GetNeuralNetworkRawPrediction(inputNeuralNetwork *NeuralNetwork, attributeIsNumeric bool, inputLayer []float32)([]float32, error){
|
|
|
|
neuralNetworkGraph := inputNeuralNetwork.graph
|
|
|
|
numberOfInputNeurons := len(inputLayer)
|
|
|
|
inputNode := gorgonia.NewMatrix(neuralNetworkGraph,
|
|
tensor.Float32,
|
|
gorgonia.WithName("Input"),
|
|
gorgonia.WithShape(1, numberOfInputNeurons),
|
|
)
|
|
|
|
// We convert the inputLayer []float32 to a tensor *Dense object
|
|
|
|
inputTensorShapeObject := tensor.WithShape(1, numberOfInputNeurons)
|
|
|
|
inputTensorBacking := tensor.WithBacking(inputLayer)
|
|
|
|
inputTensor := tensor.New(inputTensorShapeObject, inputTensorBacking)
|
|
|
|
err := gorgonia.Let(inputNode, inputTensor)
|
|
if (err != nil) { return nil, err }
|
|
|
|
|
|
err = inputNeuralNetwork.buildNeuralNetwork(inputNode, attributeIsNumeric)
|
|
if (err != nil){ return nil, err }
|
|
|
|
// Now we create a virtual machine to compute the prediction
|
|
|
|
neuralNetworkLearnables := inputNeuralNetwork.getLearnables()
|
|
|
|
bindDualValues := gorgonia.BindDualValues(neuralNetworkLearnables...)
|
|
|
|
virtualMachine := gorgonia.NewTapeMachine(neuralNetworkGraph, bindDualValues)
|
|
|
|
err = virtualMachine.RunAll()
|
|
if (err != nil) { return nil, err }
|
|
|
|
prediction := inputNeuralNetwork.prediction
|
|
|
|
predictionValues := prediction.Value().Data().([]float32)
|
|
|
|
return predictionValues, nil
|
|
}
|
|
|
|
|
|
// This function will take a neural network and input layer and build the network to be able to compute a prediction
|
|
// We need to run a virtual machine after calling this function in order for the prediction to be generated
|
|
func (inputNetwork *NeuralNetwork)buildNeuralNetwork(inputLayer *gorgonia.Node, predictionIsNumeric bool)error{
|
|
|
|
// We copy node pointer (says to do this in a resource i'm reading)
|
|
|
|
inputLayerCopy := inputLayer
|
|
|
|
// We multiply weights at each layer and perform ReLU (Rectification) after each multiplication
|
|
|
|
weights1 := inputNetwork.weights1
|
|
|
|
layer1Product, err := gorgonia.Mul(inputLayerCopy, weights1)
|
|
if (err != nil) {
|
|
return errors.New("Layer 1 multiplication failed: " + err.Error())
|
|
}
|
|
|
|
layer1ProductRectified, err := gorgonia.Rectify(layer1Product)
|
|
if (err != nil){
|
|
return errors.New("Layer 1 Rectify failed: " + err.Error())
|
|
}
|
|
|
|
weights2 := inputNetwork.weights2
|
|
|
|
layer2Product, err := gorgonia.Mul(layer1ProductRectified, weights2)
|
|
if (err != nil) {
|
|
return errors.New("Layer 2 multiplication failed: " + err.Error())
|
|
}
|
|
|
|
layer2ProductRectified, err := gorgonia.Rectify(layer2Product)
|
|
if (err != nil){
|
|
return errors.New("Layer 2 Rectify failed: " + err.Error())
|
|
}
|
|
|
|
weights3 := inputNetwork.weights3
|
|
|
|
layer3Product, err := gorgonia.Mul(layer2ProductRectified, weights3)
|
|
if (err != nil) {
|
|
return errors.New("Layer 3 multiplication failed: " + err.Error())
|
|
}
|
|
|
|
if (predictionIsNumeric == false){
|
|
|
|
// We SoftMax the output to get the prediction
|
|
|
|
prediction, err := gorgonia.SoftMax(layer3Product)
|
|
if (err != nil) {
|
|
return errors.New("SoftMax failed: " + err.Error())
|
|
}
|
|
|
|
inputNetwork.prediction = prediction
|
|
|
|
} else {
|
|
|
|
// We Sigmoid the output to get the prediction
|
|
|
|
prediction, err := gorgonia.Sigmoid(layer3Product)
|
|
if (err != nil) {
|
|
return errors.New("Sigmoid failed: " + err.Error())
|
|
}
|
|
|
|
inputNetwork.prediction = prediction
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
|