-
Notifications
You must be signed in to change notification settings - Fork 16
Create Duplication_model.slim #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
initialize() { | ||
initializeSLiMOptions(nucleotideBased=T); | ||
defineConstant("L_BASE", 1000000); // Genome length | ||
defineConstant("DUP_LENGTH", 10000); // Duplication size | ||
defineConstant("DUP_START", L_BASE); // Start position of duplication | ||
defineConstant("L", L_BASE + DUP_LENGTH); // Full Genome size with duplication allocation | ||
defineConstant("N", 500); // Population size | ||
|
||
//create a random sequence for the genome length | ||
//this is not used via randomNucleotide function because that generates a single string and we want it in a vector to be more easialy manipulated | ||
randomPart = sample(c("A", "T", "C", "G"), L_BASE, replace=T); | ||
|
||
//create a copy of the last nucleotides to make our duplication | ||
duplicatedPart = randomPart[(length(randomPart) - DUP_LENGTH) : (length(randomPart) - 1)]; | ||
|
||
//Combine randomPart and duplicatedPart to form full sequence and initialize ancestral sequence | ||
fullSequence = c(randomPart, duplicatedPart); | ||
initializeAncestralNucleotides(fullSequence); | ||
|
||
// Define mutation types | ||
initializeMutationTypeNuc("m1", 0.5, "f", 0.0); // Neutral mutations | ||
initializeMutationType("m2", 0.5, "f", 0.0); // Duplication marker | ||
m2.color = "red"; | ||
m2.convertToSubstitution = F; | ||
|
||
// Define genomic element | ||
initializeGenomicElementType("g1", m1, 1.0, mmJukesCantor(3.33e-8)); | ||
initializeGenomicElement(g1, 0, L - 1); | ||
|
||
// Set recombination rate | ||
initializeRecombinationRate(1e-8); | ||
} | ||
|
||
1 early() { | ||
defineConstant("simID", getSeed()); | ||
sim.addSubpop("p1", N); // Initialize population | ||
} | ||
|
||
recombination() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, this is the tricky part of the model obviously. Please add more comments to this code, about what the intention of the code in each part is, and why that is the intention of the code – what the underlying logic of the model design is. You've already got some commenting, which is great; expand on it. Explain the code to the reader, just as you would explain it if you were teaching this model to a class. I'm not talking whole paragraphs, but for each of the three branches, let's say two complete sentences, maybe. |
||
gm1 = genome1.containsMarkerMutation(m2, DUP_START); | ||
gm2 = genome2.containsMarkerMutation(m2, DUP_START); | ||
if (!(gm1 | gm2)) { | ||
// homozygote non-duplicated | ||
// Filter breakpoints that exceed the recombination limit | ||
// Since no genome contains the duplication the model needs to act as if those bases dont exist and thus dont place any break points there | ||
breakpoints = breakpoints[breakpoints < DUP_START]; | ||
return T; | ||
} | ||
if (gm1 & gm2) { | ||
//homozygot duplication | ||
// No modification to breakpoints necisary | ||
// Since both genomes contain the duplication and this model models the paralougs being right next to each other and at the end of the chromosome duplication is allowed as normal | ||
return F; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is correct; you have not modified the breakpoints in this code branch. |
||
} | ||
else { | ||
// heterozygote duplicated | ||
// Filter breakpoints that exceed the recombination limit | ||
// Since only one genome contains the duplication we dont want any breakpoints in the allocated space in the non duplicated genome, thus we give this the same treatment as the homozygote non-duplicated | ||
breakpoints = breakpoints[breakpoints < DUP_START]; | ||
return T; | ||
} | ||
} | ||
|
||
1: late(){ // Remove new mutations from SLiM that are inside unused duplication regions | ||
for (genome in p1.genomes) { | ||
// Check if the genome does NOT have the marker mutation m2 at DUP_START | ||
allMuts = 0; | ||
if (!(genome.containsMarkerMutation(m2, DUP_START))) { | ||
allMuts = genome.mutations; | ||
mutsToRemove = allMuts[allMuts.position >= DUP_START]; | ||
genome.removeMutations(mutsToRemove); | ||
} | ||
} | ||
} | ||
|
||
1000 late() { | ||
sim.outputFull(tempdir() + "burnin_" + simID + ".txt"); // Save state after burn-in | ||
catn("Burn-in phase complete. State saved."); | ||
} | ||
|
||
1001 late() { | ||
// Introduce duplication mutation in one individual | ||
duplicated = sample(p1.genomes, 1); | ||
duplicated.addNewDrawnMutation(m2, DUP_START); | ||
catn("Duplication mutation introduced at generation 10001."); | ||
|
||
// Define the original region | ||
originalRegion = c(DUP_START - DUP_LENGTH, DUP_START - 1); | ||
|
||
newPos = c(); | ||
newPoisition = c(); | ||
mutsToCopy = c(); | ||
|
||
// ----------------------------- | ||
// Step 1: Copy Substitutions | ||
// ----------------------------- | ||
substitutionsInRegion = sim.substitutions[sim.substitutions.position >= originalRegion[0] & sim.substitutions.position <= originalRegion[1]]; | ||
|
||
catn("Number of substitutions in original region: " + length(substitutionsInRegion)); | ||
|
||
for (sub in substitutionsInRegion) { | ||
newPos = sub.position + DUP_LENGTH; | ||
// Create a new mutation replicating the substitutions effect | ||
duplicated.addNewMutation(sub.mutationType, sub.selectionCoeff, newPos, p1, sub.nucleotide); | ||
catn("Copied substitution: Type=" + ", Position=" + newPos + ", Nucleotide=" + sub.nucleotide); | ||
} | ||
|
||
// ----------------------------- | ||
// Step 2: Copy Active Mutations | ||
// ----------------------------- | ||
mutsToCopy = duplicated.mutations[duplicated.mutations.position >= originalRegion[0] & duplicated.mutations.position <= originalRegion[1]]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This style of coding is hard to read and also inefficient, because the whole vector of mutations in duplicated has to be re-fetched (an expensive operation) each time that you have dupMuts = duplicated.mutations; Since this code will be re-run every time the sim reloads, this might possibly even matter a tiny bit to the speed of the model; but in any case, it's more readable and less bug-prone. (Whenever you have duplicated code, the possibility of a bug is introduced, because you might decide to change one of the copies, and forget to change the other copy; a very easy mistake to make.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, it's a little weird to make |
||
|
||
catn("Number of active mutations in original region: " + length(mutsToCopy)); | ||
|
||
for (mut in mutsToCopy) { | ||
newPosition = asInteger(mut.position + DUP_LENGTH); | ||
duplicated.addNewMutation(mut.mutationType, mut.selectionCoeff, newPosition, p1, mut.nucleotide); | ||
catn("Copied active mutation: Type=" + ", Position=" + newPosition + ", Nucleotide=" + mut.nucleotide); | ||
} | ||
|
||
// ----------------------------- | ||
// Step 3: Verify Duplication | ||
// ----------------------------- | ||
// Retrieve the nucleotide sequences for comparison | ||
originalSeq = duplicated.nucleotides(originalRegion[0], originalRegion[1]); | ||
duplicatedSeq = duplicated.nucleotides(originalRegion[0] + DUP_LENGTH, originalRegion[1] + DUP_LENGTH); | ||
if (originalSeq != duplicatedSeq) { | ||
stop("Duplicated region does not match original region. Check substitution copying logic!"); | ||
} | ||
|
||
catn(length(mutsToCopy) + " active mutations copied to duplicated region."); | ||
sim.outputFull(tempdir() + "afterDup_" + simID + ".txt"); // Save state after burn-in | ||
} | ||
|
||
|
||
1002: late() { | ||
// Check if m2 is lost | ||
if (sim.countOfMutationsOfType(m2) == 0) { | ||
catn("Duplication lost. Restarting from save-state..."); | ||
sim.readFromPopulationFile(tempdir() + "afterDup_" + simID + ".txt"); // Reset to after duplication event inorder to not lose substitutions that should be copied | ||
} | ||
} | ||
|
||
11000 late() { | ||
// check wheter the duplication fixated or at what amount its in the population | ||
count = p1.genomes.countOfMutationsOfType(m2); | ||
catn(sum(count) + " duplicated genes"); | ||
|
||
p1.outputVCFSample(300, filePath = "output.vcf", simplifyNucleotides = T); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For demonstration purposes, probably no reason for this to be nucleotide-based, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(But if you want it to be, it seems harmless...)