diff --git a/docs/part3/commonstatsmethods.md b/docs/part3/commonstatsmethods.md
index 78946e2ac70..3eb89187787 100644
--- a/docs/part3/commonstatsmethods.md
+++ b/docs/part3/commonstatsmethods.md
@@ -533,6 +533,8 @@ Failed to delete temporary file roostats-Sprxsw.root: No such file or directory
The result stored in the **limit** branch of the output tree will be the upper limit (and its error, stored in **limitErr**). The default behaviour will be, as above, to search for the upper limit on **r**. However, the values of $p_{\mu}, p_{b}$ and CLs can be calculated for a particular value **r=X** by specifying the option `--singlePoint=X`. In this case, the value stored in the branch **limit** will be the value of CLs (or $p_{\mu}$) (see the [FAQ](http://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/part4/usefullinks/#faq) section).
+In more complicated models, some of the toy datasets may lead to poor fits which result in spurious values of the test statistic. In general, it is advised to use diagnostic tools on the model/fits provided to avoid these poor fits; however, if the fraction of such results is small, the option `--removeOutliersForPValues X` can be used. This option removes test-statistic values from the test-statistic distributions that are further than $X\sigma$ from the median of the distribution, where $\sigma$ is the standard deviation of the distribution and $X$ is the value passed to the option, before calculating $p$-values and limits. Setting this to some reasonably large number (e.g. 50) should remove the spurious values. By default, no removal of outliers is performed.
+
#### Expected Limits
For simple models, we can run interactively 5 times to compute the median expected and the 68% and 95% central interval boundaries. For this, we can use the `HybridNew` method with the same options as for the observed limit, but adding a `--expectedFromGrid=`. Here, the quantile should be set to 0.5 for the median, 0.84 for the +ve side of the 68% band, 0.16 for the -ve side of the 68% band, 0.975 for the +ve side of the 95% band, and 0.025 for the -ve side of the 95% band.
diff --git a/interface/HybridNew.h b/interface/HybridNew.h
index b08b341ddec..e5bbc28cbf5 100644
--- a/interface/HybridNew.h
+++ b/interface/HybridNew.h
@@ -76,6 +76,7 @@ class HybridNew : public LimitAlgo {
static float adaptiveToys_;
static double EPS;
+ static float removeOulierForPvalThreshold_;
// graph, used to compute the limit, not just for plotting!
std::unique_ptr limitPlot_;
@@ -137,7 +138,7 @@ class HybridNew : public LimitAlgo {
void useGrid();
bool doFC_;
-
+ void removeOutliers(std::vector *sd, std::vector *sw);
};
#endif
diff --git a/src/HybridNew.cc b/src/HybridNew.cc
index 5345dc09d7d..9564f1cf0a2 100644
--- a/src/HybridNew.cc
+++ b/src/HybridNew.cc
@@ -98,6 +98,7 @@ float HybridNew::confidenceToleranceForToyScaling_ = 0.2;
float HybridNew::maxProbability_ = 0.999;
double HybridNew::EPS = 1e-4;
std::string HybridNew::mode_ = "";
+float HybridNew::removeOulierForPvalThreshold_ = -1.0;
HybridNew::HybridNew() :
LimitAlgo("HybridNew specific options") {
@@ -146,6 +147,7 @@ LimitAlgo("HybridNew specific options") {
("importantContours",boost::program_options::value(&scaleAndConfidenceSelection_)->default_value(scaleAndConfidenceSelection_), "Throw fewer toys far from interesting contours , format : CL_1,CL_2,..CL_N (--toysH scaled down when probability is far from any of CL_i) ")
("maxProbability", boost::program_options::value(&maxProbability_)->default_value(maxProbability_), "when point is > maxProbability countour, don't bother throwing toys")
("confidenceTolerance", boost::program_options::value(&confidenceToleranceForToyScaling_)->default_value(confidenceToleranceForToyScaling_), "Determine what 'far' means for adatptiveToys. (relative in terms of (1-cl))")
+ ("removeOutliersForPValues", boost::program_options::value(&removeOulierForPvalThreshold_)->default_value(removeOulierForPvalThreshold_), "When calculating p-values (pmu, 1-pb), remove outliers from the distribution of toys. Outliers are defined as those further from the mean than this threshold times the standard deviation")
("LHCmode", boost::program_options::value(&mode_)->default_value(mode_), "Shortcuts for LHC style running modes. --LHCmode LHC-significance: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for discovery) --significance, --LHCmode LHC-limits: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for upper limits) --rule CLs, --LHCmode LHC-feldman-cousins: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=PL (Q_Profile, includes boundaries) --rule Pmu")
;
@@ -1189,16 +1191,17 @@ std::pair HybridNew::eval(const RooStats::HypoTestResult &hcres,
double minimizerTolerance_ = ROOT::Math::MinimizerOptions::DefaultTolerance();
+ RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution();
+ const std::vector & bdist = bDistribution->GetSamplingDistribution();
+ const std::vector & bweight = bDistribution->GetSampleWeights();
+ const std::vector & sdist = sDistribution->GetSamplingDistribution();
+ const std::vector & sweight = sDistribution->GetSampleWeights();
+ Double_t data = hcres.GetTestStatisticData();
+ std::vector absbdist(bdist.size()), abssdist(sdist.size());
+ std::vector absbweight(bweight), abssweight(sweight);
+ Double_t absdata = data;
+
if (testStat_ == "LHCFC") {
- RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution();
- const std::vector & bdist = bDistribution->GetSamplingDistribution();
- const std::vector & bweight = bDistribution->GetSampleWeights();
- const std::vector & sdist = sDistribution->GetSamplingDistribution();
- const std::vector & sweight = sDistribution->GetSampleWeights();
- Double_t data = hcres.GetTestStatisticData();
- std::vector absbdist(bdist.size()), abssdist(sdist.size());
- std::vector absbweight(bweight), abssweight(sweight);
- Double_t absdata;
if (rule_ == "FC") {
for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = fabs(bdist[i]);
for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = fabs(sdist[i]);
@@ -1214,24 +1217,42 @@ std::pair HybridNew::eval(const RooStats::HypoTestResult &hcres,
abssweight.reserve(absbweight.size() + abssweight.size());
abssweight.insert(abssweight.end(), absbweight.begin(), absbweight.end());
}
- RooStats::HypoTestResult result;
- RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight);
- RooStats::SamplingDistribution *absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight);
- result.SetNullDistribution(absbDist);
- result.SetAltDistribution(abssDist);
- result.SetTestStatisticData(absdata);
- result.SetPValueIsRightTail(!result.GetPValueIsRightTail());
- if (CLs_) {
- return std::pair(result.CLs(), result.CLsError());
- } else {
- return std::pair(result.CLsplusb(), result.CLsplusbError());
- }
} else {
- if (CLs_) {
- return std::pair(hcres.CLs(), hcres.CLsError());
- } else {
- return std::pair(hcres.CLsplusb(), hcres.CLsplusbError());
- }
+ // In this case we could probably just set the new vectors to the old ones (copy)
+ for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = bdist[i];
+ for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = sdist[i];
+ }
+
+ removeOutliers(&abssdist,&abssweight);
+ removeOutliers(&absbdist,&absbweight);
+
+ RooStats::HypoTestResult result;
+ RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight);
+ RooStats::SamplingDistribution *absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight);
+ result.SetNullDistribution(absbDist);
+ result.SetAltDistribution(abssDist);
+ result.SetTestStatisticData(absdata);
+ if (testStat_ == "LHCFC") result.SetPValueIsRightTail(!result.GetPValueIsRightTail());
+
+ // print comparison of Results before and after cleaning //
+ CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Original: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g"
+ ,rVal
+ ,hcres.CLs(), hcres.CLsError()
+ ,hcres.CLb(), hcres.CLbError()
+ ,hcres.CLsplusb(), hcres.CLsplusbError()))
+ ,__func__);
+ CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Update: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g"
+ ,rVal
+ ,result.CLs(), result.CLsError()
+ ,result.CLb(), result.CLbError()
+ ,result.CLsplusb(), result.CLsplusbError()))
+ ,__func__);
+
+
+ if (CLs_) {
+ return std::pair(result.CLs(), result.CLsError());
+ } else {
+ return std::pair(result.CLsplusb(), result.CLsplusbError());
}
}
@@ -1763,3 +1784,34 @@ std::vector > HybridNew::findIntervalsFromSplines(TGrap
std::sort(points.begin(),points.end());
return points;
}
+
+// Remove outlier toys from a test-statistic sampling distribution (values *sd, weights *sw).
+// Entries further than removeOulierForPvalThreshold_ standard deviations from the median are
+// erased from both vectors in sync; disabled when the threshold is <= 0 (the default).
+void HybridNew::removeOutliers(std::vector *sd, std::vector *sw){
+
+    if ( removeOulierForPvalThreshold_ <= 0 || sd->empty() ) return;
+    // calculate median and stddev of distribution
+    std::vector sorted_sd(sd->size());
+    std::partial_sort_copy(std::begin(*sd), std::end(*sd), std::begin(sorted_sd), std::end(sorted_sd));
+    // middle element of the sorted copy; note (int)0.5*size() would cast 0.5 first and always index 0
+    double median = sorted_sd[sorted_sd.size() / 2];
+    double accum = 0.0;
+    std::for_each (std::begin(*sd), std::end(*sd), [&](const double d) {
+        accum += (d - median) * (d - median);
+    });
+    double stdev = sqrt(accum / (sd->size()));
+    // collect indices of entries beyond the allowed deviation from the median
+    std::vector indices;
+    for(unsigned int i = 0; i < sd->size(); i++) {
+        if(std::abs((*sd)[i] - median) > removeOulierForPvalThreshold_*stdev) {
+            indices.push_back(i);
+        }
+    }
+    // Erase from the back so the earlier indices stay valid; keep sd and sw aligned
+    for(auto it = indices.rbegin(); it != indices.rend(); ++it) {
+        sd->erase(sd->begin() + *it);
+        sw->erase(sw->begin() + *it);
+    }
+}
+