diff --git a/docs/part3/commonstatsmethods.md b/docs/part3/commonstatsmethods.md index 78946e2ac70..3eb89187787 100644 --- a/docs/part3/commonstatsmethods.md +++ b/docs/part3/commonstatsmethods.md @@ -533,6 +533,8 @@ Failed to delete temporary file roostats-Sprxsw.root: No such file or directory The result stored in the **limit** branch of the output tree will be the upper limit (and its error, stored in **limitErr**). The default behaviour will be, as above, to search for the upper limit on **r**. However, the values of $p_{\mu}, p_{b}$ and CLs can be calculated for a particular value **r=X** by specifying the option `--singlePoint=X`. In this case, the value stored in the branch **limit** will be the value of CLs (or $p_{\mu}$) (see the [FAQ](http://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/part4/usefullinks/#faq) section). +In more complicated models, some of the toy datasets may lead to poor fits which result in spurious values of the test-statistic. In general, it is advised that diagnostic tools on the model/fits provided are used to avoid these poor fits; however, if the fraction of these results is small, the option `--removeOutliersForPValues X` can be used. This option removes test-statistic values from the test-statistic distributions that are further than $X\sigma$ from the median of the distribution, where $\sigma$ is the standard deviation of the distribution and $X$ is the value passed to the option, before calculating $p$-values and limits. Setting this to some reasonably large number (e.g. 50) should remove the spurious values. By default, no removal of outliers is performed. + #### Expected Limits For simple models, we can run interactively 5 times to compute the median expected and the 68% and 95% central interval boundaries. For this, we can use the `HybridNew` method with the same options as for the observed limit, but adding a `--expectedFromGrid=`. 
Here, the quantile should be set to 0.5 for the median, 0.84 for the +ve side of the 68% band, 0.16 for the -ve side of the 68% band, 0.975 for the +ve side of the 95% band, and 0.025 for the -ve side of the 95% band. diff --git a/interface/HybridNew.h b/interface/HybridNew.h index b08b341ddec..e5bbc28cbf5 100644 --- a/interface/HybridNew.h +++ b/interface/HybridNew.h @@ -76,6 +76,7 @@ class HybridNew : public LimitAlgo { static float adaptiveToys_; static double EPS; + static float removeOulierForPvalThreshold_; // graph, used to compute the limit, not just for plotting! std::unique_ptr limitPlot_; @@ -137,7 +138,7 @@ class HybridNew : public LimitAlgo { void useGrid(); bool doFC_; - + void removeOutliers(std::vector *sd, std::vector *sw); }; #endif diff --git a/src/HybridNew.cc b/src/HybridNew.cc index 5345dc09d7d..9564f1cf0a2 100644 --- a/src/HybridNew.cc +++ b/src/HybridNew.cc @@ -98,6 +98,7 @@ float HybridNew::confidenceToleranceForToyScaling_ = 0.2; float HybridNew::maxProbability_ = 0.999; double HybridNew::EPS = 1e-4; std::string HybridNew::mode_ = ""; +float HybridNew::removeOulierForPvalThreshold_ = -1.0; HybridNew::HybridNew() : LimitAlgo("HybridNew specific options") { @@ -146,6 +147,7 @@ LimitAlgo("HybridNew specific options") { ("importantContours",boost::program_options::value(&scaleAndConfidenceSelection_)->default_value(scaleAndConfidenceSelection_), "Throw fewer toys far from interesting contours , format : CL_1,CL_2,..CL_N (--toysH scaled down when probability is far from any of CL_i) ") ("maxProbability", boost::program_options::value(&maxProbability_)->default_value(maxProbability_), "when point is > maxProbability countour, don't bother throwing toys") ("confidenceTolerance", boost::program_options::value(&confidenceToleranceForToyScaling_)->default_value(confidenceToleranceForToyScaling_), "Determine what 'far' means for adatptiveToys. 
(relative in terms of (1-cl))") + ("removeOutliersForPValues", boost::program_options::value(&removeOulierForPvalThreshold_)->default_value(removeOulierForPvalThreshold_), "When calculating p-values (pmu, 1-pb), remove outliers from the distribution of toys. Outliers are defined as those further from the median than this threshold times the standard deviation") ("LHCmode", boost::program_options::value(&mode_)->default_value(mode_), "Shortcuts for LHC style running modes. --LHCmode LHC-significance: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for discovery) --significance, --LHCmode LHC-limits: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=LHC (Q_LHC, modified for upper limits) --rule CLs, --LHCmode LHC-feldman-cousins: --generateNuisances=0 --generateExternalMeasurements=1 --fitNuisances=1 --testStat=PL (Q_Profile, includes boundaries) --rule Pmu") ; @@ -1189,16 +1191,17 @@ std::pair HybridNew::eval(const RooStats::HypoTestResult &hcres, double minimizerTolerance_ = ROOT::Math::MinimizerOptions::DefaultTolerance(); + RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution(); + const std::vector & bdist = bDistribution->GetSamplingDistribution(); + const std::vector & bweight = bDistribution->GetSampleWeights(); + const std::vector & sdist = sDistribution->GetSamplingDistribution(); + const std::vector & sweight = sDistribution->GetSampleWeights(); + Double_t data = hcres.GetTestStatisticData(); + std::vector absbdist(bdist.size()), abssdist(sdist.size()); + std::vector absbweight(bweight), abssweight(sweight); + Double_t absdata = data; + if (testStat_ == "LHCFC") { - RooStats::SamplingDistribution * bDistribution = hcres.GetNullDistribution(), * sDistribution = hcres.GetAltDistribution(); - const std::vector & bdist = bDistribution->GetSamplingDistribution(); - const std::vector & bweight = 
bDistribution->GetSampleWeights(); - const std::vector & sdist = sDistribution->GetSamplingDistribution(); - const std::vector & sweight = sDistribution->GetSampleWeights(); - Double_t data = hcres.GetTestStatisticData(); - std::vector absbdist(bdist.size()), abssdist(sdist.size()); - std::vector absbweight(bweight), abssweight(sweight); - Double_t absdata; if (rule_ == "FC") { for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = fabs(bdist[i]); for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = fabs(sdist[i]); @@ -1214,24 +1217,42 @@ std::pair HybridNew::eval(const RooStats::HypoTestResult &hcres, abssweight.reserve(absbweight.size() + abssweight.size()); abssweight.insert(abssweight.end(), absbweight.begin(), absbweight.end()); } - RooStats::HypoTestResult result; - RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight); - RooStats::SamplingDistribution *absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight); - result.SetNullDistribution(absbDist); - result.SetAltDistribution(abssDist); - result.SetTestStatisticData(absdata); - result.SetPValueIsRightTail(!result.GetPValueIsRightTail()); - if (CLs_) { - return std::pair(result.CLs(), result.CLsError()); - } else { - return std::pair(result.CLsplusb(), result.CLsplusbError()); - } } else { - if (CLs_) { - return std::pair(hcres.CLs(), hcres.CLsError()); - } else { - return std::pair(hcres.CLsplusb(), hcres.CLsplusbError()); - } + // In this case we could probably just set the new vectors to the old ones (copy) + for (int i = 0, n = absbdist.size(); i < n; ++i) absbdist[i] = bdist[i]; + for (int i = 0, n = abssdist.size(); i < n; ++i) abssdist[i] = sdist[i]; + } + + removeOutliers(&abssdist,&abssweight); + removeOutliers(&absbdist,&absbweight); + + RooStats::HypoTestResult result; + RooStats::SamplingDistribution *abssDist = new RooStats::SamplingDistribution("s","s",abssdist,abssweight); + RooStats::SamplingDistribution 
*absbDist = new RooStats::SamplingDistribution("b","b",absbdist,absbweight); + result.SetNullDistribution(absbDist); + result.SetAltDistribution(abssDist); + result.SetTestStatisticData(absdata); + if (testStat_ == "LHCFC") result.SetPValueIsRightTail(!result.GetPValueIsRightTail()); + + // print comparison of Results before and after cleaning // + CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Original: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g" + ,rVal + ,hcres.CLs(), hcres.CLsError() + ,hcres.CLb(), hcres.CLbError() + ,hcres.CLsplusb(), hcres.CLsplusbError())) + ,__func__); + CombineLogger::instance().log("HybridNew.cc",__LINE__,std::string(Form("Update: r = %g, CLs = %g +/- %g\n\t1-Pb = %g +/- %g\n\tPmu = %g +/- %g" + ,rVal + ,result.CLs(), result.CLsError() + ,result.CLb(), result.CLbError() + ,result.CLsplusb(), result.CLsplusbError())) + ,__func__); + + + if (CLs_) { + return std::pair(result.CLs(), result.CLsError()); + } else { + return std::pair(result.CLsplusb(), result.CLsplusbError()); } } @@ -1763,3 +1784,34 @@ std::vector > HybridNew::findIntervalsFromSplines(TGrap std::sort(points.begin(),points.end()); return points; } + +void HybridNew::removeOutliers(std::vector *sd, std::vector *sw){ + + if ( removeOulierForPvalThreshold_ <= 0 ) return; + // calculate median and stddev of distribution + std::vector sorted_sd(sd->size()); + std::partial_sort_copy(std::begin(*sd), std::end(*sd), std::begin(sorted_sd), std::end(sorted_sd)); + + double median = sorted_sd[sorted_sd.size()/2]; + + double accum = 0.0; + std::for_each (std::begin(*sd), std::end(*sd), [&](const double d) { + accum += (d - median) * (d - median); + }); + double stdev = sqrt(accum / (sd->size())); + + // filter outliers based on threshold + std::vector indices; + for(unsigned int i = 0; i < sd->size(); i++) { + if(std::abs((*sd)[i] - median) > removeOulierForPvalThreshold_*stdev) { + indices.push_back(i); + } + } + + // Remove elements 
from sd and sw + for(auto it = indices.rbegin(); it != indices.rend(); ++it) { + sd->erase(sd->begin() + *it); + sw->erase(sw->begin() + *it); + } +} +