Merge pull request #16569 from prometheus/beorn7/promql2
promql: Simplify avg aggregation and avg_over_time
Commit 8fc1750bcc
3 changed files with 113 additions and 87 deletions
promql/engine.go
@@ -2998,7 +2998,6 @@ type groupedAggregation struct {
    hasHistogram           bool // Has at least 1 histogram sample aggregated.
    incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
    groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
    incrementalMean        bool // True after reverting to incremental calculation of the mean value.
}

// aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3097,6 +3096,21 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
        }

    case parser.AVG:
        // For the average calculation, we use incremental mean
        // calculation. In particular in combination with Kahan
        // summation (which we do for floats, but not yet for
        // histograms, see issue #14105), this is quite accurate
        // and only breaks in extreme cases (see testdata for
        // avg_over_time). One might assume that simple direct
        // mean calculation works better in some cases, but so
        // far, our conclusion is that we fare best with the
        // incremental approach plus Kahan summation (for
        // floats). For a relevant discussion, see
        // https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
        // Additional note: For even better numerical accuracy,
        // we would need to process the values in a particular
        // order, but that would be very hard to implement given
        // how the PromQL engine works.
        group.groupCount++
        if h != nil {
            group.hasHistogram = true
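Both this hunk and the ones below lean on a kahanSumInc helper that does not itself appear in the diff. As a rough, self-contained sketch of the idea (Neumaier-style compensated summation; the engine's actual helper may differ in detail, e.g. in how it treats ±Inf):

package main

import (
    "fmt"
    "math"
)

// kahanSumInc sketches one compensated-summation step: it adds inc to sum
// while keeping the rounding error of each addition in the running
// compensation term c. Not the engine's actual implementation.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
    t := sum + inc
    if math.Abs(sum) >= math.Abs(inc) {
        c += (sum - t) + inc // low-order bits of inc were lost in t
    } else {
        c += (inc - t) + sum // low-order bits of sum were lost in t
    }
    return t, c
}

func main() {
    // Adding 0.1 a thousand times to 1e16: plain float64 addition drops every
    // single increment (0.1 is below half an ulp of 1e16), while sum+c keeps them.
    plain, sum, c := 1e16, 1e16, 0.0
    for i := 0; i < 1000; i++ {
        plain += 0.1
        sum, c = kahanSumInc(0.1, sum, c)
    }
    fmt.Println(plain-1e16, sum+c-1e16) // 0 vs. roughly 100
}

Keeping the compensation term separate and only folding it back in at the end (sum+c here, floatMean+floatKahanC in the engine code) is what preserves contributions that a plain float64 addition would silently discard.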
@@ -3121,45 +3135,11 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
            // point in copying the histogram in that case.
        } else {
            group.hasFloat = true
            if !group.incrementalMean {
                newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
                if !math.IsInf(newV, 0) {
                    // The sum doesn't overflow, so we propagate it to the
                    // group struct and continue with the regular
                    // calculation of the mean value.
                    group.floatValue, group.floatKahanC = newV, newC
                    break
                }
                // If we are here, we know that the sum _would_ overflow. So
                // instead of continue to sum up, we revert to incremental
                // calculation of the mean value from here on.
                group.incrementalMean = true
                group.floatMean = group.floatValue / (group.groupCount - 1)
                group.floatKahanC /= group.groupCount - 1
            }
            if math.IsInf(group.floatMean, 0) {
                if math.IsInf(f, 0) && (group.floatMean > 0) == (f > 0) {
                    // The `floatMean` and `s.F` values are `Inf` of the same sign. They
                    // can't be subtracted, but the value of `floatMean` is correct
                    // already.
                    break
                }
                if !math.IsInf(f, 0) && !math.IsNaN(f) {
                    // At this stage, the mean is an infinite. If the added
                    // value is neither an Inf or a Nan, we can keep that mean
                    // value.
                    // This is required because our calculation below removes
                    // the mean value, which would look like Inf += x - Inf and
                    // end up as a NaN.
                    break
                }
            }
            currentMean := group.floatMean + group.floatKahanC
            q := (group.groupCount - 1) / group.groupCount
            group.floatMean, group.floatKahanC = kahanSumInc(
                // Divide each side of the `-` by `group.groupCount` to avoid float64 overflows.
                f/group.groupCount-currentMean/group.groupCount,
                group.floatMean,
                group.floatKahanC,
                f/group.groupCount,
                q*group.floatMean,
                q*group.floatKahanC,
            )
        }
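To unpack the new three-argument kahanSumInc(f/group.groupCount, q*group.floatMean, q*group.floatKahanC) call above: with q = (groupCount-1)/groupCount it computes mean_new = q*mean_old + f/count (the compensation term is scaled by q as well), which is the classic incremental mean, except that every operand is already divided by the count, so no intermediate value can overflow. A minimal standalone sketch of that contrast (illustrative function names, not the engine's code), shown without the Kahan compensation to keep the overflow point isolated:

package main

import (
    "fmt"
    "math"
)

// incrementalMean folds values into a running mean without ever forming the
// full sum, so it cannot overflow even where the plain sum would.
func incrementalMean(values []float64) float64 {
    var mean float64
    for i, f := range values {
        count := float64(i + 1)
        q := float64(i) / count // (count-1)/count
        mean = q*mean + f/count
    }
    return mean
}

// naiveMean forms the full sum first and divides at the end.
func naiveMean(values []float64) float64 {
    var sum float64
    for _, f := range values {
        sum += f
    }
    return sum / float64(len(values))
}

func main() {
    // Two values close to math.MaxFloat64: their sum overflows to +Inf,
    // but their mean is perfectly representable.
    values := []float64{0.9 * math.MaxFloat64, 0.9 * math.MaxFloat64}
    fmt.Println(naiveMean(values))       // +Inf
    fmt.Println(incrementalMean(values)) // ~1.6179e+308
}

For the two inputs in main, the plain sum already exceeds math.MaxFloat64 and collapses to +Inf, while the incremental form never leaves the representable range.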
@@ -3232,10 +3212,8 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
            continue
        case aggr.hasHistogram:
            aggr.histogramValue = aggr.histogramValue.Compact(0)
        case aggr.incrementalMean:
            aggr.floatValue = aggr.floatMean + aggr.floatKahanC
        default:
            aggr.floatValue = (aggr.floatValue + aggr.floatKahanC) / aggr.groupCount
            aggr.floatValue = aggr.floatMean + aggr.floatKahanC
        }

    case parser.COUNT:
promql/functions.go
@@ -671,6 +671,20 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
        metricName := firstSeries.Metric.Get(labels.MetricName)
        return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
    }
    // For the average calculation, we use incremental mean calculation. In
    // particular in combination with Kahan summation (which we do for
    // floats, but not yet for histograms, see issue #14105), this is quite
    // accurate and only breaks in extreme cases (see testdata). One might
    // assume that simple direct mean calculation works better in some
    // cases, but so far, our conclusion is that we fare best with the
    // incremental approach plus Kahan summation (for floats). For a
    // relevant discussion, see
    // https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
    // Additional note: For even better numerical accuracy, we would need to
    // process the values in a particular order. For avg_over_time, that
    // would be more or less feasible, but it would be more expensive, and
    // it would also be much harder for the avg aggregator, given how the
    // PromQL engine works.
    if len(firstSeries.Floats) == 0 {
        // The passed values only contain histograms.
        vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
@@ -702,52 +716,13 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
        return vec, nil
    }
    return aggrOverTime(vals, enh, func(s Series) float64 {
        var (
            sum, mean, count, kahanC float64
            incrementalMean          bool
        )
        for _, f := range s.Floats {
            count++
            if !incrementalMean {
                newSum, newC := kahanSumInc(f.F, sum, kahanC)
                // Perform regular mean calculation as long as
                // the sum doesn't overflow and (in any case)
                // for the first iteration (even if we start
                // with ±Inf) to not run into division-by-zero
                // problems below.
                if count == 1 || !math.IsInf(newSum, 0) {
                    sum, kahanC = newSum, newC
                    continue
        var mean, kahanC float64
        for i, f := range s.Floats {
            count := float64(i + 1)
            q := float64(i) / count
            mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
        }
                // Handle overflow by reverting to incremental calculation of the mean value.
                incrementalMean = true
                mean = sum / (count - 1)
                kahanC /= count - 1
            }
            if math.IsInf(mean, 0) {
                if math.IsInf(f.F, 0) && (mean > 0) == (f.F > 0) {
                    // The `mean` and `f.F` values are `Inf` of the same sign. They
                    // can't be subtracted, but the value of `mean` is correct
                    // already.
                    continue
                }
                if !math.IsInf(f.F, 0) && !math.IsNaN(f.F) {
                    // At this stage, the mean is an infinite. If the added
                    // value is neither an Inf or a Nan, we can keep that mean
                    // value.
                    // This is required because our calculation below removes
                    // the mean value, which would look like Inf += x - Inf and
                    // end up as a NaN.
                    continue
                }
            }
            correctedMean := mean + kahanC
            mean, kahanC = kahanSumInc(f.F/count-correctedMean/count, mean, kahanC)
        }
        if incrementalMean {
            return mean + kahanC
        }
        return (sum + kahanC) / count
    }), nil
}
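The new per-series float path above is compact enough to exercise on its own. The following standalone sketch (a re-implementation for illustration, with an assumed Neumaier-style kahanSumInc; not the engine code itself) applies the same loop to the metric2 samples from the testdata hunk below, whose expected result is 1.67186744582113:

package main

import (
    "fmt"
    "math"
)

// kahanSumInc mirrors the compensated-summation helper referenced above
// (sketch only; the real helper lives in the promql package).
func kahanSumInc(inc, sum, c float64) (float64, float64) {
    t := sum + inc
    if math.Abs(sum) >= math.Abs(inc) {
        c += (sum - t) + inc
    } else {
        c += (inc - t) + sum
    }
    return t, c
}

// avgOverTime applies the new incremental-mean loop to a plain slice of floats.
func avgOverTime(floats []float64) float64 {
    var mean, kahanC float64
    for i, f := range floats {
        count := float64(i + 1)
        q := float64(i) / count
        mean, kahanC = kahanSumInc(f/count, q*mean, q*kahanC)
    }
    return mean + kahanC
}

func main() {
    // metric2 from the testdata hunk below; the test expects 1.67186744582113.
    metric2 := []float64{
        5.30921651659898, 0.961118537914768, 1.62091361305318,
        0.865089463758091, 0.323055185914577, 0.951811357687154,
    }
    fmt.Println(avgOverTime(metric2)) // ~1.671867445821, within float64 precision of the expected value
}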
promql/promqltest/testdata/functions.test (vendored, 73 lines changed)
@@ -1013,6 +1013,79 @@ eval instant at 1m sum_over_time(metric[2m])
eval instant at 1m avg_over_time(metric[2m])
    {} 0.5

# More tests for extreme values.
clear
# All positive values with varying magnitudes.
load 5s
    metric1 1e10 1e-6 1e-6 1e-6 1e-6 1e-6
    metric2 5.30921651659898 0.961118537914768 1.62091361305318 0.865089463758091 0.323055185914577 0.951811357687154
    metric3 1.78264e50 0.76342771 1.9592258 7.69805412 458.90154
    metric4 0.76342771 1.9592258 7.69805412 1.78264e50 458.90154
    metric5 1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 458.90154

eval instant at 55s avg_over_time(metric1[1m])
    {} 1.6666666666666675e+09

eval instant at 55s avg_over_time(metric2[1m])
    {} 1.67186744582113

eval instant at 55s avg_over_time(metric3[1m])
    {} 3.56528E+49

eval instant at 55s avg_over_time(metric4[1m])
    {} 3.56528E+49

eval instant at 55s avg_over_time(metric5[1m])
    {} 3.76333333333333E+219

# Contains negative values; result is dominated by a very large value.
load 5s
    metric6 -1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
    metric7 -1.78264E+50 0.76342771 1.9592258 -2.258E+220 7.69805412 -458.90154
    metric8 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
    metric9 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 1.78264E+215 -458.90154
    metric10 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154

eval instant at 55s avg_over_time(metric6[1m])
    {} 3.76333333333333E+219

eval instant at 55s avg_over_time(metric7[1m])
    {} -3.76333333333333E+219

eval instant at 55s avg_over_time(metric8[1m])
    {} 3.76330362266667E+219

eval instant at 55s avg_over_time(metric9[1m])
    {} 3.225714285714286e+219

# Interestingly, before PR #16569, this test failed with a result of
# 3.225714285714286e+219, so the incremental calculation combined with
# Kahan summation (in the improved way as introduced by PR #16569)
# seems to be more accurate than the simple mean calculation with
# Kahan summation.
eval instant at 55s avg_over_time(metric10[1m])
    {} 2.5088888888888888e+219

# Large values of different magnitude, combined with small values. The
# large values, however, all cancel each other exactly, so that the actual
# average here is determined by only the small values. Therefore, the correct
# outcome is -44.848083237000004.
load 5s
    metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154

# Thus, the result here is very far off! With the different orders of
# magnitudes involved, even Kahan summation cannot cope.
# Interestingly, with the simple mean calculation (still including
# Kahan summation) prior to PR #16569, the result is 0. That's
# arguably more accurate, but it still shows that Kahan summation
# doesn't work well in this situation. To solve this properly, we
# would need to do something like sorting the values (which is hard given
# how the PromQL engine works). The question is how practically
# relevant this scenario is.
eval instant at 55s avg_over_time(metric11[1m])
    {} -1.881783551706252e+203
    # {} -44.848083237000004 <- This is the correct value.
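The comment above notes that fixing this properly would take "something like sorting the values". As a rough standalone illustration of that idea (not anything the engine does), summing the same metric11 samples in ascending order of magnitude, still with Neumaier-style compensation, does recover the correct mean:

package main

import (
    "fmt"
    "math"
    "sort"
)

// kahanSumInc: Neumaier-style compensated addition (sketch of the helper
// used by the engine; not the engine's actual code).
func kahanSumInc(inc, sum, c float64) (float64, float64) {
    t := sum + inc
    if math.Abs(sum) >= math.Abs(inc) {
        c += (sum - t) + inc
    } else {
        c += (inc - t) + sum
    }
    return t, c
}

func main() {
    // metric11 from the testdata above; the huge values cancel in pairs,
    // so the true mean is -44.848083237000004.
    metric11 := []float64{
        -2.258e220, -1.78264e219, 0.76342771, 1.9592258,
        2.3757689e217, -2.3757689e217, 2.258e220, 7.69805412,
        1.78264e219, -458.90154,
    }

    // Sum in ascending order of magnitude so the small samples are absorbed
    // into the compensation term before the huge, exactly cancelling pairs
    // ever dominate the running sum.
    sorted := append([]float64(nil), metric11...)
    sort.Slice(sorted, func(i, j int) bool {
        return math.Abs(sorted[i]) < math.Abs(sorted[j])
    })
    var sum, c float64
    for _, f := range sorted {
        sum, c = kahanSumInc(f, sum, c)
    }
    fmt.Println((sum + c) / float64(len(sorted))) // ~ -44.848083237
}

In that order, the small samples end up preserved in the compensation term while the huge pairs cancel exactly, which is precisely the ordering effect the test comment alludes to.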
# Test per-series aggregation on dense samples.
clear
load 1ms