Mirror of https://github.com/prometheus/prometheus.git, synced 2025-07-02 10:41:14 +00:00
Enhancement: Reload all scrape pools concurrently (#16595)
* Reload all scrape pools concurrently

At the moment all scrape pools that need to be reloaded are reloaded one by one. While reloads are ongoing, mtxScrape is locked. For each pool being reloaded we need to wait until all of its targets are updated. This whole process can take a while, and the more scrape pools there are to reload, the longer it takes. At the same time all pools are independent, so there is no real reason to reload them one by one.

Reload each pool in a separate goroutine so we finish the config reload as quickly as possible and unlock mtxScrape.

Signed-off-by: Lukasz Mierzwa <l.mierzwa@gmail.com>

* Address PR review feedback

Signed-off-by: Lukasz Mierzwa <l.mierzwa@gmail.com>

---------

Signed-off-by: Lukasz Mierzwa <l.mierzwa@gmail.com>
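The pattern the commit message describes, fanning each pool reload out into its own goroutine, waiting on a sync.WaitGroup, and recording failures in an atomic flag, can be sketched in isolation. The following is a minimal illustrative sketch, not the actual Prometheus code (that is the diff below): the Pool type and its reload method are hypothetical stand-ins, and it uses the standard library's sync/atomic.Bool where the real change imports go.uber.org/atomic.

```go
// Minimal sketch of reloading independent pools concurrently instead of
// one by one. Pool and reload are hypothetical stand-ins.
package main

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
)

type Pool struct{ name string }

// reload pretends to apply a new configuration to the pool.
func (p *Pool) reload() error {
	if p.name == "bad" {
		return errors.New("reload failed")
	}
	return nil
}

func reloadAll(pools map[string]*Pool) error {
	var (
		failed atomic.Bool
		wg     sync.WaitGroup
	)
	for name, p := range pools {
		wg.Add(1)
		// Each pool is independent, so reload it in its own goroutine
		// rather than serially holding up the whole config reload.
		go func(name string, p *Pool) {
			defer wg.Done()
			if err := p.reload(); err != nil {
				fmt.Printf("error reloading pool %q: %v\n", name, err)
				failed.Store(true)
			}
		}(name, p)
	}
	wg.Wait()

	if failed.Load() {
		return errors.New("failed to apply the new configuration")
	}
	return nil
}

func main() {
	pools := map[string]*Pool{"a": {name: "a"}, "b": {name: "b"}}
	fmt.Println(reloadAll(pools)) // <nil>
}
```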
This commit is contained in:
parent 8fc1750bcc
commit c528293376

1 changed file with 38 additions and 20 deletions
@@ -26,6 +26,7 @@ import (
 	config_util "github.com/prometheus/common/config"
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/common/promslog"
+	"go.uber.org/atomic"
 
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/discovery/targetgroup"
@@ -287,29 +288,46 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
 	}
 
 	// Cleanup and reload pool if the configuration has changed.
-	var failed bool
-	for name, sp := range m.scrapePools {
-		switch cfg, ok := m.scrapeConfigs[name]; {
-		case !ok:
-			sp.stop()
-			delete(m.scrapePools, name)
-		case !reflect.DeepEqual(sp.config, cfg):
-			err := sp.reload(cfg)
-			if err != nil {
-				m.logger.Error("error reloading scrape pool", "err", err, "scrape_pool", name)
-				failed = true
+	var (
+		failed   atomic.Bool
+		wg       sync.WaitGroup
+		toDelete sync.Map // Stores the list of names of pools to delete.
+	)
+	for poolName, pool := range m.scrapePools {
+		wg.Add(1)
+		cfg, ok := m.scrapeConfigs[poolName]
+		// Reload each scrape pool in a dedicated goroutine so we don't have to wait a long time
+		// if we have a lot of scrape pools to update.
+		go func(name string, sp *scrapePool, cfg *config.ScrapeConfig, ok bool) {
+			defer wg.Done()
+			switch {
+			case !ok:
+				sp.stop()
+				toDelete.Store(name, struct{}{})
+			case !reflect.DeepEqual(sp.config, cfg):
+				err := sp.reload(cfg)
+				if err != nil {
+					m.logger.Error("error reloading scrape pool", "err", err, "scrape_pool", name)
+					failed.Store(true)
+				}
+				fallthrough
+			case ok:
+				if l, ok := m.scrapeFailureLoggers[cfg.ScrapeFailureLogFile]; ok {
+					sp.SetScrapeFailureLogger(l)
+				} else {
+					sp.logger.Error("No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", name)
+				}
 			}
-			fallthrough
-		case ok:
-			if l, ok := m.scrapeFailureLoggers[cfg.ScrapeFailureLogFile]; ok {
-				sp.SetScrapeFailureLogger(l)
-			} else {
-				sp.logger.Error("No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", name)
-			}
-		}
+		}(poolName, pool, cfg, ok)
 	}
+	wg.Wait()
+
+	toDelete.Range(func(name, _ any) bool {
+		delete(m.scrapePools, name.(string))
+		return true
+	})
 
-	if failed {
+	if failed.Load() {
 		return errors.New("failed to apply the new configuration")
 	}
 	return nil
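One design choice in the new code is worth calling out: the goroutines never delete from m.scrapePools directly, because writing to a plain Go map from multiple goroutines (while the outer loop is still ranging over it) is a data race. Instead each goroutine records the name of a stopped pool in a sync.Map, and the deletions are applied on the main goroutine after wg.Wait(). The following is a small stand-alone sketch of that collect-then-delete pattern, using a plain map of hypothetical pool names rather than Prometheus types.

```go
// Sketch of the collect-then-delete pattern: goroutines must not write to a
// plain map concurrently, so names are collected in a sync.Map and removed
// from the real map only after every goroutine has finished.
package main

import (
	"fmt"
	"sync"
)

func main() {
	pools := map[string]string{"keep": "cfg1", "drop-a": "", "drop-b": ""}

	var (
		wg       sync.WaitGroup
		toDelete sync.Map // names of pools to remove, written concurrently
	)
	for name, cfg := range pools {
		wg.Add(1)
		go func(name, cfg string) {
			defer wg.Done()
			if cfg == "" { // pretend this pool no longer has a config
				toDelete.Store(name, struct{}{})
			}
		}(name, cfg)
	}
	wg.Wait()

	// Only the main goroutine touches the plain map, after all workers are done.
	toDelete.Range(func(name, _ any) bool {
		delete(pools, name.(string))
		return true
	})
	fmt.Println(pools) // map[keep:cfg1]
}
```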