Merge pull request #227 from prometheus/fix_with_labels

dmagliola · web-flow · commit 903124ca9316 · 2022-01-09T18:43:36.000Z
Fix `with_labels` bug reported in issue #225
diff --git a/lib/prometheus/client/histogram.rb b/lib/prometheus/client/histogram.rb
@@ -42,12 +42,18 @@ def self.exponential_buckets(start:, factor: 2, count:)
       end
 
       def with_labels(labels)
-        self.class.new(name,
-                       docstring: docstring,
-                       labels: @labels,
-                       preset_labels: preset_labels.merge(labels),
-                       buckets: @buckets,
-                       store_settings: @store_settings)
+        new_metric = self.class.new(name,
+                                    docstring: docstring,
+                                    labels: @labels,
+                                    preset_labels: preset_labels.merge(labels),
+                                    buckets: @buckets,
+                                    store_settings: @store_settings)
+
+        # The new metric needs to use the same store as the "main" declared one, otherwise
+        # any observations on that copy with the pre-set labels won't actually be exported.
+        new_metric.replace_internal_store(@store)
+
+        new_metric
       end
 
       def type
diff --git a/lib/prometheus/client/metric.rb b/lib/prometheus/client/metric.rb
@@ -41,21 +41,34 @@ def initialize(name,
           metric_settings: store_settings
         )
 
+        # WARNING: Our internal store can be replaced later by `with_labels`
+        # Everything we do after this point needs to still work if @store gets replaced
         init_label_set({}) if labels.empty?
       end
 
+      protected def replace_internal_store(new_store)
+        @store = new_store
+      end
+
+
       # Returns the value for the given label set
       def get(labels: {})
         label_set = label_set_for(labels)
         @store.get(labels: label_set)
       end
 
       def with_labels(labels)
-        self.class.new(name,
-                       docstring: docstring,
-                       labels: @labels,
-                       preset_labels: preset_labels.merge(labels),
-                       store_settings: @store_settings)
+        new_metric = self.class.new(name,
+                                     docstring: docstring,
+                                     labels: @labels,
+                                     preset_labels: preset_labels.merge(labels),
+                                     store_settings: @store_settings)
+
+        # The new metric needs to use the same store as the "main" declared one, otherwise
+        # any observations on that copy with the pre-set labels won't actually be exported.
+        new_metric.replace_internal_store(@store)
+
+        new_metric
       end
 
       def init_label_set(labels)
diff --git a/spec/prometheus/client/counter_spec.rb b/spec/prometheus/client/counter_spec.rb
@@ -3,6 +3,7 @@
 require 'prometheus/client'
 require 'prometheus/client/counter'
 require 'examples/metric_example'
+require 'prometheus/client/data_stores/direct_file_store'
 
 describe Prometheus::Client::Counter do
   # Reset the data store
@@ -45,12 +46,6 @@
           end.to change { counter.get(labels: { test: 'label' }) }.by(1.0)
         end.to_not change { counter.get(labels: { test: 'other' }) }
       end
-
-      it 'can pre-set labels using `with_labels`' do
-        expect { counter.increment }
-          .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
-        expect { counter.with_labels(test: 'label').increment }.not_to raise_error
-      end
     end
 
     it 'increments the counter by a given value' do
@@ -122,4 +117,149 @@
       end
     end
   end
+
+  describe '#with_labels' do
+    let(:expected_labels) { [:foo] }
+
+    it 'pre-sets labels for observations' do
+      expect { counter.increment }
+        .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
+      expect { counter.with_labels(foo: 'label').increment }.not_to raise_error
+    end
+
+    it 'registers `with_labels` observations in the original metric store' do
+      counter.increment(labels: { foo: 'value1'})
+      counter_with_labels = counter.with_labels({ foo: 'value2'})
+      counter_with_labels.increment(by: 2)
+
+      expect(counter_with_labels.values).to eql({foo: 'value1'} => 1.0, {foo: 'value2'} => 2.0)
+      expect(counter.values).to eql({foo: 'value1'} => 1.0, {foo: 'value2'} => 2.0)
+    end
+
+    context 'when using DirectFileStore' do
+      before do
+        Dir.glob('/tmp/prometheus_test/*').each { |file| File.delete(file) }
+        Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: '/tmp/prometheus_test')
+      end
+
+      let(:expected_labels) { [:foo, :bar] }
+
+      # Testing for file corruption: this is weird and complicated, so it needs explaining
+      #
+      # Files get corrupted when we have two different instances of `FileMappedDict`
+      # reading and writing the same file. This corruption is expected; we should never have
+      # two instances of `FileMappedDict` for the same file. If we do, it's a bug in our client.
+      #
+      # To clarify, the bug is that *we ended up with two instances for the same file*, not
+      # that the instances are now corrupting the file.
+      #
+      # This is why we're testing this in `with_labels`. It's the only use case we've found
+      # where we ended up with two instances (before we fixed that bug). `with_labels` is
+      # incidental; if we find another way to get "duplicate" instances, we should add this
+      # same exact test, except for the first line, where we need to instead reproduce
+      # whatever bug gets us that second instance.
+      #
+      # The first thing we need to understand is why having two instances of `FileMappedDict`
+      # corrupts the files:
+      #
+      # `FileMappedDict` keeps track, in an internal variable, of how many bytes in the file
+      # have been used. When adding a new "entry" (observing a new labelset), it serializes
+      # it and adds it at "the end" (according to its internal byte counter), and it also updates
+      # the counter at the beginning of the file. However, it never re-reads that counter
+      # from the file, because there shouldn't be any reason for it to have changed.
+      #
+      # If there are two instances pointing to the same file, initially they will both
+      # share that internal counter, as they do the first read of the file, but if then
+      # each of them adds an entry, their internal "length" counters will disagree, and
+      # they'll start overwriting each other's entries.
+      #
+      # Importantly, if all of the entries happen to have the same length, it will be "fine".
+      # Some of the labelsets will effectively disappear, but there will be no corruption,
+      # because all the important things will fall in the right offsets by pure chance. This
+      # would be very rare in production, but in a test, it's what normally happens because
+      # we set all labels to "foo", "bar", etc. This is the reason for "longervalue" below:
+      # we need to have different labelset lengths to reproduce the corruption.
+      #
+      # With this background about the internals, we can now get to why the specific sequence of
+      # steps below ends up in corrupted files.
+      #
+      # For this to make sense, I'll need to describe the contents of the file at each step.
+      # I'll represent it like this: `27|labelset1,value1|labelset2,value2|labelset3,value3|`
+      #
+      # These are not the bytes we store in the file, but conceptually it's equivalent,
+      # with two caveats:
+      # - The counter at the beginning (27 == 3 * 9) here shows the combined length of labelsets.
+      #   It'd normally also include the length of values, but doing that makes this explanation
+      #   much harder to follow.
+      # - Each entry also starts with a 4-byte int specifying the length of its labelset, so
+      #   we know how much to read. Again, I'm omitting that for readability.
+      #
+      #
+      # Steps to reproduce:
+      # - We declare `counter` and `counter_with_labels` as a clone. Neither has read the file.
+      # - We increment `counter`, which creates the file and adds the entry ("labelset1")
+      #     - File: `9|labelset1,value1|`
+      # - We increment `counter_with_labels`, which reads the file, and adds the new entry
+      #   to it ("muchlongerlabelset2").
+      #     - File: `28|labelset1,value1|muchlongerlabelset2, value2|`
+      #     - `counter` and `counter_with_labels` now disagree about the length of this file
+      #       (`counter` doesn't know the file has grown).
+      # - We now add a new entry to `counter` ("labelset3"), which thinks the file is shorter
+      #   than it actually is.
+      #     - File: `18|labelset1,value1|labelset3,value3|et2, value2|`
+      #     - The initial counter reflects both labelsets for `counter`; then we have those
+      #       labelsets; and finally some "garbage" after the "end" (the garbage is the
+      #       last few bytes of the much longer entry added before by `counter_with_labels`)
+      #     - so far, though, we're still good. If you read the file, all entries are "fine",
+      #       because you're only reading up to the "18" length specified at the beginning.
+      #     - for the problem to manifest itself, we need to increment that counter at the
+      #       beginning, so we'll read the garbage. **BUT**, if we add a new labelset to
+      #       `counter`, it'll overwrite the "garbage" with good data, and the file will
+      #       continue to be fine.
+      # - We add a new entry to `counter_with_labels`. This updates the length counter at
+      #   the beginning of the file.
+      #     - File: `47|labelset1,value1|labelset3,value3|et2, value2|muchlongerlabelset4, value4|`
+      #
+      # - Now the file is properly corrupted. When reading it, `FileMappedDict` sees:
+      #    - labelset1,value1 (cool)
+      #    - labelset3,value3 (cool)
+      #    - et2, value2 (boom)
+      #      |-> the beginning of this entry is garbage because we're actually in the middle
+      #          of an entry, not at the beginning of one.
+      #
+      # What actually breaks is that each of these entries is expected to have, at their
+      # beginning, the length in bytes of its labelset, so we know how much to read.
+      # Now we have garbage in that position, and `FileMappedDict` will either:
+      #   - Try to interpret those four bytes as a long, get an invalid result.
+      #   - Try to read an invalid amount of data (maybe a negative amount).
+      #   - After reading the labelset, try to read the float and go past the end of the file
+      #   - Actually read what it thinks is a float, try to `unpack` it, and fail because
+      #       it's actually garbage.
+      #   - I'm sure there are other fun ways for it to fail.
+      it "doesn't corrupt the data files" do
+        counter_with_labels = counter.with_labels({ foo: 'longervalue'})
+
+        # Initialize / read the files for both views of the metric
+        counter.increment(labels: { foo: 'value1', bar: 'zzz'})
+        counter_with_labels.increment(by: 2, labels: {bar: 'zzz'})
+
+        # After both MetricStores have their files, add a new entry to both
+        counter.increment(labels: { foo: 'value1', bar: 'aaa'}) # If there's a bug, we partially overwrite { foo: 'longervalue', bar: 'zzz'}
+        counter_with_labels.increment(by: 2, labels: {bar: 'aaa'}) # Extend the file so we read past that overwrite
+
+        expect { counter.values }.not_to raise_error # Check it hasn't corrupted our files
+        expect { counter_with_labels.values }.not_to raise_error # Check it hasn't corrupted our files
+
+        expected_values = {
+          {foo: 'value1', bar: 'zzz'} => 1.0,
+          {foo: 'value1', bar: 'aaa'} => 1.0,
+          {foo: 'longervalue', bar: 'zzz'} => 2.0,
+          {foo: 'longervalue', bar: 'aaa'} => 2.0,
+        }
+
+        expect(counter.values).to eql(expected_values)
+        expect(counter_with_labels.values).to eql(expected_values)
+      end
+    end
+  end
 end
diff --git a/spec/prometheus/client/gauge_spec.rb b/spec/prometheus/client/gauge_spec.rb
@@ -45,12 +45,6 @@
           end.to change { gauge.get(labels: { test: 'value' }) }.from(0).to(42)
         end.to_not change { gauge.get(labels: { test: 'other' }) }
       end
-
-      it 'can pre-set labels using `with_labels`' do
-        expect { gauge.set(10) }
-          .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
-        expect { gauge.with_labels(test: 'value').set(10) }.not_to raise_error
-      end
     end
 
     context 'given an invalid value' do
@@ -204,4 +198,23 @@
       end
     end
   end
+
+  describe '#with_labels' do
+    let(:expected_labels) { [:foo] }
+
+    it 'pre-sets labels for observations' do
+      expect { gauge.set(10) }
+        .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
+      expect { gauge.with_labels(foo: 'value').set(10) }.not_to raise_error
+    end
+
+    it 'registers `with_labels` observations in the original metric store' do
+      gauge.set(1, labels: { foo: 'value1'})
+      gauge_with_labels = gauge.with_labels({ foo: 'value2'})
+      gauge_with_labels.set(2)
+
+      expect(gauge_with_labels.values).to eql({foo: 'value1'} => 1.0, {foo: 'value2'} => 2.0)
+      expect(gauge.values).to eql({foo: 'value1'} => 1.0, {foo: 'value2'} => 2.0)
+    end
+  end
 end
diff --git a/spec/prometheus/client/histogram_spec.rb b/spec/prometheus/client/histogram_spec.rb
@@ -80,12 +80,6 @@
           end.to change { histogram.get(labels: { test: 'value' }) }
         end.to_not change { histogram.get(labels: { test: 'other' }) }
       end
-
-      it 'can pre-set labels using `with_labels`' do
-        expect { histogram.observe(2) }
-          .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
-        expect { histogram.with_labels(test: 'value').observe(2) }.not_to raise_error
-      end
     end
 
     context "with non-string label values" do
@@ -189,4 +183,27 @@
       end
     end
   end
+
+  describe '#with_labels' do
+    let(:expected_labels) { [:foo] }
+
+    it 'pre-sets labels for observations' do
+      expect { histogram.observe(2) }
+        .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
+      expect { histogram.with_labels(foo: 'value').observe(2) }.not_to raise_error
+    end
+
+    it 'registers `with_labels` observations in the original metric store' do
+      histogram.observe(7, labels: { foo: 'value1'})
+      histogram_with_labels = histogram.with_labels({ foo: 'value2'})
+      histogram_with_labels.observe(20)
+
+      expected_values = {
+        {foo: 'value1'} => {'2.5' => 0.0, '5' => 0.0, '10' => 1.0, '+Inf' => 1.0, 'sum' => 7.0},
+        {foo: 'value2'} => {'2.5' => 0.0, '5' => 0.0, '10' => 0.0, '+Inf' => 1.0, 'sum' => 20.0}
+      }
+      expect(histogram_with_labels.values).to eql(expected_values)
+      expect(histogram.values).to eql(expected_values)
+    end
+  end
 end
diff --git a/spec/prometheus/client/summary_spec.rb b/spec/prometheus/client/summary_spec.rb
@@ -61,12 +61,6 @@
           end.to change { summary.get(labels: { test: 'value' })["count"] }
         end.to_not change { summary.get(labels: { test: 'other' })["count"] }
       end
-
-      it 'can pre-set labels using `with_labels`' do
-        expect { summary.observe(2) }
-          .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
-        expect { summary.with_labels(test: 'value').observe(2) }.not_to raise_error
-      end
     end
 
     context "with non-string label values" do
@@ -152,4 +146,27 @@
       end
     end
   end
+
+  describe '#with_labels' do
+    let(:expected_labels) { [:foo] }
+
+    it 'pre-sets labels for observations' do
+      expect { summary.observe(2) }
+        .to raise_error(Prometheus::Client::LabelSetValidator::InvalidLabelSetError)
+      expect { summary.with_labels(foo: 'value').observe(2) }.not_to raise_error
+    end
+
+    it 'registers `with_labels` observations in the original metric store' do
+      summary.observe(1, labels: { foo: 'value1'})
+      summary_with_labels = summary.with_labels({ foo: 'value2'})
+      summary_with_labels.observe(2)
+
+      expected_values = {
+        {foo: 'value1'} => { 'count' => 1.0, 'sum' => 1.0 },
+        {foo: 'value2'} => { 'count' => 1.0, 'sum' => 2.0 }
+      }
+      expect(summary_with_labels.values).to eql(expected_values)
+      expect(summary.values).to eql(expected_values)
+    end
+  end
 end