From 7ef4b48bad01273189949882ce4d81eddbf7f73d Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Tue, 9 May 2023 14:37:03 +0100 Subject: [PATCH 01/20] added setting `dictionary_file_max_bytes` to config the maximum bytes size of the yaml file in `dictionary_path` to overcome the 3MB size limit from SnakeYaml 1.33 Fixed: #96 --- CHANGELOG.md | 3 ++ lib/logstash/filters/dictionary/file.rb | 10 ++--- lib/logstash/filters/dictionary/json_file.rb | 3 -- lib/logstash/filters/dictionary/yaml_file.rb | 12 +++--- lib/logstash/filters/translate.rb | 11 +++++- logstash-filter-translate.gemspec | 3 +- spec/filters/translate_spec.rb | 39 ++++++++++++++++++++ 7 files changed, 66 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b22f00e..43236ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 3.4.1 + - Fix the limitation of the size of yaml file that exceeds 3MB + ## 3.4.0 - Refactor: leverage scheduler mixin [#93](https://github.com/logstash-plugins/logstash-filter-translate/pull/93) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index a520e6c..1ff3536 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -9,9 +9,9 @@ class File include LogStash::Util::Loggable - def self.create(path, refresh_interval, refresh_behaviour, exact, regex) + def self.create(path, refresh_interval, refresh_behaviour, exact, regex, params) if /\.y[a]?ml$/.match(path) - instance = YamlFile.new(path, refresh_interval, exact, regex) + instance = YamlFile.new(path, refresh_interval, exact, regex, params["dictionary_file_max_bytes"]) elsif path.end_with?(".json") instance = JsonFile.new(path, refresh_interval, exact, regex) elsif path.end_with?(".csv") @@ -31,7 +31,7 @@ def self.create(path, refresh_interval, refresh_behaviour, exact, regex) attr_reader :dictionary, :fetch_strategy - def initialize(path, refresh_interval, exact, regex) + def initialize(path, refresh_interval, exact, regex, file_max_bytes = nil) @dictionary_path = path @refresh_interval = refresh_interval @short_refresh = @refresh_interval <= 300 @@ -39,7 +39,7 @@ def initialize(path, refresh_interval, exact, regex) @write_lock = rw_lock.writeLock @dictionary = Hash.new @update_method = method(:merge_dictionary) - initialize_for_file_type + initialize_for_file_type(file_max_bytes) args = [@dictionary, rw_lock] klass = case when exact && regex then FetchStrategy::File::ExactRegex @@ -68,7 +68,7 @@ def set_update_strategy(method_sym) protected - def initialize_for_file_type + def initialize_for_file_type(file_max_bytes) # sub class specific initializer end diff --git a/lib/logstash/filters/dictionary/json_file.rb b/lib/logstash/filters/dictionary/json_file.rb index 29e0bf5..610c69a 100644 --- a/lib/logstash/filters/dictionary/json_file.rb +++ b/lib/logstash/filters/dictionary/json_file.rb @@ -6,9 +6,6 @@ class JsonFile < File protected - def initialize_for_file_type - end - def read_file_into_dictionary content = IO.read(@dictionary_path, :mode => 'r:bom|utf-8') @dictionary.update(LogStash::Json.load(content)) unless content.nil? || content.empty? diff --git a/lib/logstash/filters/dictionary/yaml_file.rb b/lib/logstash/filters/dictionary/yaml_file.rb index cc49128..52fc6f2 100644 --- a/lib/logstash/filters/dictionary/yaml_file.rb +++ b/lib/logstash/filters/dictionary/yaml_file.rb @@ -7,18 +7,20 @@ class YamlFile < File protected - def initialize_for_file_type + def initialize_for_file_type(file_max_bytes) @visitor = YamlVisitor.create + + @parser = Psych::Parser.new(Psych::TreeBuilder.new) + @parser.code_point_limit = file_max_bytes end def read_file_into_dictionary # low level YAML read that tries to create as # few intermediate objects as possible # this overwrites the value at key - @visitor.accept_with_dictionary( - @dictionary, Psych.parse_stream( - IO.read(@dictionary_path, :mode => 'r:bom|utf-8') - )) + yaml_string = IO.read(@dictionary_path, :mode => 'r:bom|utf-8') + @parser.parse(yaml_string, @dictionary_path) + @visitor.accept_with_dictionary(@dictionary, @parser.handler.root) end end end end end diff --git a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index 6876d58..6a326f9 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -102,6 +102,11 @@ class Translate < LogStash::Filters::Base # as the original text, and the second column as the replacement. config :dictionary_path, :validate => :path + # Setting the maximum bytes size of the file in `dictionary_path`. This setting is effective for YAML file only. + # Snakeyaml 1.33 has a default limit 3MB. YAML file over the limit throws exception. JSON and CSV currently do not have such limit. + # The limit could be too small in some use cases. Setting a bigger number in `dictionary_file_max_bytes` to relax the restriction. + config :dictionary_file_max_bytes, :validate => :number, :default => 3_145_728 + # When using a dictionary file, this setting will indicate how frequently # (in seconds) logstash will check the dictionary file for updates. config :refresh_interval, :validate => :number, :default => 300 @@ -180,8 +185,12 @@ def register ) end + if @dictionary_path && @dictionary_file_max_bytes <= 0 + raise LogStash::ConfigurationError, "Please set a positive number in `dictionary_file_max_bytes => #{@dictionary_file_max_bytes}`." + end + if @dictionary_path - @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex) + @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, params) else @lookup = Dictionary::Memory.new(@dictionary, @exact, @regex) end diff --git a/logstash-filter-translate.gemspec b/logstash-filter-translate.gemspec index d82a5db..98d60b0 100644 --- a/logstash-filter-translate.gemspec +++ b/logstash-filter-translate.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'logstash-filter-translate' - s.version = '3.4.0' + s.version = '3.4.1' s.licenses = ['Apache License (2.0)'] s.summary = "Replaces field contents based on a hash or YAML file" s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program" @@ -25,6 +25,7 @@ Gem::Specification.new do |s| s.add_runtime_dependency 'logstash-mixin-validator_support', '~> 1.0' s.add_runtime_dependency 'logstash-mixin-deprecation_logger_support', '~> 1.0' s.add_runtime_dependency "logstash-mixin-scheduler", '~> 1.0' + s.add_runtime_dependency "psych", ">= 5.1.0" s.add_development_dependency 'logstash-devutils' s.add_development_dependency 'rspec-sequencing' diff --git a/spec/filters/translate_spec.rb b/spec/filters/translate_spec.rb index 9a57e70..ed566d4 100644 --- a/spec/filters/translate_spec.rb +++ b/spec/filters/translate_spec.rb @@ -240,6 +240,45 @@ def self.build_fixture_path(filename) end end + describe "when using a yml file with size limit" do + let(:config) do + { + "source" => "status", + "target" => "translation", + "dictionary_path" => dictionary_path, + "dictionary_file_max_bytes" => dictionary_size # the file is 18 bytes + } + end + let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.yml") } + let(:event) { LogStash::Event.new("status" => "a") } + + context "file is over size limit" do + let(:dictionary_size) { 17 } + + it "raises exception" do + expect { subject.register }.to raise_error(/The incoming YAML document exceeds/) + end + end + + context "file is within size limit" do + let(:dictionary_size) { 18 } + + it "returns the exact translation" do + subject.register + subject.filter(event) + expect(event.get("translation")).to eq(1) + end + end + + context "file size set to zero" do + let(:dictionary_size) { 0 } + + it "raises configuration exception" do + expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please set a positive number/) + end + end + end + context "when using a map tagged yml file" do let(:dictionary_path) { TranslateUtil.build_fixture_path("tag-map-dict.yml") } let(:event) { LogStash::Event.new("status" => "six") } From 39e4cc869b29a824ccbb30c38013b3daeefedd11 Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Tue, 9 May 2023 14:54:54 +0100 Subject: [PATCH 02/20] fix flaky test in main --- spec/filters/scheduling_spec.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/filters/scheduling_spec.rb b/spec/filters/scheduling_spec.rb index 1d9b202..d7bd5a7 100644 --- a/spec/filters/scheduling_spec.rb +++ b/spec/filters/scheduling_spec.rb @@ -56,10 +56,10 @@ file.puts("a,11\nb,12\nc,13\n") end end - .then_after(1.2, "wait then translate again") do + .then_after(2, "wait then translate again") do subject.filter(event) try(5) do - wait(0.1).for{event.get("[translation]")}.to eq("12"), "field [translation] did not eq '12'" + wait(0.5).for{event.get("[translation]")}.to eq("12"), "field [translation] did not eq '12'" end end .then("stop") do @@ -87,10 +87,10 @@ file.puts("a,21\nb,22\nc,23\n") end end - .then_after(1.2, "wait then translate again") do + .then_after(2, "wait then translate again") do subject.filter(event) try(5) do - wait(0.1).for{event.get("[translation]")}.to eq("22"), "field [translation] did not eq '22'" + wait(0.5).for{event.get("[translation]")}.to eq("22"), "field [translation] did not eq '22'" end end .then("stop") do From fd94e8b8091b520fb09a80924543b9c6312ca2a9 Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Tue, 9 May 2023 14:57:49 +0100 Subject: [PATCH 03/20] fix flaky test in main --- spec/filters/scheduling_spec.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/filters/scheduling_spec.rb b/spec/filters/scheduling_spec.rb index d7bd5a7..ed706f4 100644 --- a/spec/filters/scheduling_spec.rb +++ b/spec/filters/scheduling_spec.rb @@ -56,9 +56,9 @@ file.puts("a,11\nb,12\nc,13\n") end end - .then_after(2, "wait then translate again") do - subject.filter(event) + .then_after(1.2, "wait then translate again") do try(5) do + subject.filter(event) wait(0.5).for{event.get("[translation]")}.to eq("12"), "field [translation] did not eq '12'" end end @@ -87,9 +87,9 @@ file.puts("a,21\nb,22\nc,23\n") end end - .then_after(2, "wait then translate again") do - subject.filter(event) + .then_after(1.2, "wait then translate again") do try(5) do + subject.filter(event) wait(0.5).for{event.get("[translation]")}.to eq("22"), "field [translation] did not eq '22'" end end From 5a9d95a2c9e5c060040277e974a873d549186752 Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Tue, 9 May 2023 15:06:31 +0100 Subject: [PATCH 04/20] add doc --- CHANGELOG.md | 2 +- docs/index.asciidoc | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43236ec..68b8dc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ ## 3.4.1 - - Fix the limitation of the size of yaml file that exceeds 3MB + - Fix the limitation of the size of yaml file that exceeds 3MB [#97](https://github.com/logstash-plugins/logstash-filter-translate/pull/97) ## 3.4.0 - Refactor: leverage scheduler mixin [#93](https://github.com/logstash-plugins/logstash-filter-translate/pull/93) diff --git a/docs/index.asciidoc b/docs/index.asciidoc index 0208d1e..b3666cf 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -96,6 +96,7 @@ This plugin supports the following configuration options plus the <> |<>|No | <> |<>|No +| <> |<>|No | <> |a valid filesystem path|No | <> |<>|No | <> |<>|No @@ -149,6 +150,15 @@ Example: NOTE: It is an error to specify both `dictionary` and `dictionary_path`. +[id="plugins-{type}s-{plugin}-dictionary_file_max_bytes"] +===== `dictionary_file_max_bytes` + + * Value type is <> + * Default value is 3145728 (3MB) + +The maximum size of the file in `dictionary_path`. This setting is effective for YAML file only. +YAML over the limit throws exception. JSON and CSV currently do not have such restriction. + [id="plugins-{type}s-{plugin}-dictionary_path"] ===== `dictionary_path` From f2a95deb65b7f05aee6382775d43fefdf4c68274 Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Wed, 10 May 2023 17:08:02 +0100 Subject: [PATCH 05/20] set default dictionary file size to 128MB --- docs/index.asciidoc | 2 +- lib/logstash/filters/translate.rb | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/index.asciidoc b/docs/index.asciidoc index b3666cf..46b8206 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -154,7 +154,7 @@ NOTE: It is an error to specify both `dictionary` and `dictionary_path`. ===== `dictionary_file_max_bytes` * Value type is <> - * Default value is 3145728 (3MB) + * Default value is 134217728 (128MB) The maximum size of the file in `dictionary_path`. This setting is effective for YAML file only. YAML over the limit throws exception. JSON and CSV currently do not have such restriction. diff --git a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index 6a326f9..09e16ad 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -105,7 +105,8 @@ class Translate < LogStash::Filters::Base # Setting the maximum bytes size of the file in `dictionary_path`. This setting is effective for YAML file only. # Snakeyaml 1.33 has a default limit 3MB. YAML file over the limit throws exception. JSON and CSV currently do not have such limit. # The limit could be too small in some use cases. Setting a bigger number in `dictionary_file_max_bytes` to relax the restriction. - config :dictionary_file_max_bytes, :validate => :number, :default => 3_145_728 + # The default value is 128MB + config :dictionary_file_max_bytes, :validate => :number, :default => 134_217_728 # When using a dictionary file, this setting will indicate how frequently # (in seconds) logstash will check the dictionary file for updates. From a282e1993483d35e22bae077cf1c9fc52f0872f9 Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Thu, 11 May 2023 22:33:42 +0100 Subject: [PATCH 06/20] rename `yaml_dictionary_code_point_limit` and add checking in register --- docs/index.asciidoc | 21 ++++++----- lib/logstash/filters/dictionary/file.rb | 4 +- lib/logstash/filters/translate.rb | 25 ++++++++++--- spec/filters/translate_spec.rb | 50 ++++++++++++++++++++++--- 4 files changed, 77 insertions(+), 23 deletions(-) diff --git a/docs/index.asciidoc b/docs/index.asciidoc index 46b8206..9f7ce17 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -96,7 +96,6 @@ This plugin supports the following configuration options plus the <> |<>|No | <> |<>|No -| <> |<>|No | <> |a valid filesystem path|No | <> |<>|No | <> |<>|No @@ -109,6 +108,7 @@ This plugin supports the following configuration options plus the <> |<>|Yes | <> |<>|No | <> |<>|No +| <> |<>|No |======================================================================= Also see <> for a list of options supported by all @@ -150,15 +150,6 @@ Example: NOTE: It is an error to specify both `dictionary` and `dictionary_path`. -[id="plugins-{type}s-{plugin}-dictionary_file_max_bytes"] -===== `dictionary_file_max_bytes` - - * Value type is <> - * Default value is 134217728 (128MB) - -The maximum size of the file in `dictionary_path`. This setting is effective for YAML file only. -YAML over the limit throws exception. JSON and CSV currently do not have such restriction. - [id="plugins-{type}s-{plugin}-dictionary_path"] ===== `dictionary_path` @@ -431,5 +422,15 @@ The target field you wish to populate with the translated code. If you set this value to the same value as `source` field, the plugin does a substitution, and the filter will succeed. This will clobber the old value of the source field! + +[id="plugins-{type}s-{plugin}-yaml_dictionary_code_point_limit"] +===== `yaml_dictionary_code_point_limit` + +* Value type is <> +* Default value is 134217728 (128MB) + +The max amount of code points in the YAML file in `dictionary_path`. Please be aware that byte limit depends on the encoding. +This setting is effective for YAML file only. YAML over the limit throws exception. + [id="plugins-{type}s-{plugin}-common-options"] include::{include_path}/{type}.asciidoc[] diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 1ff3536..99346d2 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -9,9 +9,9 @@ class File include LogStash::Util::Loggable - def self.create(path, refresh_interval, refresh_behaviour, exact, regex, params) + def self.create(path, refresh_interval, refresh_behaviour, exact, regex, yaml_code_point_limit) if /\.y[a]?ml$/.match(path) - instance = YamlFile.new(path, refresh_interval, exact, regex, params["dictionary_file_max_bytes"]) + instance = YamlFile.new(path, refresh_interval, exact, regex, yaml_code_point_limit) elsif path.end_with?(".json") instance = JsonFile.new(path, refresh_interval, exact, regex) elsif path.end_with?(".csv") diff --git a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index 09e16ad..01577db 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -102,11 +102,11 @@ class Translate < LogStash::Filters::Base # as the original text, and the second column as the replacement. config :dictionary_path, :validate => :path - # Setting the maximum bytes size of the file in `dictionary_path`. This setting is effective for YAML file only. + # The max amount of code points in the YAML file in `dictionary_path`. Please be aware that byte limit depends on the encoding. # Snakeyaml 1.33 has a default limit 3MB. YAML file over the limit throws exception. JSON and CSV currently do not have such limit. - # The limit could be too small in some use cases. Setting a bigger number in `dictionary_file_max_bytes` to relax the restriction. + # The limit could be too small in some use cases. Setting a bigger number in `yaml_dictionary_code_point_limit` to relax the restriction. # The default value is 128MB - config :dictionary_file_max_bytes, :validate => :number, :default => 134_217_728 + config :yaml_dictionary_code_point_limit, :validate => :number # When using a dictionary file, this setting will indicate how frequently # (in seconds) logstash will check the dictionary file for updates. @@ -186,12 +186,21 @@ def register ) end - if @dictionary_path && @dictionary_file_max_bytes <= 0 - raise LogStash::ConfigurationError, "Please set a positive number in `dictionary_file_max_bytes => #{@dictionary_file_max_bytes}`." + # check and set yaml code point limit + if @dictionary_path + if yaml_file?(@dictionary_path) + @yaml_dictionary_code_point_limit ||= 134_217_728 + + if @yaml_dictionary_code_point_limit <= 0 + raise LogStash::ConfigurationError, "Please set a positive number in `yaml_dictionary_code_point_limit => #{@yaml_dictionary_code_point_limit}`." + end + elsif @yaml_dictionary_code_point_limit != nil + raise LogStash::ConfigurationError, "Please remove `yaml_dictionary_code_point_limit` for dictionary file in JSON or CSV format" + end end if @dictionary_path - @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, params) + @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, @yaml_dictionary_code_point_limit) else @lookup = Dictionary::Memory.new(@dictionary, @exact, @regex) end @@ -255,5 +264,9 @@ def filter(event) @logger.error("Something went wrong when attempting to translate from dictionary", :exception => e, :source => @source, :event => event.to_hash) end end # def filter + + def yaml_file?(path) + /\.y[a]?ml$/.match(path) + end end # class LogStash::Filters::Translate end end diff --git a/spec/filters/translate_spec.rb b/spec/filters/translate_spec.rb index ed566d4..0866b3a 100644 --- a/spec/filters/translate_spec.rb +++ b/spec/filters/translate_spec.rb @@ -240,19 +240,19 @@ def self.build_fixture_path(filename) end end - describe "when using a yml file with size limit" do + describe "when using a yml dictionary with code point limit" do let(:config) do { "source" => "status", "target" => "translation", "dictionary_path" => dictionary_path, - "dictionary_file_max_bytes" => dictionary_size # the file is 18 bytes + "yaml_dictionary_code_point_limit" => dictionary_size # the file is 18 bytes } end let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.yml") } let(:event) { LogStash::Event.new("status" => "a") } - context "file is over size limit" do + context "dictionary is over limit" do let(:dictionary_size) { 17 } it "raises exception" do @@ -260,7 +260,7 @@ def self.build_fixture_path(filename) end end - context "file is within size limit" do + context "dictionary is within limit" do let(:dictionary_size) { 18 } it "returns the exact translation" do @@ -270,13 +270,53 @@ def self.build_fixture_path(filename) end end - context "file size set to zero" do + context "limit set to zero" do let(:dictionary_size) { 0 } it "raises configuration exception" do expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please set a positive number/) end end + + context "limit is unset" do + let(:config) do + { + "source" => "status", + "target" => "translation", + "dictionary_path" => dictionary_path, + } + end + + it "sets the limit to 128MB" do + subject.register + expect(subject.instance_variable_get(:@yaml_dictionary_code_point_limit)).to eq(134_217_728) + end + end + + context "dictionary is json and limit is set" do + let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.json") } + let(:dictionary_size) { 100 } + + it "raises configuration exception" do + expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please remove `yaml_dictionary_code_point_limit` for dictionary file in JSON or CSV format/) + end + end + + context "dictionary is json and limit is unset" do + let(:config) do + { + "source" => "status", + "target" => "translation", + "dictionary_path" => TranslateUtil.build_fixture_path("dict.json"), + } + end + + it "returns the exact translation" do + subject.register + subject.filter(event) + expect(event.get("translation")).to eq(10) + end + end end context "when using a map tagged yml file" do From 2f8e90abbfb090a604d60bab9a03b351ae68ad5c Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 01:26:46 +0100 Subject: [PATCH 07/20] Update lib/logstash/filters/dictionary/file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/file.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 99346d2..0bcfdbe 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -31,7 +31,7 @@ def self.create(path, refresh_interval, refresh_behaviour, exact, regex, yaml_co attr_reader :dictionary, :fetch_strategy - def initialize(path, refresh_interval, exact, regex, file_max_bytes = nil) + def initialize(path, refresh_interval, exact, regex, yaml_code_point_limit = nil) @dictionary_path = path @refresh_interval = refresh_interval @short_refresh = @refresh_interval <= 300 From d78dfc6f88e3e5b74ddff69b6984b4c2bb27814c Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 01:26:54 +0100 Subject: [PATCH 08/20] Update lib/logstash/filters/dictionary/file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/file.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 0bcfdbe..6f03cb2 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -39,7 +39,7 @@ def initialize(path, refresh_interval, exact, regex, yaml_code_point_limit = nil @write_lock = rw_lock.writeLock @dictionary = Hash.new @update_method = method(:merge_dictionary) - initialize_for_file_type(file_max_bytes) + initialize_for_file_type(yaml_code_point_limit) args = [@dictionary, rw_lock] klass = case when exact && regex then FetchStrategy::File::ExactRegex From b385408db4842ae0ca7f921365703216855b3e2d Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 01:27:07 +0100 Subject: [PATCH 09/20] Update lib/logstash/filters/dictionary/yaml_file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/yaml_file.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/logstash/filters/dictionary/yaml_file.rb b/lib/logstash/filters/dictionary/yaml_file.rb index 52fc6f2..21967cf 100644 --- a/lib/logstash/filters/dictionary/yaml_file.rb +++ b/lib/logstash/filters/dictionary/yaml_file.rb @@ -7,11 +7,11 @@ class YamlFile < File protected - def initialize_for_file_type(file_max_bytes) + def initialize_for_file_type(yaml_code_point_limit) @visitor = YamlVisitor.create @parser = Psych::Parser.new(Psych::TreeBuilder.new) - @parser.code_point_limit = file_max_bytes + @parser.code_point_limit = yaml_code_point_limit end def read_file_into_dictionary From 9abf5a81eb663a643e17875e37ab20b10896ff6a Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 01:27:18 +0100 Subject: [PATCH 10/20] Update lib/logstash/filters/translate.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/translate.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index 01577db..1e7778e 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -105,7 +105,7 @@ class Translate < LogStash::Filters::Base # The max amount of code points in the YAML file in `dictionary_path`. Please be aware that byte limit depends on the encoding. # Snakeyaml 1.33 has a default limit 3MB. YAML file over the limit throws exception. JSON and CSV currently do not have such limit. # The limit could be too small in some use cases. Setting a bigger number in `yaml_dictionary_code_point_limit` to relax the restriction. - # The default value is 128MB + # The default value is 128MB for code points of size 1 byte config :yaml_dictionary_code_point_limit, :validate => :number # When using a dictionary file, this setting will indicate how frequently From 121c57d8e1824908f16bf13a118c3dac469ddce3 Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 01:27:28 +0100 Subject: [PATCH 11/20] Update lib/logstash/filters/dictionary/file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/file.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 6f03cb2..804aee0 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -68,7 +68,7 @@ def set_update_strategy(method_sym) protected - def initialize_for_file_type(file_max_bytes) + def initialize_for_file_type(yaml_code_point_limit) # sub class specific initializer end From e6eb949bf7d99cd9d98269cbafbd2055c8ee00d9 Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 01:27:37 +0100 Subject: [PATCH 12/20] Update docs/index.asciidoc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- docs/index.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.asciidoc b/docs/index.asciidoc index 9f7ce17..1edd8cc 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -427,7 +427,7 @@ the filter will succeed. This will clobber the old value of the source field! ===== `yaml_dictionary_code_point_limit` * Value type is <> -* Default value is 134217728 (128MB) +* Default value is 134217728 (128MB for 1 byte code points) The max amount of code points in the YAML file in `dictionary_path`. Please be aware that byte limit depends on the encoding. This setting is effective for YAML file only. YAML over the limit throws exception. From 9346f7cea531f2667e598fc7022365556e7d0dc7 Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Fri, 12 May 2023 01:45:05 +0100 Subject: [PATCH 13/20] fix logger --- lib/logstash/filters/dictionary/file.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 804aee0..2748dba 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -55,7 +55,7 @@ def load_dictionary(raise_exception=false) @dictionary_mtime = ::File.mtime(@dictionary_path).to_f @update_method.call rescue Errno::ENOENT - @logger.warn("dictionary file read failure, continuing with old dictionary", :path => @dictionary_path) + logger.warn("dictionary file read failure, continuing with old dictionary", :path => @dictionary_path) rescue => e loading_exception(e, raise_exception) end @@ -120,7 +120,7 @@ def loading_exception(e, raise_exception) dfe.set_backtrace(e.backtrace) raise dfe else - @logger.warn("#{msg}, continuing with old dictionary", :dictionary_path => @dictionary_path) + logger.warn("#{msg}, continuing with old dictionary", :dictionary_path => @dictionary_path) end end end From 3a535d08908d03aacab09e7347a2328c23c12bc1 Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 11:15:45 +0100 Subject: [PATCH 14/20] Update lib/logstash/filters/dictionary/file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/file.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 2748dba..477a74e 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -31,7 +31,7 @@ def self.create(path, refresh_interval, refresh_behaviour, exact, regex, yaml_co attr_reader :dictionary, :fetch_strategy - def initialize(path, refresh_interval, exact, regex, yaml_code_point_limit = nil) + def initialize(path, refresh_interval, exact, regex, **file_type_args) @dictionary_path = path @refresh_interval = refresh_interval @short_refresh = @refresh_interval <= 300 From 23c3d07631bb18aa70677c18b88aa2b70de49b4b Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 11:16:01 +0100 Subject: [PATCH 15/20] Update lib/logstash/filters/translate.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/translate.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index 1e7778e..c61dadc 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -193,14 +193,14 @@ def register if @yaml_dictionary_code_point_limit <= 0 raise LogStash::ConfigurationError, "Please set a positive number in `yaml_dictionary_code_point_limit => #{@yaml_dictionary_code_point_limit}`." + else + @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, yaml_code_point_limit: @yaml_dictionary_code_point_limit) end elsif @yaml_dictionary_code_point_limit != nil raise LogStash::ConfigurationError, "Please remove `yaml_dictionary_code_point_limit` for dictionary file in JSON or CSV format" + else + @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex) end - end - - if @dictionary_path - @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, @yaml_dictionary_code_point_limit) else @lookup = Dictionary::Memory.new(@dictionary, @exact, @regex) end From db859e0aaa4481bb36bd31f32b78acc55f28b387 Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 11:16:10 +0100 Subject: [PATCH 16/20] Update lib/logstash/filters/dictionary/yaml_file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/yaml_file.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/logstash/filters/dictionary/yaml_file.rb b/lib/logstash/filters/dictionary/yaml_file.rb index 21967cf..5bef3ca 100644 --- a/lib/logstash/filters/dictionary/yaml_file.rb +++ b/lib/logstash/filters/dictionary/yaml_file.rb @@ -7,11 +7,11 @@ class YamlFile < File protected - def initialize_for_file_type(yaml_code_point_limit) + def initialize_for_file_type(**file_type_args) @visitor = YamlVisitor.create @parser = Psych::Parser.new(Psych::TreeBuilder.new) - @parser.code_point_limit = yaml_code_point_limit + @parser.code_point_limit = file_type_args[:yaml_code_point_limit] end def read_file_into_dictionary From e4373625454ac09d4734e90094eb7c40ae50136f Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 11:16:15 +0100 Subject: [PATCH 17/20] Update lib/logstash/filters/dictionary/file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/file.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 477a74e..d72b343 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -39,7 +39,7 @@ def initialize(path, refresh_interval, exact, regex, **file_type_args) @write_lock = rw_lock.writeLock @dictionary = Hash.new @update_method = method(:merge_dictionary) - initialize_for_file_type(yaml_code_point_limit) + initialize_for_file_type(yaml_code_point_limit: file_type_args[:yaml_code_point_limit]) args = [@dictionary, rw_lock] klass = case when exact && regex then FetchStrategy::File::ExactRegex From 610f48d2cd75b59c795929f5e5325e0b8945e090 Mon Sep 17 00:00:00 2001 From: kaisecheng <69120390+kaisecheng@users.noreply.github.com> Date: Fri, 12 May 2023 11:16:24 +0100 Subject: [PATCH 18/20] Update lib/logstash/filters/dictionary/file.rb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: João Duarte --- lib/logstash/filters/dictionary/file.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index d72b343..1210161 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -68,7 +68,7 @@ def set_update_strategy(method_sym) protected - def initialize_for_file_type(yaml_code_point_limit) + def initialize_for_file_type(**file_type_args) # sub class specific initializer end From cf2b2ffe52395d3af7c185303bdde52f0f11930f Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Fri, 12 May 2023 14:35:18 +0100 Subject: [PATCH 19/20] fix optional hash arguments --- lib/logstash/filters/dictionary/file.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index 1210161..a22a3a2 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -9,9 +9,9 @@ class File include LogStash::Util::Loggable - def self.create(path, refresh_interval, refresh_behaviour, exact, regex, yaml_code_point_limit) + def self.create(path, refresh_interval, refresh_behaviour, exact, regex, **file_type_args) if /\.y[a]?ml$/.match(path) - instance = YamlFile.new(path, refresh_interval, exact, regex, yaml_code_point_limit) + instance = YamlFile.new(path, refresh_interval, exact, regex, file_type_args) elsif path.end_with?(".json") instance = JsonFile.new(path, refresh_interval, exact, regex) elsif path.end_with?(".csv") @@ -39,7 +39,7 @@ def initialize(path, refresh_interval, exact, regex, **file_type_args) @write_lock = rw_lock.writeLock @dictionary = Hash.new @update_method = method(:merge_dictionary) - initialize_for_file_type(yaml_code_point_limit: file_type_args[:yaml_code_point_limit]) + initialize_for_file_type(file_type_args) args = [@dictionary, rw_lock] klass = case when exact && regex then FetchStrategy::File::ExactRegex From 294c672c4d20759e69ad13aeb4b09e50b971977d Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Fri, 12 May 2023 15:05:03 +0100 Subject: [PATCH 20/20] add comment --- lib/logstash/filters/translate.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index c61dadc..f45ee1c 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -187,6 +187,7 @@ def register end # check and set yaml code point limit + # set lookup dictionary if @dictionary_path if yaml_file?(@dictionary_path) @yaml_dictionary_code_point_limit ||= 134_217_728