From 7ef4b48bad01273189949882ce4d81eddbf7f73d Mon Sep 17 00:00:00 2001 From: Kaise Cheng Date: Tue, 9 May 2023 14:37:03 +0100 Subject: [PATCH] added setting `dictionary_file_max_bytes` to configure the maximum size in bytes of the YAML file in `dictionary_path`, to overcome the 3MB size limit from SnakeYAML 1.33 Fixed: #96 --- CHANGELOG.md | 3 ++ lib/logstash/filters/dictionary/file.rb | 10 ++--- lib/logstash/filters/dictionary/json_file.rb | 3 -- lib/logstash/filters/dictionary/yaml_file.rb | 12 +++--- lib/logstash/filters/translate.rb | 11 +++++- logstash-filter-translate.gemspec | 3 +- spec/filters/translate_spec.rb | 39 ++++++++++++++++++++ 7 files changed, 66 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b22f00e..43236ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 3.4.1 + - Fix: YAML dictionary files larger than 3MB can now be loaded by raising the new `dictionary_file_max_bytes` setting + ## 3.4.0 - Refactor: leverage scheduler mixin [#93](https://github.com/logstash-plugins/logstash-filter-translate/pull/93) diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index a520e6c..1ff3536 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -9,9 +9,9 @@ class File include LogStash::Util::Loggable - def self.create(path, refresh_interval, refresh_behaviour, exact, regex) + def self.create(path, refresh_interval, refresh_behaviour, exact, regex, params) if /\.y[a]?ml$/.match(path) - instance = YamlFile.new(path, refresh_interval, exact, regex) + instance = YamlFile.new(path, refresh_interval, exact, regex, params["dictionary_file_max_bytes"]) elsif path.end_with?(".json") instance = JsonFile.new(path, refresh_interval, exact, regex) elsif path.end_with?(".csv") @@ -31,7 +31,7 @@ def self.create(path, refresh_interval, refresh_behaviour, exact, regex) attr_reader :dictionary, :fetch_strategy - def initialize(path, refresh_interval, 
exact, regex, file_max_bytes = nil) @dictionary_path = path @refresh_interval = refresh_interval @short_refresh = @refresh_interval <= 300 @@ -39,7 +39,7 @@ def initialize(path, refresh_interval, exact, regex) @write_lock = rw_lock.writeLock @dictionary = Hash.new @update_method = method(:merge_dictionary) - initialize_for_file_type + initialize_for_file_type(file_max_bytes) args = [@dictionary, rw_lock] klass = case when exact && regex then FetchStrategy::File::ExactRegex @@ -68,7 +68,7 @@ def set_update_strategy(method_sym) protected - def initialize_for_file_type + def initialize_for_file_type(file_max_bytes) # sub class specific initializer end diff --git a/lib/logstash/filters/dictionary/json_file.rb b/lib/logstash/filters/dictionary/json_file.rb index 29e0bf5..610c69a 100644 --- a/lib/logstash/filters/dictionary/json_file.rb +++ b/lib/logstash/filters/dictionary/json_file.rb @@ -6,9 +6,6 @@ class JsonFile < File protected - def initialize_for_file_type - end - def read_file_into_dictionary content = IO.read(@dictionary_path, :mode => 'r:bom|utf-8') @dictionary.update(LogStash::Json.load(content)) unless content.nil? || content.empty? 
diff --git a/lib/logstash/filters/dictionary/yaml_file.rb b/lib/logstash/filters/dictionary/yaml_file.rb index cc49128..52fc6f2 100644 --- a/lib/logstash/filters/dictionary/yaml_file.rb +++ b/lib/logstash/filters/dictionary/yaml_file.rb @@ -7,18 +7,20 @@ class YamlFile < File protected - def initialize_for_file_type + def initialize_for_file_type(file_max_bytes) @visitor = YamlVisitor.create + + @parser = Psych::Parser.new(Psych::TreeBuilder.new) + @parser.code_point_limit = file_max_bytes end def read_file_into_dictionary # low level YAML read that tries to create as # few intermediate objects as possible # this overwrites the value at key - @visitor.accept_with_dictionary( - @dictionary, Psych.parse_stream( - IO.read(@dictionary_path, :mode => 'r:bom|utf-8') - )) + yaml_string = IO.read(@dictionary_path, :mode => 'r:bom|utf-8') + @parser.parse(yaml_string, @dictionary_path) + @visitor.accept_with_dictionary(@dictionary, @parser.handler.root) end end end end end diff --git a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index 6876d58..6a326f9 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -102,6 +102,11 @@ class Translate < LogStash::Filters::Base # as the original text, and the second column as the replacement. config :dictionary_path, :validate => :path + # The maximum size, in bytes, of the file in `dictionary_path`. This setting applies to YAML files only. + # SnakeYAML 1.33 has a default limit of 3MB; a YAML file over the limit raises an exception. JSON and CSV files currently have no such limit. + # The default may be too small for some use cases; set a larger number in `dictionary_file_max_bytes` to relax the restriction. + config :dictionary_file_max_bytes, :validate => :number, :default => 3_145_728 + # When using a dictionary file, this setting will indicate how frequently # (in seconds) logstash will check the dictionary file for updates. 
config :refresh_interval, :validate => :number, :default => 300 @@ -180,8 +185,12 @@ def register ) end + if @dictionary_path && @dictionary_file_max_bytes <= 0 + raise LogStash::ConfigurationError, "Please set a positive number in `dictionary_file_max_bytes => #{@dictionary_file_max_bytes}`." + end + if @dictionary_path - @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex) + @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, params) else @lookup = Dictionary::Memory.new(@dictionary, @exact, @regex) end diff --git a/logstash-filter-translate.gemspec b/logstash-filter-translate.gemspec index d82a5db..98d60b0 100644 --- a/logstash-filter-translate.gemspec +++ b/logstash-filter-translate.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'logstash-filter-translate' - s.version = '3.4.0' + s.version = '3.4.1' s.licenses = ['Apache License (2.0)'] s.summary = "Replaces field contents based on a hash or YAML file" s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. 
This gem is not a stand-alone program" @@ -25,6 +25,7 @@ Gem::Specification.new do |s| s.add_runtime_dependency 'logstash-mixin-validator_support', '~> 1.0' s.add_runtime_dependency 'logstash-mixin-deprecation_logger_support', '~> 1.0' s.add_runtime_dependency "logstash-mixin-scheduler", '~> 1.0' + s.add_runtime_dependency "psych", ">= 5.1.0" s.add_development_dependency 'logstash-devutils' s.add_development_dependency 'rspec-sequencing' diff --git a/spec/filters/translate_spec.rb b/spec/filters/translate_spec.rb index 9a57e70..ed566d4 100644 --- a/spec/filters/translate_spec.rb +++ b/spec/filters/translate_spec.rb @@ -240,6 +240,45 @@ def self.build_fixture_path(filename) end end + describe "when using a yml file with size limit" do + let(:config) do + { + "source" => "status", + "target" => "translation", + "dictionary_path" => dictionary_path, + "dictionary_file_max_bytes" => dictionary_size # the file is 18 bytes + } + end + let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.yml") } + let(:event) { LogStash::Event.new("status" => "a") } + + context "file is over size limit" do + let(:dictionary_size) { 17 } + + it "raises exception" do + expect { subject.register }.to raise_error(/The incoming YAML document exceeds/) + end + end + + context "file is within size limit" do + let(:dictionary_size) { 18 } + + it "returns the exact translation" do + subject.register + subject.filter(event) + expect(event.get("translation")).to eq(1) + end + end + + context "file size set to zero" do + let(:dictionary_size) { 0 } + + it "raises configuration exception" do + expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please set a positive number/) + end + end + end + context "when using a map tagged yml file" do let(:dictionary_path) { TranslateUtil.build_fixture_path("tag-map-dict.yml") } let(:event) { LogStash::Event.new("status" => "six") }