diff --git a/CHANGELOG.md b/CHANGELOG.md index b22f00e..68b8dc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 3.4.1 + - Fix the 3MB size limit on YAML dictionary files [#97](https://github.com/logstash-plugins/logstash-filter-translate/pull/97) + ## 3.4.0 - Refactor: leverage scheduler mixin [#93](https://github.com/logstash-plugins/logstash-filter-translate/pull/93) diff --git a/docs/index.asciidoc b/docs/index.asciidoc index 0208d1e..1edd8cc 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -108,6 +108,7 @@ This plugin supports the following configuration options plus the <> |<>|Yes | <> |<>|No | <> |<>|No +| <> |<>|No |======================================================================= Also see <> for a list of options supported by all @@ -421,5 +422,15 @@ The target field you wish to populate with the translated code. If you set this value to the same value as `source` field, the plugin does a substitution, and the filter will succeed. This will clobber the old value of the source field! + +[id="plugins-{type}s-{plugin}-yaml_dictionary_code_point_limit"] +===== `yaml_dictionary_code_point_limit` + +* Value type is <> +* Default value is 134217728 (128MB for 1-byte code points) + +The maximum number of code points allowed in the YAML file in `dictionary_path`. Please be aware that the byte limit depends on the encoding. +This setting applies to YAML files only. A YAML file over the limit raises an exception.
+ [id="plugins-{type}s-{plugin}-common-options"] include::{include_path}/{type}.asciidoc[] diff --git a/lib/logstash/filters/dictionary/file.rb b/lib/logstash/filters/dictionary/file.rb index a520e6c..a22a3a2 100644 --- a/lib/logstash/filters/dictionary/file.rb +++ b/lib/logstash/filters/dictionary/file.rb @@ -9,9 +9,9 @@ class File include LogStash::Util::Loggable - def self.create(path, refresh_interval, refresh_behaviour, exact, regex) + def self.create(path, refresh_interval, refresh_behaviour, exact, regex, **file_type_args) if /\.y[a]?ml$/.match(path) - instance = YamlFile.new(path, refresh_interval, exact, regex) + instance = YamlFile.new(path, refresh_interval, exact, regex, **file_type_args) elsif path.end_with?(".json") instance = JsonFile.new(path, refresh_interval, exact, regex) elsif path.end_with?(".csv") @@ -31,7 +31,7 @@ def self.create(path, refresh_interval, refresh_behaviour, exact, regex) attr_reader :dictionary, :fetch_strategy - def initialize(path, refresh_interval, exact, regex) + def initialize(path, refresh_interval, exact, regex, **file_type_args) @dictionary_path = path @refresh_interval = refresh_interval @short_refresh = @refresh_interval <= 300 @@ -39,7 +39,7 @@ def initialize(path, refresh_interval, exact, regex) @write_lock = rw_lock.writeLock @dictionary = Hash.new @update_method = method(:merge_dictionary) - initialize_for_file_type + initialize_for_file_type(**file_type_args) args = [@dictionary, rw_lock] klass = case when exact && regex then FetchStrategy::File::ExactRegex @@ -55,7 +55,7 @@ def load_dictionary(raise_exception=false) @dictionary_mtime = ::File.mtime(@dictionary_path).to_f @update_method.call rescue Errno::ENOENT - @logger.warn("dictionary file read failure, continuing with old dictionary", :path => @dictionary_path) + logger.warn("dictionary file read failure, continuing with old dictionary", :path => @dictionary_path) rescue => e loading_exception(e, raise_exception) end @@ -68,7 +68,7 @@ def 
set_update_strategy(method_sym) protected - def initialize_for_file_type + def initialize_for_file_type(**file_type_args) # sub class specific initializer end @@ -120,7 +120,7 @@ def loading_exception(e, raise_exception) dfe.set_backtrace(e.backtrace) raise dfe else - @logger.warn("#{msg}, continuing with old dictionary", :dictionary_path => @dictionary_path) + logger.warn("#{msg}, continuing with old dictionary", :dictionary_path => @dictionary_path) end end end diff --git a/lib/logstash/filters/dictionary/json_file.rb b/lib/logstash/filters/dictionary/json_file.rb index 29e0bf5..610c69a 100644 --- a/lib/logstash/filters/dictionary/json_file.rb +++ b/lib/logstash/filters/dictionary/json_file.rb @@ -6,9 +6,6 @@ class JsonFile < File protected - def initialize_for_file_type - end - def read_file_into_dictionary content = IO.read(@dictionary_path, :mode => 'r:bom|utf-8') @dictionary.update(LogStash::Json.load(content)) unless content.nil? || content.empty? diff --git a/lib/logstash/filters/dictionary/yaml_file.rb b/lib/logstash/filters/dictionary/yaml_file.rb index cc49128..5bef3ca 100644 --- a/lib/logstash/filters/dictionary/yaml_file.rb +++ b/lib/logstash/filters/dictionary/yaml_file.rb @@ -7,18 +7,20 @@ class YamlFile < File protected - def initialize_for_file_type + def initialize_for_file_type(**file_type_args) @visitor = YamlVisitor.create + + @parser = Psych::Parser.new(Psych::TreeBuilder.new) + @parser.code_point_limit = file_type_args[:yaml_code_point_limit] end def read_file_into_dictionary # low level YAML read that tries to create as # few intermediate objects as possible # this overwrites the value at key - @visitor.accept_with_dictionary( - @dictionary, Psych.parse_stream( - IO.read(@dictionary_path, :mode => 'r:bom|utf-8') - )) + yaml_string = IO.read(@dictionary_path, :mode => 'r:bom|utf-8') + @parser.parse(yaml_string, @dictionary_path) + @visitor.accept_with_dictionary(@dictionary, @parser.handler.root) end end end end end diff --git 
a/lib/logstash/filters/translate.rb b/lib/logstash/filters/translate.rb index 6876d58..f45ee1c 100644 --- a/lib/logstash/filters/translate.rb +++ b/lib/logstash/filters/translate.rb @@ -102,6 +102,12 @@ class Translate < LogStash::Filters::Base # as the original text, and the second column as the replacement. config :dictionary_path, :validate => :path + # The maximum number of code points allowed in the YAML file in `dictionary_path`. Please be aware that the byte limit depends on the encoding. + # Snakeyaml 1.33 has a default limit of 3MB. A YAML file over the limit raises an exception. JSON and CSV currently do not have such a limit. + # The limit could be too small in some use cases. Set a bigger number in `yaml_dictionary_code_point_limit` to relax the restriction. + # The default value is 134217728 (128MB for code points of size 1 byte). + config :yaml_dictionary_code_point_limit, :validate => :number + # When using a dictionary file, this setting will indicate how frequently # (in seconds) logstash will check the dictionary file for updates. config :refresh_interval, :validate => :number, :default => 300 @@ -180,8 +186,22 @@ def register ) end + # check and set yaml code point limit + # set lookup dictionary if @dictionary_path - @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex) + if yaml_file?(@dictionary_path) + @yaml_dictionary_code_point_limit ||= 134_217_728 + + if @yaml_dictionary_code_point_limit <= 0 + raise LogStash::ConfigurationError, "Please set a positive number in `yaml_dictionary_code_point_limit => #{@yaml_dictionary_code_point_limit}`." 
+ else + @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, yaml_code_point_limit: @yaml_dictionary_code_point_limit) + end + elsif @yaml_dictionary_code_point_limit != nil + raise LogStash::ConfigurationError, "Please remove `yaml_dictionary_code_point_limit` for dictionary file in JSON or CSV format" + else + @lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex) + end else @lookup = Dictionary::Memory.new(@dictionary, @exact, @regex) end @@ -245,5 +265,9 @@ def filter(event) @logger.error("Something went wrong when attempting to translate from dictionary", :exception => e, :source => @source, :event => event.to_hash) end end # def filter + + def yaml_file?(path) + /\.y[a]?ml$/.match(path) + end end # class LogStash::Filters::Translate end end diff --git a/logstash-filter-translate.gemspec b/logstash-filter-translate.gemspec index d82a5db..98d60b0 100644 --- a/logstash-filter-translate.gemspec +++ b/logstash-filter-translate.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'logstash-filter-translate' - s.version = '3.4.0' + s.version = '3.4.1' s.licenses = ['Apache License (2.0)'] s.summary = "Replaces field contents based on a hash or YAML file" s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. 
This gem is not a stand-alone program" @@ -25,6 +25,7 @@ Gem::Specification.new do |s| s.add_runtime_dependency 'logstash-mixin-validator_support', '~> 1.0' s.add_runtime_dependency 'logstash-mixin-deprecation_logger_support', '~> 1.0' s.add_runtime_dependency "logstash-mixin-scheduler", '~> 1.0' + s.add_runtime_dependency "psych", ">= 5.1.0" s.add_development_dependency 'logstash-devutils' s.add_development_dependency 'rspec-sequencing' diff --git a/spec/filters/scheduling_spec.rb b/spec/filters/scheduling_spec.rb index 1d9b202..ed706f4 100644 --- a/spec/filters/scheduling_spec.rb +++ b/spec/filters/scheduling_spec.rb @@ -57,9 +57,9 @@ end end .then_after(1.2, "wait then translate again") do - subject.filter(event) try(5) do - wait(0.1).for{event.get("[translation]")}.to eq("12"), "field [translation] did not eq '12'" + subject.filter(event) + wait(0.5).for{event.get("[translation]")}.to eq("12"), "field [translation] did not eq '12'" end end .then("stop") do @@ -88,9 +88,9 @@ end end .then_after(1.2, "wait then translate again") do - subject.filter(event) try(5) do - wait(0.1).for{event.get("[translation]")}.to eq("22"), "field [translation] did not eq '22'" + subject.filter(event) + wait(0.5).for{event.get("[translation]")}.to eq("22"), "field [translation] did not eq '22'" end end .then("stop") do diff --git a/spec/filters/translate_spec.rb b/spec/filters/translate_spec.rb index 9a57e70..0866b3a 100644 --- a/spec/filters/translate_spec.rb +++ b/spec/filters/translate_spec.rb @@ -240,6 +240,85 @@ def self.build_fixture_path(filename) end end + describe "when using a yml dictionary with code point limit" do + let(:config) do + { + "source" => "status", + "target" => "translation", + "dictionary_path" => dictionary_path, + "yaml_dictionary_code_point_limit" => dictionary_size # the file is 18 bytes + } + end + let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.yml") } + let(:event) { LogStash::Event.new("status" => "a") } + + context "dictionary is 
over limit" do + let(:dictionary_size) { 17 } + + it "raises exception" do + expect { subject.register }.to raise_error(/The incoming YAML document exceeds/) + end + end + + context "dictionary is within limit" do + let(:dictionary_size) { 18 } + + it "returns the exact translation" do + subject.register + subject.filter(event) + expect(event.get("translation")).to eq(1) + end + end + + context "limit set to zero" do + let(:dictionary_size) { 0 } + + it "raises configuration exception" do + expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please set a positive number/) + end + end + + context "limit is unset" do + let(:config) do + { + "source" => "status", + "target" => "translation", + "dictionary_path" => dictionary_path, + } + end + + it "sets the limit to 128MB" do + subject.register + expect(subject.instance_variable_get(:@yaml_dictionary_code_point_limit)).to eq(134_217_728) + end + end + + context "dictionary is json and limit is set" do + let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.json") } + let(:dictionary_size) { 100 } + + it "raises configuration exception" do + expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please remove `yaml_dictionary_code_point_limit` for dictionary file in JSON or CSV format/) + end + end + + context "dictionary is json and limit is unset" do + let(:config) do + { + "source" => "status", + "target" => "translation", + "dictionary_path" => TranslateUtil.build_fixture_path("dict.json"), + } + end + + it "returns the exact translation" do + subject.register + subject.filter(event) + expect(event.get("translation")).to eq(10) + end + end + end + context "when using a map tagged yml file" do let(:dictionary_path) { TranslateUtil.build_fixture_path("tag-map-dict.yml") } let(:event) { LogStash::Event.new("status" => "six") }