Skip to content

Commit

Permalink
Merge pull request #97 from kaisecheng/fix_yaml_size_limit
Browse files Browse the repository at this point in the history
Added the setting `yaml_dictionary_code_point_limit` to configure the maximum number of code points allowed in the YAML file in `dictionary_path`, overcoming the 3MB size limit introduced by SnakeYAML 1.33. This setting is only effective for YAML. A YAML file over the size limit throws an exception. JSON and CSV currently have no such restriction. The default value of `yaml_dictionary_code_point_limit` is 128MB.

Fixed: #96
  • Loading branch information
kaisecheng authored May 12, 2023
2 parents 6590dce + 294c672 commit b3e4bd5
Show file tree
Hide file tree
Showing 9 changed files with 138 additions and 21 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## 3.4.1
- Fix loading of YAML dictionary files larger than 3MB by adding the `yaml_dictionary_code_point_limit` setting [#97](https://github.com/logstash-plugins/logstash-filter-translate/pull/97)

## 3.4.0
- Refactor: leverage scheduler mixin [#93](https://github.com/logstash-plugins/logstash-filter-translate/pull/93)

Expand Down
11 changes: 11 additions & 0 deletions docs/index.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ This plugin supports the following configuration options plus the <<plugins-{typ
| <<plugins-{type}s-{plugin}-source>> |<<string,string>>|Yes
| <<plugins-{type}s-{plugin}-refresh_behaviour>> |<<string,string>>|No
| <<plugins-{type}s-{plugin}-target>> |<<string,string>>|No
| <<plugins-{type}s-{plugin}-yaml_dictionary_code_point_limit>> |<<number,number>>|No
|=======================================================================

Also see <<plugins-{type}s-{plugin}-common-options>> for a list of options supported by all
Expand Down Expand Up @@ -421,5 +422,15 @@ The target field you wish to populate with the translated code.
If you set this value to the same value as `source` field, the plugin does a substitution, and
the filter will succeed. This will clobber the old value of the source field!


[id="plugins-{type}s-{plugin}-yaml_dictionary_code_point_limit"]
===== `yaml_dictionary_code_point_limit`

* Value type is <<number,number>>
* Default value is 134217728 (128MB for 1 byte code points)

The maximum number of code points allowed in the YAML file in `dictionary_path`. Be aware that the corresponding byte limit depends on the encoding.
This setting applies to YAML files only. A YAML file over the limit throws an exception.

[id="plugins-{type}s-{plugin}-common-options"]
include::{include_path}/{type}.asciidoc[]
14 changes: 7 additions & 7 deletions lib/logstash/filters/dictionary/file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ class File

include LogStash::Util::Loggable

def self.create(path, refresh_interval, refresh_behaviour, exact, regex)
def self.create(path, refresh_interval, refresh_behaviour, exact, regex, **file_type_args)
if /\.y[a]?ml$/.match(path)
instance = YamlFile.new(path, refresh_interval, exact, regex)
instance = YamlFile.new(path, refresh_interval, exact, regex, file_type_args)
elsif path.end_with?(".json")
instance = JsonFile.new(path, refresh_interval, exact, regex)
elsif path.end_with?(".csv")
Expand All @@ -31,15 +31,15 @@ def self.create(path, refresh_interval, refresh_behaviour, exact, regex)

attr_reader :dictionary, :fetch_strategy

def initialize(path, refresh_interval, exact, regex)
def initialize(path, refresh_interval, exact, regex, **file_type_args)
@dictionary_path = path
@refresh_interval = refresh_interval
@short_refresh = @refresh_interval <= 300
rw_lock = java.util.concurrent.locks.ReentrantReadWriteLock.new
@write_lock = rw_lock.writeLock
@dictionary = Hash.new
@update_method = method(:merge_dictionary)
initialize_for_file_type
initialize_for_file_type(file_type_args)
args = [@dictionary, rw_lock]
klass = case
when exact && regex then FetchStrategy::File::ExactRegex
Expand All @@ -55,7 +55,7 @@ def load_dictionary(raise_exception=false)
@dictionary_mtime = ::File.mtime(@dictionary_path).to_f
@update_method.call
rescue Errno::ENOENT
@logger.warn("dictionary file read failure, continuing with old dictionary", :path => @dictionary_path)
logger.warn("dictionary file read failure, continuing with old dictionary", :path => @dictionary_path)
rescue => e
loading_exception(e, raise_exception)
end
Expand All @@ -68,7 +68,7 @@ def set_update_strategy(method_sym)

protected

def initialize_for_file_type
def initialize_for_file_type(**file_type_args)
# Subclass-specific initializer hook, invoked from #initialize with the
# file-type options. Intentionally empty here: a subclass overrides it
# only when it has extra state to set up, and file_type_args is ignored
# by this default implementation.
end

Expand Down Expand Up @@ -120,7 +120,7 @@ def loading_exception(e, raise_exception)
dfe.set_backtrace(e.backtrace)
raise dfe
else
@logger.warn("#{msg}, continuing with old dictionary", :dictionary_path => @dictionary_path)
logger.warn("#{msg}, continuing with old dictionary", :dictionary_path => @dictionary_path)
end
end
end
Expand Down
3 changes: 0 additions & 3 deletions lib/logstash/filters/dictionary/json_file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@ class JsonFile < File

protected

def initialize_for_file_type
end

def read_file_into_dictionary
content = IO.read(@dictionary_path, :mode => 'r:bom|utf-8')
@dictionary.update(LogStash::Json.load(content)) unless content.nil? || content.empty?
Expand Down
12 changes: 7 additions & 5 deletions lib/logstash/filters/dictionary/yaml_file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,20 @@ class YamlFile < File

protected

def initialize_for_file_type
def initialize_for_file_type(**file_type_args)
  # YAML-specific setup: the visitor that read_file_into_dictionary uses to
  # fill @dictionary, and a reusable Psych parser feeding a tree builder.
  #
  # file_type_args - only :yaml_code_point_limit is read here.
  @visitor = YamlVisitor.create
  @parser = Psych::Parser.new(Psych::TreeBuilder.new)

  # Only override the parser's limit when one was actually supplied, so a
  # YAML dictionary created without the option keeps Psych's own default
  # instead of being handed nil.
  limit = file_type_args[:yaml_code_point_limit]
  @parser.code_point_limit = limit unless limit.nil?
end

def read_file_into_dictionary
# low level YAML read that tries to create as
# few intermediate objects as possible
# this overwrites the value at key
@visitor.accept_with_dictionary(
@dictionary, Psych.parse_stream(
IO.read(@dictionary_path, :mode => 'r:bom|utf-8')
))
yaml_string = IO.read(@dictionary_path, :mode => 'r:bom|utf-8')
@parser.parse(yaml_string, @dictionary_path)
@visitor.accept_with_dictionary(@dictionary, @parser.handler.root)
end
end
end end end
26 changes: 25 additions & 1 deletion lib/logstash/filters/translate.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ class Translate < LogStash::Filters::Base
# as the original text, and the second column as the replacement.
config :dictionary_path, :validate => :path

# The maximum number of code points allowed in the YAML file in `dictionary_path`. Be aware that the byte limit depends on the encoding.
# SnakeYAML 1.33 has a default limit of 3MB. A YAML file over the limit throws an exception. JSON and CSV currently do not have such a limit.
# That limit can be too small for some use cases; set a bigger number in `yaml_dictionary_code_point_limit` to relax the restriction.
# The default value of this setting is 128MB for code points of size 1 byte.
config :yaml_dictionary_code_point_limit, :validate => :number

# When using a dictionary file, this setting will indicate how frequently
# (in seconds) logstash will check the dictionary file for updates.
config :refresh_interval, :validate => :number, :default => 300
Expand Down Expand Up @@ -180,8 +186,22 @@ def register
)
end

# check and set yaml code point limit
# set lookup dictionary
if @dictionary_path
@lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex)
if yaml_file?(@dictionary_path)
@yaml_dictionary_code_point_limit ||= 134_217_728

if @yaml_dictionary_code_point_limit <= 0
raise LogStash::ConfigurationError, "Please set a positive number in `yaml_dictionary_code_point_limit => #{@yaml_dictionary_code_point_limit}`."
else
@lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex, yaml_code_point_limit: @yaml_dictionary_code_point_limit)
end
elsif @yaml_dictionary_code_point_limit != nil
raise LogStash::ConfigurationError, "Please remove `yaml_dictionary_code_point_limit` for dictionary file in JSON or CSV format"
else
@lookup = Dictionary::File.create(@dictionary_path, @refresh_interval, @refresh_behaviour, @exact, @regex)
end
else
@lookup = Dictionary::Memory.new(@dictionary, @exact, @regex)
end
Expand Down Expand Up @@ -245,5 +265,9 @@ def filter(event)
@logger.error("Something went wrong when attempting to translate from dictionary", :exception => e, :source => @source, :event => event.to_hash)
end
end # def filter

# Tells whether +path+ names a YAML dictionary (".yml" or ".yaml" suffix).
#
# Uses Regexp#match? so the predicate returns a real boolean and does not
# build a MatchData the caller never inspects. The pattern is equivalent to
# the one Dictionary::File.create uses, so both agree on what counts as YAML.
#
# @param path [String] dictionary file path
# @return [Boolean] true when the path ends in .yml or .yaml
def yaml_file?(path)
  /\.ya?ml$/.match?(path)
end
end # class LogStash::Filters::Translate
end end
3 changes: 2 additions & 1 deletion logstash-filter-translate.gemspec
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Gem::Specification.new do |s|

s.name = 'logstash-filter-translate'
s.version = '3.4.0'
s.version = '3.4.1'
s.licenses = ['Apache License (2.0)']
s.summary = "Replaces field contents based on a hash or YAML file"
s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
Expand All @@ -25,6 +25,7 @@ Gem::Specification.new do |s|
s.add_runtime_dependency 'logstash-mixin-validator_support', '~> 1.0'
s.add_runtime_dependency 'logstash-mixin-deprecation_logger_support', '~> 1.0'
s.add_runtime_dependency "logstash-mixin-scheduler", '~> 1.0'
s.add_runtime_dependency "psych", ">= 5.1.0"

s.add_development_dependency 'logstash-devutils'
s.add_development_dependency 'rspec-sequencing'
Expand Down
8 changes: 4 additions & 4 deletions spec/filters/scheduling_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@
end
end
.then_after(1.2, "wait then translate again") do
subject.filter(event)
try(5) do
wait(0.1).for{event.get("[translation]")}.to eq("12"), "field [translation] did not eq '12'"
subject.filter(event)
wait(0.5).for{event.get("[translation]")}.to eq("12"), "field [translation] did not eq '12'"
end
end
.then("stop") do
Expand Down Expand Up @@ -88,9 +88,9 @@
end
end
.then_after(1.2, "wait then translate again") do
subject.filter(event)
try(5) do
wait(0.1).for{event.get("[translation]")}.to eq("22"), "field [translation] did not eq '22'"
subject.filter(event)
wait(0.5).for{event.get("[translation]")}.to eq("22"), "field [translation] did not eq '22'"
end
end
.then("stop") do
Expand Down
79 changes: 79 additions & 0 deletions spec/filters/translate_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,85 @@ def self.build_fixture_path(filename)
end
end

# Exercises the `yaml_dictionary_code_point_limit` option during `register`.
# NOTE(review): `subject` is defined outside this block — presumably the
# Translate filter built from `config`; confirm against the enclosing spec.
describe "when using a yml dictionary with code point limit" do
# Default config for the contexts below; each context supplies
# `dictionary_size` (or overrides `config` entirely).
let(:config) do
{
"source" => "status",
"target" => "translation",
"dictionary_path" => dictionary_path,
"yaml_dictionary_code_point_limit" => dictionary_size # the file is 18 bytes
}
end
let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.yml") }
let(:event) { LogStash::Event.new("status" => "a") }

# One code point below the fixture size: loading must fail at register time.
context "dictionary is over limit" do
let(:dictionary_size) { 17 }

it "raises exception" do
expect { subject.register }.to raise_error(/The incoming YAML document exceeds/)
end
end

# Limit equal to the fixture size: the dictionary loads and translates.
context "dictionary is within limit" do
let(:dictionary_size) { 18 }

it "returns the exact translation" do
subject.register
subject.filter(event)
expect(event.get("translation")).to eq(1)
end
end

# Non-positive limits are rejected as a configuration error.
context "limit set to zero" do
let(:dictionary_size) { 0 }

it "raises configuration exception" do
expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please set a positive number/)
end
end

# Without the option, register falls back to the 134_217_728 default.
context "limit is unset" do
let(:config) do
{
"source" => "status",
"target" => "translation",
"dictionary_path" => dictionary_path,
}
end

it "sets the limit to 128MB" do
subject.register
expect(subject.instance_variable_get(:@yaml_dictionary_code_point_limit)).to eq(134_217_728)
end
end

# The option is YAML-only: combining it with a JSON dictionary is an error.
context "dictionary is json and limit is set" do
let(:dictionary_path) { TranslateUtil.build_fixture_path("dict.json") }
let(:dictionary_size) { 100 }

it "raises configuration exception" do
expect { subject.register }.to raise_error(LogStash::ConfigurationError, /Please remove `yaml_dictionary_code_point_limit` for dictionary file in JSON or CSV format/)
end
end

# A JSON dictionary without the option keeps working as before.
context "dictionary is json and limit is unset" do
let(:config) do
{
"source" => "status",
"target" => "translation",
"dictionary_path" => TranslateUtil.build_fixture_path("dict.json"),
}
end

it "returns the exact translation" do
subject.register
subject.filter(event)
expect(event.get("translation")).to eq(10)
end
end
end

context "when using a map tagged yml file" do
let(:dictionary_path) { TranslateUtil.build_fixture_path("tag-map-dict.yml") }
let(:event) { LogStash::Event.new("status" => "six") }
Expand Down

0 comments on commit b3e4bd5

Please sign in to comment.