diff --git a/cpp/ebnf_script_creator.cc b/cpp/ebnf_script_creator.cc new file mode 100644 index 0000000..470409d --- /dev/null +++ b/cpp/ebnf_script_creator.cc @@ -0,0 +1,86 @@ +/*! + * Copyright (c) 2023 by Contributors + * \file ebnf_script_creator.cc + */ +#include "ebnf_script_creator.h" + +#include +#include +#include +#include + +#include "support/logging.h" + +namespace xgrammar { + +class EBNFScriptCreator::Impl { + public: + Impl() {} + + std::string AddRule(const std::string& rule_name_hint, const std::string& rule_body); + std::string GetScript(); + std::string GetRuleContent(const std::string& rule_name); + + private: + std::string GetRuleName(const std::string& rule_name_hint); + std::vector> rules_; + std::unordered_set rule_names_; + const int NAME_SUFFIX_MAXIMUM = 10000; +}; + +std::string EBNFScriptCreator::Impl::GetRuleName(const std::string& rule_name_hint) { + if (rule_names_.find(rule_name_hint) == rule_names_.end()) { + rule_names_.insert(rule_name_hint); + return rule_name_hint; + } + for (int i = 0; i < NAME_SUFFIX_MAXIMUM; ++i) { + std::string rule_name = rule_name_hint + "_" + std::to_string(i); + if (rule_names_.find(rule_name) == rule_names_.end()) { + rule_names_.insert(rule_name); + return rule_name; + } + } + XGRAMMAR_LOG(FATAL) << "Cannot find a unique rule name for " << rule_name_hint; +} + +std::string EBNFScriptCreator::Impl::AddRule( + const std::string& rule_name_hint, const std::string& rule_body +) { + std::string rule_name = GetRuleName(rule_name_hint); + rules_.emplace_back(rule_name, rule_body); + return rule_name; +} + +std::string EBNFScriptCreator::Impl::GetScript() { + std::string script = ""; + for (const auto& rule : rules_) { + script += rule.first + " ::= " + rule.second + "\n"; + } + return script; +} + +std::string EBNFScriptCreator::Impl::GetRuleContent(const std::string& rule_name) { + auto it = std::find_if(rules_.begin(), rules_.end(), [rule_name](const auto& rule) { + return rule.first == rule_name; + }); + if (it != 
rules_.end()) { + return it->second; + } + return ""; +} + +EBNFScriptCreator::EBNFScriptCreator(EmptyConstructorTag) : pimpl_(std::make_shared()) {} + +std::string EBNFScriptCreator::AddRule( + const std::string& rule_name_hint, const std::string& rule_body +) { + return pimpl_->AddRule(rule_name_hint, rule_body); +} + +std::string EBNFScriptCreator::GetScript() { return pimpl_->GetScript(); } + +std::string EBNFScriptCreator::GetRuleContent(const std::string& rule_name) { + return pimpl_->GetRuleContent(rule_name); +} + +} // namespace xgrammar diff --git a/cpp/ebnf_script_creator.h b/cpp/ebnf_script_creator.h new file mode 100644 index 0000000..c9eb842 --- /dev/null +++ b/cpp/ebnf_script_creator.h @@ -0,0 +1,53 @@ +/*! + * Copyright (c) 2024 by Contributors + * \file xgrammar/ebnf_script_creator.h + * \brief The header for creating EBNF scripts. + */ + +#ifndef XGRAMMAR_EBNF_SCRIPT_CREATOR_H_ +#define XGRAMMAR_EBNF_SCRIPT_CREATOR_H_ + +#include + +#include + +namespace xgrammar { + +/*! + * \brief A class for creating EBNF grammar scripts. + * + * This class helps build EBNF (Extended Backus-Naur Form) grammar scripts + * by managing rules and their content. + */ +class EBNFScriptCreator { + public: + /*! \brief Constructor using empty constructor tag pattern */ + EBNFScriptCreator(EmptyConstructorTag); + + /*! + * \brief Adds a new rule to the grammar + * \param rule_name_hint Suggested name for the rule + * \param rule_body The EBNF content/definition of the rule + * \return The actual name assigned to the rule + */ + std::string AddRule(const std::string& rule_name_hint, const std::string& rule_body); + + /*! + * \brief Gets the complete EBNF grammar script + * \return The full EBNF grammar script as a string + */ + std::string GetScript(); + + /*! 
+ * \brief Retrieves the content/definition of a specific rule + * \param rule_name The name of the rule to look up + * \return The EBNF content/definition of the specified rule, or an empty string if not found + */ + std::string GetRuleContent(const std::string& rule_name); + + XGRAMMAR_DEFINE_PIMPL_METHODS(EBNFScriptCreator); +}; + +} // namespace xgrammar + +#endif // XGRAMMAR_EBNF_SCRIPT_CREATOR_H_ diff --git a/cpp/json_schema_converter.cc b/cpp/json_schema_converter.cc index b628b10..89c3098 100644 --- a/cpp/json_schema_converter.cc +++ b/cpp/json_schema_converter.cc @@ -17,6 +17,7 @@ #include #include +#include "ebnf_script_creator.h" #include "regex_converter.h" #include "support/logging.h" @@ -336,6 +337,8 @@ class JSONSchemaConverter { const std::string& rule_name ); + // The EBNF script creator + EBNFScriptCreator ebnf_script_creator_{EmptyConstructorTag{}}; // The indent manager to get separators std::optional indentManager_; // The root JSON schema @@ -346,8 +349,6 @@ class JSONSchemaConverter { bool allow_empty_; // The colon separator std::string colon_pattern_; - // The rules constructed - std::vector> rules_; // The cache for basic rules. Mapping from the key of schema returned by GetSchemaCacheIndex() // to the basic rule name. 
std::map basic_rules_cache_; @@ -386,11 +387,7 @@ JSONSchemaConverter::JSONSchemaConverter( std::string JSONSchemaConverter::Convert() { CreateRuleFromSchema(json_schema_, "root"); - std::string res; - for (auto& rule : rules_) { - res += rule.first + " ::= " + rule.second + "\n"; - } - return res; + return ebnf_script_creator_.GetScript(); } void JSONSchemaConverter::AddBasicRules() { @@ -433,14 +430,14 @@ void JSONSchemaConverter::AddBasicRules() { } void JSONSchemaConverter::AddHelperRules() { - rules_.push_back(std::make_pair( + ebnf_script_creator_.AddRule( kBasicEscape, "[\"\\\\/bfnrt] | \"u\" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]" - )); - rules_.push_back(std::make_pair( + ); + ebnf_script_creator_.AddRule( kBasicStringSub, "(\"\\\"\" | [^\"\\\\\\r\\n] " + kBasicStringSub + " | \"\\\\\" " + kBasicEscape + " " + kBasicStringSub + ") (= [ \\n\\t]* [,}\\]:])" - )); + ); } void JSONSchemaConverter::CreateBasicRule(const picojson::value& schema, const std::string& name) { @@ -485,8 +482,9 @@ std::string JSONSchemaConverter::CreateRuleFromSchema( return basic_rules_cache_[idx]; } - rules_.push_back(std::make_pair(rule_name_hint, VisitSchema(schema, rule_name_hint))); - return rule_name_hint; + std::string rule_name = + ebnf_script_creator_.AddRule(rule_name_hint, VisitSchema(schema, rule_name_hint)); + return rule_name; } std::string JSONSchemaConverter::GetSchemaCacheIndex(const picojson::value& schema) { @@ -1036,7 +1034,7 @@ std::string JSONSchemaConverter::GetPartialRuleForPropertiesAllOptional( std::string last_rule_body = "(" + mid_sep + " " + additional_prop_pattern + ")*"; std::string last_rule_name = rule_name + "_part_" + std::to_string(static_cast(properties.size()) - 1); - rules_.push_back(std::make_pair(last_rule_name, last_rule_body)); + last_rule_name = ebnf_script_creator_.AddRule(last_rule_name, last_rule_body); rule_names.back() = last_rule_name; } else { rule_names.back() = "\"\""; @@ -1049,7 +1047,7 @@ std::string 
JSONSchemaConverter::GetPartialRuleForPropertiesAllOptional( std::string cur_rule_body = last_rule_name + " | " + mid_sep + " " + prop_pattern + " " + last_rule_name; std::string cur_rule_name = rule_name + "_part_" + std::to_string(i); - rules_.push_back(std::make_pair(cur_rule_name, cur_rule_body)); + cur_rule_name = ebnf_script_creator_.AddRule(cur_rule_name, cur_rule_body); rule_names[i] = cur_rule_name; } diff --git a/include/xgrammar/compiler.h b/include/xgrammar/compiler.h index b2e352b..a89e28e 100644 --- a/include/xgrammar/compiler.h +++ b/include/xgrammar/compiler.h @@ -29,17 +29,17 @@ class CompiledGrammar { }; /*! - * \brief A cache to get the grammar state compiled grammar for grammar or schema. This class avoids + * \brief A cache to get the compiled grammar for grammar or schema. This class avoids * redundant preprocessing of the grammar or schema when constructing a CompiledGrammar. * \note This class is associated with a vocabulary when constructed. The vocabulary is used to - * create every grammar state compiled grammar. If multiple toke tables are used to create init + * create every compiled grammar. If multiple token tables are used to create init * contexts, an instance of this class for each vocabulary should be created. */ class GrammarCompiler { public: /*! * \brief Construct a GrammarCompiler with a vocabulary. This class will always - * create grammar state compiled grammars with this vocabulary. + * create compiled grammars with this vocabulary. * \param decoded_vocab The vocabulary that the grammar will use. */ GrammarCompiler( diff --git a/include/xgrammar/object.h b/include/xgrammar/object.h index 35925a4..4438d39 100644 --- a/include/xgrammar/object.h +++ b/include/xgrammar/object.h @@ -12,6 +12,15 @@ namespace xgrammar { +/*! + * \brief A tag type for empty constructor. 
+ * + * Since XGRAMMAR_DEFINE_PIMPL_METHODS already occupies the default constructor to + * construct a null object, this tag is used to define an empty constructor for + * the object. + */ +struct EmptyConstructorTag {}; + #define XGRAMMAR_DEFINE_PIMPL_METHODS(TypeName) \ public: \ class Impl; \ @@ -30,7 +39,7 @@ namespace xgrammar { const Impl* operator->() const { return pimpl_.get(); } \ \ private: \ - std::shared_ptr pimpl_ + std::shared_ptr pimpl_; } // namespace xgrammar