Skip to content

Commit

Permalink
apply review comments 2
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Nov 8, 2023
1 parent 03752c5 commit e437e51
Show file tree
Hide file tree
Showing 12 changed files with 33 additions and 90 deletions.
4 changes: 1 addition & 3 deletions src/common/snippets/include/snippets/generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,8 @@ class Generator;
class LoweringResult {
friend class Generator;
// Some emitters rely on other precompiled kernels.
// We need to keep the pointers to such emitters alive, so the kernels would still be accessible at runtime.
// We need to keep the pointers to such emitters alive, so the kernels or nodes would still be accessible at runtime.
std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};
// For perf count nodes, kernel will read/write these nodes, so should be alive in execution.
std::vector<std::shared_ptr<ov::Node>> m_saved_nodes{};

public:
std::shared_ptr<CompiledSnippet> compiled_snippet = nullptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ class LinearIR::ExpressionFactory {
return create(loop_begin, params...);
} else if (const auto loop_end = ov::as_type_ptr<op::LoopEnd>(n)) {
return create(loop_end, params...);
} else if (const auto pc_begin = ov::as_type_ptr<op::PerfCountBegin>(n)) {
return create(pc_begin, params...);
} else if (const auto pc_end = ov::as_type_ptr<op::PerfCountEnd>(n)) {
return create(pc_end, params...);
}
return create(n, params...);
}
Expand All @@ -52,8 +48,6 @@ class LinearIR::ExpressionFactory {
static ExpressionPtr create(const std::shared_ptr<op::LoopBegin>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
static ExpressionPtr create(const std::shared_ptr<op::LoopEnd>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
static ExpressionPtr create(const std::shared_ptr<ov::Node>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
static ExpressionPtr create(const std::shared_ptr<op::PerfCountBegin>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
static ExpressionPtr create(const std::shared_ptr<op::PerfCountEnd>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);

// Creates inputs for expression using parent output port connectors
static void create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace pass {

/**
* @interface InsertPerfCount
* @brief Insert PerfCountBegin node after last paramter and insert PerfCountEnd node before first result.
* @brief Insert PerfCountBegin node after last parameter and insert PerfCountEnd node before first result.
* This is an illustrative transformation that enables perf count in snippets.
* Developers can modify it to insert perf count pairs around any sequence of nodes of interest.
* @ingroup snippets
Expand Down
1 change: 0 additions & 1 deletion src/common/snippets/include/snippets/op/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ class Subgraph : public ov::op::util::SubGraphOp {
LoweringResult& lowering_result,
const lowered::pass::PassPipeline& backend_passes_pre_common,
const lowered::pass::PassPipeline& backend_passes_post_common) const;
void perf_count_transformations(lowered::LinearIR& linear_ir) const;
void init_config();
// Count of Subgraph virtual ports:
// - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition)
Expand Down
12 changes: 2 additions & 10 deletions src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,15 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
}
OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")

// Note: some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
// 1. some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
// 2. perf count node as field of emitter should be alive at runtime.
if (linear_ir.get_config().m_save_expressions) {
for (const auto& expr : linear_ir) {
const auto& emitter = expr->get_emitter();
if (uses_precompiled_kernel(emitter))
result.m_saved_emitters.emplace_back(emitter);
}
}
// perf count node should be alive in execution.
if (linear_ir.get_config().perf_count_mode != lowered::PerfCountMode::Disabled) {
for (const auto& expr : linear_ir) {
const auto& node = expr->get_node();
if (should_node_alive_in_execution(node)) {
result.m_saved_nodes.emplace_back(node);
}
}
}
result.compiled_snippet = target->get_snippet();
}

Expand Down
35 changes: 3 additions & 32 deletions src/common/snippets/src/lowered/expression_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,37 +117,6 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<op::Loop
return expr;
}

// Builds the expression for a PerfCountBegin node.
// The node must not be given any input port connectors; this is asserted before anything is created.
ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<op::PerfCountBegin>& n,
                                                  const std::vector<PortConnectorPtr>& inputs,
                                                  const LinearIR& linear_ir) {
    OPENVINO_ASSERT(inputs.empty(), "PerfCountBegin cannot have inputs");

    const auto& shape_infer_factory = linear_ir.m_shape_infer_factory;
    auto pc_begin_expr = std::make_shared<Expression>(Expression(n, shape_infer_factory));

    // Same sequence as before: wire the (empty) inputs, create the outputs, then validate.
    init_expression_inputs(pc_begin_expr, inputs);
    create_expression_outputs(pc_begin_expr);
    pc_begin_expr->validate();
    return pc_begin_expr;
}

// Builds the expression for a PerfCountEnd node.
//
// All inputs except the last one receive a default-constructed PortDescriptor. The last input must be
// produced by a PerfCountBegin expression, and its port descriptor is cloned from that source port.
// The output port descriptors are cleared before validation.
//
// Asserts (OPENVINO_ASSERT) if `inputs` is empty or if the last input does not come from PerfCountBegin.
ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<op::PerfCountEnd>& n,
                                                  const std::vector<PortConnectorPtr>& inputs,
                                                  const LinearIR& linear_ir) {
    // Without this guard an empty `inputs` makes `inputs.size() - 1` wrap around (size_t)
    // and `inputs.back()` is undefined behaviour.
    OPENVINO_ASSERT(!inputs.empty(), "PerfCountEnd expression expects PerfCountBegin on last input");
    const size_t last_idx = inputs.size() - 1;

    auto expr = std::shared_ptr<Expression>(new Expression(n, linear_ir.m_shape_infer_factory));
    expr->m_input_port_descriptors.resize(inputs.size(), nullptr);
    for (size_t i = 0; i < last_idx; ++i) {
        expr->m_input_port_descriptors[i] = std::make_shared<PortDescriptor>();
    }

    const auto& last_input = inputs.back()->get_source();
    OPENVINO_ASSERT(ov::is_type<op::PerfCountBegin>(last_input.get_expr()->get_node()),
                    "PerfCountEnd expression expects PerfCountBegin on last input");
    expr->m_input_port_descriptors[last_idx] = last_input.get_descriptor_ptr()->clone();
    init_expression_inputs(expr, inputs);

    // The PerfCountEnd node does not need an output port (by the meaning of the node), but each node in ngraph
    // must have at least one output. The port descriptors are created automatically in the constructor,
    // so the output ports are cleared manually here.
    expr->m_output_port_descriptors.clear();
    expr->validate();
    return expr;
}

ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<ov::Node>& n,
const std::vector<PortConnectorPtr>& inputs,
const LinearIR& linear_ir) {
Expand All @@ -160,7 +129,9 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<ov::Node
expr->validate();
// todo: here we blindly synchronize input shapes from parent and child. Remove this when shapes will be stored in
// port connector itself
if (linear_ir.m_shape_infer_factory)
if (linear_ir.m_shape_infer_factory &&
!ov::is_type<op::PerfCountBeginBase>(n) &&
!ov::is_type<op::PerfCountEndBase>(n))
expr->updateShapes();
return expr;
}
Expand Down
13 changes: 6 additions & 7 deletions src/common/snippets/src/op/perf_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
}

void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin dosen't expect any inputs");
NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
set_output_type(0, element::f32, {});
}

Expand All @@ -34,7 +34,7 @@ PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(a

void PerfCountEndBase::validate_and_infer_types() {
NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
const auto pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
const auto& pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
set_output_type(0, element::f32, {});
}
Expand Down Expand Up @@ -87,14 +87,14 @@ void PerfCountEnd::output_perf_count() {
auto iterator_iter = iteration.begin();
auto iterator_acc = accumulation.begin();
int t_num = 0;
std::vector<uint64_t> avg_list;
std::string friendly_name = get_friendly_name();
uint64_t avg_max = 0;
std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:"<< std::endl;
for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
const auto iter = *iterator_iter;
const auto acc = *iterator_acc;
uint64_t avg = iter == 0 ? 0 : acc / iter;
avg_list.push_back(avg);
if (avg > avg_max)
avg_max = avg;
std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
t_num++;
}
Expand All @@ -107,8 +107,7 @@ void PerfCountEnd::output_perf_count() {
uint64_t acc_max = accumulation.combine(BinaryFunc);
std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
// max avg
auto avg_max = std::max_element(avg_list.begin(), avg_list.end(), BinaryFunc);
std::cout << "max avg time:" << *avg_max << "ns" << std::endl;
std::cout << "max avg time:" << avg_max << "ns" << std::endl;
}

} // namespace op
Expand Down
15 changes: 4 additions & 11 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,8 @@ VectorDims Subgraph::infer_master_shape() {
std::shared_ptr<lowered::LinearIR>
Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
lowered::Config lowering_config;
lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops;
lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
(lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
lowering_config.m_loop_depth = tileRank;
lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
Expand Down Expand Up @@ -469,15 +470,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
}

// Runs the lowered InsertPerfCount pass over the given linear IR through a dedicated pass pipeline.
void Subgraph::perf_count_transformations(lowered::LinearIR& linear_ir) const {
    INTERNAL_OP_SCOPE(Subgraph);
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::perf_count_transformations")

    // The pipeline holds a single pass: InsertPerfCount.
    lowered::pass::PassPipeline pipeline;
    pipeline.register_pass<lowered::pass::InsertPerfCount>();
    pipeline.run(linear_ir);
}

snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_shapes,
const std::vector<ov::element::Type>& input_precisions,
const std::vector<ov::element::Type>& output_precisions,
Expand Down Expand Up @@ -506,7 +498,8 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
LoweringResult lowering_result;
control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
perf_count_transformations(linear_ir);
lowered::pass::InsertPerfCount perf_count_pass;
perf_count_pass.run(linear_ir);
}
m_generator->generate(linear_ir, lowering_result, compile_params);

Expand Down
8 changes: 6 additions & 2 deletions src/plugins/intel_cpu/src/emitters/x64/cpu_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type
}
bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr<snippets::Emitter>& e) const {
return std::dynamic_pointer_cast<intel_cpu::BrgemmEmitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e);
std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_start_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_end_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_start_emitter>(e) ||
std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_end_emitter>(e);
}
} // namespace ov
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ namespace intel_cpu {

jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
auto start_op = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
m_current_count = &(start_op->start_count);
m_start_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
}

size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
Expand All @@ -37,7 +36,7 @@ void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in
h->shl(h->rdx, 0x20); // shift to higher half of rdx 0x20(32)
h->or_(h->rdx, h->rax); // rdx has current tsc

h->mov(h->rax, reinterpret_cast<size_t>(m_current_count));
h->mov(h->rax, reinterpret_cast<size_t>(&m_start_node->start_count));
h->mov(qword[h->rax], h->rdx);

h->pop(h->rdx);
Expand All @@ -47,10 +46,7 @@ void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in
///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
auto end_op = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
m_accumulation = &(end_op->accumulation);
m_iteration = &(end_op->iteration);
m_start_count = &(end_op->get_pc_begin()->start_count);
m_end_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
}

size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
Expand All @@ -68,16 +64,16 @@ void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector<size_t> &in_i
h->or_(h->rdx, h->rax); // rdx has current tsc

// tsc duration
h->mov(h->rax, reinterpret_cast<size_t>(m_start_count));
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->get_pc_begin()->start_count));
h->sub(h->rdx, qword[h->rax]); // rdx has tsc duration

// m_accumulation = m_accumulation + tsc duration
h->mov(h->rax, reinterpret_cast<size_t>(m_accumulation));
// accumulation = accumulation + tsc duration
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->accumulation));
h->add(h->rdx, qword[h->rax]);
h->mov(qword[h->rax], h->rdx);

// iteration++
h->mov(h->rax, reinterpret_cast<size_t>(m_iteration));
h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->iteration));
h->mov(h->rdx, qword[h->rax]);
h->add(h->rdx, 0x01);
h->mov(qword[h->rax], h->rdx);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class jit_perf_count_rdtsc_start_emitter : public jit_emitter {

private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
mutable uint64_t* m_current_count = nullptr;
std::shared_ptr<ov::intel_cpu::PerfCountRdtscBegin> m_start_node = nullptr;
};

class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
Expand All @@ -30,9 +30,7 @@ class jit_perf_count_rdtsc_end_emitter : public jit_emitter {

private:
void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
mutable uint64_t* m_start_count = nullptr;
mutable uint64_t* m_accumulation = nullptr;
mutable uint32_t* m_iteration = nullptr;
std::shared_ptr<ov::intel_cpu::PerfCountRdtscEnd> m_end_node = nullptr;
};

} // namespace intel_cpu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ std::shared_ptr<Node> PerfCountRdtscEnd::clone_with_new_inputs(const OutputVecto

std::shared_ptr<PerfCountRdtscBegin> PerfCountRdtscEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountRdtscBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
if (!pc_begin)
throw std::invalid_argument("PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
return pc_begin;
}

0 comments on commit e437e51

Please sign in to comment.