
Commit

TMP
ilya-lavrenov committed May 16, 2024
1 parent 7ac0a66 · commit 7a4e1a7
Showing 5 changed files with 17 additions and 15 deletions.
8 changes: 4 additions & 4 deletions text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
@@ -95,9 +95,9 @@ void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_

ov::Shape input_shape = input_ids.get_shape();

- ov::Tensor position_ids = request.get_tensor("position_ids");
- position_ids.set_shape(input_shape);
- initialize_position_ids(position_ids, attention_mask);
+ // ov::Tensor position_ids = request.get_tensor("position_ids");
+ // position_ids.set_shape(input_shape);
+ // initialize_position_ids(position_ids, attention_mask);

ov::Tensor beam_idx = request.get_tensor("beam_idx");
beam_idx.set_shape({input_shape.at(0)});
@@ -209,7 +209,7 @@ int main(int argc, char* argv[]) try {
lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()});
// Set auxiliary inputs
set_attention_mask(lm.get_tensor("attention_mask"), next_beams);
- set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask"));
+ // set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask"));
}

for (const std::vector<std::vector<Beam>>& prompt_group : finalize(std::move(group_beam_searcher))) {
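For context on the position_ids calls commented out in beam_search_causal_lm.cpp above: position_ids for a causal LM are usually derived from the attention mask as a per-row running count of attended tokens. The helper below is a minimal sketch of that idea, not the repository's actual initialize_position_ids; it assumes both tensors are i64 with shape [batch, seq_len].

#include <openvino/openvino.hpp>

// Hypothetical sketch: fill position_ids with a running count of attended tokens
// per batch row (cumulative sum of the attention mask minus one, clamped at 0).
void initialize_position_ids_sketch(ov::Tensor position_ids, ov::Tensor attention_mask) {
    const ov::Shape shape = attention_mask.get_shape();  // assumed [batch, seq_len]
    const size_t batch = shape.at(0), seq_len = shape.at(1);
    const int64_t* mask = attention_mask.data<int64_t>();
    int64_t* positions = position_ids.data<int64_t>();
    for (size_t b = 0; b < batch; ++b) {
        int64_t attended = 0;  // attended tokens seen so far in this row
        for (size_t t = 0; t < seq_len; ++t) {
            attended += mask[b * seq_len + t];
            positions[b * seq_len + t] = attended > 0 ? attended - 1 : 0;
        }
    }
}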
@@ -243,8 +243,11 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
OPENVINO_ASSERT(running_sequences.size() == 1);

int64_t sampled_token_id = _greedy_sample(sequence_group_logits);
- // in case of greedy search we always have a single parent sequence to sample from
- running_sequences[0]->append_token(sampled_token_id, sequence_group_logits.data<const float>()[sampled_token_id]);
+
+ // if (sampled_token_id != sampling_params.eos_token_id) {
+ // in case of greedy search we always have a single parent sequence to sample from
+ running_sequences[0]->append_token(sampled_token_id, sequence_group_logits.data<const float>()[sampled_token_id]);
+ // }

if (sampling_params.max_new_tokens == running_sequences[0]->get_generated_len() ||
sampled_token_id == sampling_params.eos_token_id && !sampling_params.ignore_eos) {
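For reference, _greedy_sample above amounts to an argmax over the logits of the last generated position. A minimal sketch under the assumption that the logits are a contiguous float buffer with vocab_size entries; the free function below is illustrative, not the class method used in this file.

#include <cstddef>
#include <cstdint>

// Hypothetical greedy sampling: return the index of the largest logit.
int64_t greedy_sample_sketch(const float* logits, size_t vocab_size) {
    size_t best = 0;
    for (size_t i = 1; i < vocab_size; ++i) {
        if (logits[i] > logits[best])
            best = i;
    }
    return static_cast<int64_t>(best);
}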
@@ -269,7 +272,6 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
}
}

-
// current algorithm already adds new tokens to running sequences and
m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output);

@@ -37,9 +37,9 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
]
generation_configs = [
get_greedy(),
- get_beam_search(),
get_greedy(),
- get_beam_search()
+ get_greedy(),
+ get_greedy()
]
return (prompts, generation_configs)

@@ -104,7 +104,7 @@ def run_hugging_face(
tmp_path: Path
) -> Tuple[List[GenerationResult], str]:
hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
- model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \
+ model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=True) if use_optimum else \
AutoModelForCausalLM.from_pretrained(model_id)
generation_results: List[GenerationResult] = []
model_path : Path = tmp_path / model_id
@@ -122,7 +122,7 @@ def run_hugging_face(
inputs = hf_tokenizer(prompt, return_tensors="pt")
prompt_len = len(inputs['input_ids'][0])
generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
- all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
+ all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences])

generation_result = GenerationResult()
generation_result.m_generation_ids = all_text_batch
@@ -83,5 +83,5 @@ rinna/bilingual-gpt-neox-4b
google/pegasus-big_patent
google/pegasus-large
# optimum-intel: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions: openchat/openchat_3.5
- pankajmathur/orca_mini_3b
- togethercomputer/RedPajama-INCITE-Chat-3B-v1
+ # CPU: head size must be multiple of 16, current: 100: pankajmathur/orca_mini_3b
+ # passed: togethercomputer/RedPajama-INCITE-Chat-3B-v1
4 changes: 2 additions & 2 deletions text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -86,9 +86,9 @@ struct Parameters {
std::vector<std::vector<int64_t>> prompts;
int64_t eos_token;
size_t n_groups = 3;
- size_t group_size = 5;
+ size_t group_size = 2;
float diversity_penalty = 1.0;
- size_t max_new_tokens = 20;
+ size_t max_new_tokens = 30;
StopCriteria stop_criteria = StopCriteria::heuristic;
float length_penalty = 1.0;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
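A usage note on the group_beam_searcher.hpp defaults changed above: in group beam search the total number of beams tracked per prompt is n_groups * group_size, so the change reduces the search from 3 * 5 = 15 beams to 3 * 2 = 6 while raising max_new_tokens from 20 to 30. A hypothetical snippet built on the struct, with an assumed eos_token value:

#include "group_beam_searcher.hpp"

// Hypothetical usage of the changed Parameters defaults.
int main() {
    Parameters params;
    params.eos_token = 2;  // assumed value; depends on the tokenizer
    const size_t total_beams = params.n_groups * params.group_size;  // 3 * 2 = 6 with the new defaults
    return total_beams == 6 ? 0 : 1;
}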
