feat: prepare llama3.2 embedded
louis030195 committed Sep 27, 2024
1 parent 660cf63 commit af16042
Showing 7 changed files with 472 additions and 295 deletions.
11 changes: 11 additions & 0 deletions screenpipe-core/examples/llama.rs
@@ -0,0 +1,11 @@
use anyhow::Result;
use screenpipe_core::llama::LlamaInitConfig;
use screenpipe_core::llama_stream_text;

fn main() -> Result<()> {
    llama_stream_text(LlamaInitConfig::default(), |text| {
        println!("{}", text);
        Ok(())
    })?;
    Ok(())
}
88 changes: 3 additions & 85 deletions screenpipe-core/src/google.rs
@@ -4,95 +4,13 @@ mod google_module {
use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::generation::{LogitsProcessor, Sampling};
use candle_transformers::models::gemma::{Config as Config1, Model as Model1};
use candle_transformers::models::gemma2::{Config as Config2, Model as Model2};
use candle_transformers::models::gemma::Model as Model1;
use candle_transformers::models::gemma2::Model as Model2;
use hf_hub::api::sync::ApiBuilder;
use hf_hub::{Repo, RepoType};
use tokenizers::Tokenizer;

use crate::hub_load_safetensors;

pub struct TokenOutputStream {
tokenizer: tokenizers::Tokenizer,
tokens: Vec<u32>,
prev_index: usize,
current_index: usize,
}

impl TokenOutputStream {
pub fn new(tokenizer: tokenizers::Tokenizer) -> Self {
Self {
tokenizer,
tokens: Vec::new(),
prev_index: 0,
current_index: 0,
}
}

pub fn into_inner(self) -> tokenizers::Tokenizer {
self.tokenizer
}

fn decode(&self, tokens: &[u32]) -> Result<String> {
match self.tokenizer.decode(tokens, true) {
Ok(str) => Ok(str),
Err(err) => anyhow::bail!("cannot decode: {err}"),
}
}

pub fn next_token(&mut self, token: u32) -> Result<Option<String>> {
let prev_text = if self.tokens.is_empty() {
String::new()
} else {
let tokens = &self.tokens[self.prev_index..self.current_index];
self.decode(tokens)?
};
self.tokens.push(token);
let text = self.decode(&self.tokens[self.prev_index..])?;
if text.len() > prev_text.len() && text.chars().last().unwrap().is_alphanumeric() {
let text = text.split_at(prev_text.len());
self.prev_index = self.current_index;
self.current_index = self.tokens.len();
Ok(Some(text.1.to_string()))
} else {
Ok(None)
}
}

pub fn decode_rest(&self) -> Result<Option<String>> {
let prev_text = if self.tokens.is_empty() {
String::new()
} else {
let tokens = &self.tokens[self.prev_index..self.current_index];
self.decode(tokens)?
};
let text = self.decode(&self.tokens[self.prev_index..])?;
if text.len() > prev_text.len() {
let text = text.split_at(prev_text.len());
Ok(Some(text.1.to_string()))
} else {
Ok(None)
}
}

pub fn decode_all(&self) -> Result<String> {
self.decode(&self.tokens)
}

pub fn get_token(&self, token_s: &str) -> Option<u32> {
self.tokenizer.get_vocab(true).get(token_s).copied()
}

pub fn tokenizer(&self) -> &tokenizers::Tokenizer {
&self.tokenizer
}

pub fn clear(&mut self) {
self.tokens.clear();
self.prev_index = 0;
self.current_index = 0;
}
}
use crate::{hub_load_safetensors, TokenOutputStream};

enum Model {
V1(Model1),
8 changes: 8 additions & 0 deletions screenpipe-core/src/lib.rs
@@ -5,13 +5,21 @@ pub mod llm;
#[cfg(feature = "llm")]
pub use llm::*;
#[cfg(feature = "llm")]
pub mod phi;
#[cfg(feature = "llm")]
pub use phi::*;
#[cfg(feature = "llm")]
pub mod google;
#[cfg(feature = "llm")]
pub use google::*;
#[cfg(feature = "llm")]
pub mod mistral;
#[cfg(feature = "llm")]
pub use mistral::*;
#[cfg(feature = "llm")]
pub mod llama;
#[cfg(feature = "llm")]
pub use llama::*;
#[cfg(feature = "pipes")]
pub mod pipes;
#[cfg(feature = "pipes")]
242 changes: 242 additions & 0 deletions screenpipe-core/src/llama.rs
@@ -0,0 +1,242 @@
#[cfg(feature = "llm")]
mod llm_module {

use anyhow::{Error as E, Result};

use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::generation::{LogitsProcessor, Sampling};
use hf_hub::{
api::sync::{Api, ApiBuilder},
Repo, RepoType,
};

use candle_transformers::models::llama as model;
use model::{Llama, LlamaConfig};
use tokenizers::Tokenizer;

use crate::{hub_load_safetensors, TokenOutputStream};

const EOS_TOKEN: &str = "</s>";

#[derive(Clone, Debug, Copy, PartialEq, Eq)]
enum Which {
V1,
V2,
V3,
V31,
V3Instruct,
V31Instruct,
V32_1b,
V32_1bInstruct,
V32_3b,
V32_3bInstruct,
Solar10_7B,
TinyLlama1_1BChat,
}

#[derive(Debug)]
pub struct LlamaInitConfig {
/// The temperature used to generate samples.
temperature: f64,

/// Nucleus sampling probability cutoff.
top_p: Option<f64>,

/// Only sample among the top K samples.
top_k: Option<usize>,

/// The seed to use when generating random samples.
seed: u64,

/// The length of the sample to generate (in tokens).
sample_len: usize,

/// Disable the key-value cache.
no_kv_cache: bool,

/// The initial prompt.
prompt: Option<String>,

/// Use a different dtype than f16.
dtype: Option<String>,

model_id: Option<String>,

revision: Option<String>,

/// The model size to use.
which: Which,

use_flash_attn: bool,

/// Penalty to be applied for repeating tokens, 1. means no penalty.
repeat_penalty: f32,

/// The context size to consider for the repeat penalty.
repeat_last_n: usize,
}

impl Default for LlamaInitConfig {
fn default() -> Self {
Self {
use_flash_attn: false,
prompt: None,
temperature: 0.8,
top_p: Some(0.95),
top_k: None,
seed: 299792458,
sample_len: 100,
which: Which::V32_3bInstruct,
model_id: None,
revision: None,
repeat_penalty: 1.1,
repeat_last_n: 128,
no_kv_cache: false,
dtype: None,
}
}
}

pub fn llama_stream_text<F>(args: LlamaInitConfig, mut callback: F) -> Result<()>
where
F: FnMut(String) -> Result<()>,
{
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature, args.repeat_penalty, args.repeat_last_n
);

let start = std::time::Instant::now();
let api = ApiBuilder::new()
// NOTE: hardcoded Hugging Face token, intentionally left in place for now.
.with_token(Some("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".to_string()))
.build()?;
let model_id = args.model_id.unwrap_or_else(|| match args.which {
Which::V1 => "Narsil/amall-7b".to_string(),
Which::V2 => "meta-llama/Llama-2-7b-hf".to_string(),
Which::V3 => "meta-llama/Meta-Llama-3-8B".to_string(),
Which::V3Instruct => "meta-llama/Meta-Llama-3-8B-Instruct".to_string(),
Which::V31 => "meta-llama/Meta-Llama-3.1-8B".to_string(),
Which::V31Instruct => "meta-llama/Meta-Llama-3.1-8B-Instruct".to_string(),
Which::V32_1b => "meta-llama/Llama-3.2-1B".to_string(),
Which::V32_1bInstruct => "meta-llama/Llama-3.2-1B-Instruct".to_string(),
Which::V32_3b => "meta-llama/Llama-3.2-3B".to_string(),
Which::V32_3bInstruct => "meta-llama/Llama-3.2-3B-Instruct".to_string(),
Which::Solar10_7B => "upstage/SOLAR-10.7B-v1.0".to_string(),
Which::TinyLlama1_1BChat => "TinyLlama/TinyLlama-1.1B-Chat-v1.0".to_string(),
});
println!("loading the model weights from {model_id}");
let revision = args.revision.unwrap_or("main".to_string());
let api = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));

let tokenizer_filename = api.get("tokenizer.json")?;
let config_filename = api.get("config.json")?;
let config: LlamaConfig = serde_json::from_slice(&std::fs::read(config_filename)?)?;
let config = config.into_config(args.use_flash_attn);

let filenames = hub_load_safetensors(&api, "model.safetensors.index.json")?;
println!("retrieved the files in {:?}", start.elapsed());

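// Pick an accelerator: prefer Metal, then CUDA, and fall back to the CPU.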
let device = Device::new_metal(0).unwrap_or(Device::new_cuda(0).unwrap_or(Device::Cpu));

let dtype = match args.dtype.as_deref() {
Some("f16") => DType::F16,
Some("bf16") => DType::BF16,
Some("f32") => DType::F32,
Some(dtype) => anyhow::bail!("Unsupported dtype {dtype}"),
None => DType::F16,
};

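// Load the model: build the kv cache, memory-map the safetensors shards (the files must stay unchanged while mapped, hence the unsafe), and instantiate the weights.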
let start = std::time::Instant::now();
let mut cache = model::Cache::new(!args.no_kv_cache, dtype, &config, &device)?;
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
let llama = Llama::load(vb, &config)?;
println!("loaded the model in {:?}", start.elapsed());

let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let mut tokens = tokenizer
// fall back to an empty prompt instead of panicking when `prompt` is `None`
.encode(args.prompt.unwrap_or_default(), true)
.map_err(E::msg)?
.get_ids()
.to_vec();

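// Feed the prompt through the streaming detokenizer so complete words are emitted as soon as they are available.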
let mut tokenizer = TokenOutputStream::new(tokenizer);
for &t in tokens.iter() {
if let Some(t) = tokenizer.next_token(t)? {
callback(t)?;
}
}

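// Select the sampling strategy from temperature, top-k and top-p (greedy argmax when temperature <= 0).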
let mut logits_processor = {
let temperature = args.temperature;
let sampling = if temperature <= 0. {
Sampling::ArgMax
} else {
match (args.top_k, args.top_p) {
(None, None) => Sampling::All { temperature },
(Some(k), None) => Sampling::TopK { k, temperature },
(None, Some(p)) => Sampling::TopP { p, temperature },
(Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
}
};
LogitsProcessor::from_sampling(args.seed, sampling)
};

let mut index_pos = 0;
let mut token_generated = 0;
let start_gen = std::time::Instant::now();
for index in 0..args.sample_len {
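// With the kv cache enabled, only the most recent token is fed after the first step; otherwise the full context is re-fed.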
let (context_size, context_index) = if cache.use_kv_cache && index > 0 {
(1, index_pos)
} else {
(tokens.len(), 0)
};
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
let logits = llama.forward(&input, context_index, &mut cache)?;
let logits = logits.squeeze(0)?;
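// Penalize tokens repeated within the last `repeat_last_n` positions unless the penalty is 1.0 (a no-op).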
let logits = if args.repeat_penalty == 1. {
logits
} else {
let start_at = tokens.len().saturating_sub(args.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
args.repeat_penalty,
&tokens[start_at..],
)?
};
index_pos += ctxt.len();

let next_token = logits_processor.sample(&logits)?;
token_generated += 1;
tokens.push(next_token);

if let Some(t) = tokenizer.next_token(next_token)? {
callback(t)?;
}
}

if let Some(rest) = tokenizer.decode_rest().map_err(E::msg)? {
callback(rest)?;
}

let dt = start_gen.elapsed();
println!(
"\n\n{} tokens generated ({} token/s)\n",
token_generated,
(token_generated - 1) as f64 / dt.as_secs_f64(),
);
Ok(())
}
}
// Re-export the module contents when the `llm` feature is enabled.
#[cfg(feature = "llm")]
pub use llm_module::*;
