
Commit

added aquila2
Signed-off-by: 严照东 <[email protected]>
严照东 committed Oct 7, 2023
1 parent 46018f9 commit 3ca71af
Showing 2 changed files with 2 additions and 14 deletions.
2 changes: 1 addition & 1 deletion flagai/model/aquila2/aquila2_flash_attn_monkey_patch.py
@@ -44,7 +44,7 @@ def forward(
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     if output_attentions:
         warnings.warn(
-            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
+            "Output attentions is not supported for patched `AquilaAttention`, returning `None` instead."
         )

     bsz, q_len, _ = hidden_states.size()
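For context, the patched `forward` above only takes effect once it is installed over the original attention implementation. Below is a minimal sketch of how such a monkey patch is conventionally wired up, assuming the patch file exposes a module-level `forward` and using a hypothetical `apply_flash_attn_patch` helper (neither detail is taken from this commit):

from flagai.model.aquila2 import modeling_aquila
from flagai.model.aquila2 import aquila2_flash_attn_monkey_patch as patch


def apply_flash_attn_patch():
    # Hypothetical helper: reassign the patched forward onto the attention
    # class so every AquilaAttention instance dispatches to flash attention.
    modeling_aquila.AquilaAttention.forward = patch.forward


# Apply the patch before the model is instantiated.
apply_flash_attn_patch()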
14 changes: 1 addition & 13 deletions flagai/model/aquila2/modeling_aquila.py
@@ -82,7 +82,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Aquila
 class AquilaRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
@@ -99,8 +98,6 @@ def forward(self, hidden_states):

         return (self.weight * hidden_states).to(input_dtype)

-
-# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Aquila
 class AquilaRotaryEmbedding(torch.nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
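The hunks above strip the `# Copied from ...` markers from `AquilaRMSNorm` and `AquilaRotaryEmbedding` without touching their logic. For readers skimming the diff, the visible `return (self.weight * hidden_states).to(input_dtype)` line is the tail of the standard RMSNorm computation; here is a self-contained sketch of that normalization, mirroring the upstream LlamaRMSNorm these classes were copied from (illustrative, not the exact FlagAI source):

import torch
from torch import nn


class RMSNormSketch(nn.Module):
    # Scale each vector by its reciprocal root-mean-square, then by a learned weight.
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)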
@@ -136,7 +133,6 @@ def forward(self, x, seq_len=None):
             self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
         )

-# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Aquila
 class AquilaLinearScalingRotaryEmbedding(AquilaRotaryEmbedding):
     """AquilaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

@@ -155,7 +151,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
         self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)

-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Aquila
 class AquilaDynamicNTKScalingRotaryEmbedding(AquilaRotaryEmbedding):
     """AquilaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

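These two hunks cover the scaled rotary-embedding variants, which differ only in how the cos/sin cache is built: linear scaling stretches the position index by `scaling_factor`, while dynamic NTK scaling enlarges the RoPE `base` once the sequence outgrows `max_position_embeddings`. A minimal sketch of both cache computations, following the upstream LLaMA versions these classes were copied from (illustrative, not the exact FlagAI source):

import torch


def rope_cache(seq_len, dim, base=10000.0, max_position_embeddings=2048,
               scaling_factor=1.0, mode="plain"):
    # Build the cos/sin cache for plain, linear-scaled, or dynamic-NTK RoPE.
    if mode == "dynamic_ntk" and seq_len > max_position_embeddings:
        # Grow the base so the rotary wavelengths cover the longer context.
        base = base * (
            (scaling_factor * seq_len / max_position_embeddings) - (scaling_factor - 1)
        ) ** (dim / (dim - 2))

    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    t = torch.arange(seq_len, dtype=inv_freq.dtype)
    if mode == "linear":
        # Compress positions so a longer sequence maps into the trained range.
        t = t / scaling_factor

    freqs = torch.einsum("i,j->ij", t, inv_freq)
    emb = torch.cat((freqs, freqs), dim=-1)
    return emb.cos(), emb.sin()


cos, sin = rope_cache(seq_len=4096, dim=128, scaling_factor=2.0, mode="linear")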
@@ -200,7 +195,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     return q_embed, k_embed


-# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Aquila
 class AquilaMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -247,7 +241,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


-# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Aquila
 class AquilaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
     def __init__(self, config: AquilaConfig):
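The `repeat_kv` line closing the second hunk above is the final reshape of the grouped-query-attention helper: each key/value head is duplicated `n_rep` times so the key and value tensors line up with the larger number of query heads in `AquilaAttention`. A self-contained sketch of that helper, again mirroring the upstream LLaMA version (illustrative only):

import torch


def repeat_kv_sketch(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Expand (batch, num_kv_heads, seq, head_dim) to (batch, num_kv_heads * n_rep, seq, head_dim).
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis, broadcast it, then fold it into the head axis.
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


kv = torch.randn(2, 4, 16, 64)              # 4 key/value heads
print(repeat_kv_sketch(kv, n_rep=8).shape)  # torch.Size([2, 32, 16, 64]) -> 32 query heads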
@@ -397,7 +390,6 @@ def forward(
         return attn_output, attn_weights, past_key_value


-# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Aquila
 class AquilaDecoderLayer(nn.Module):
     def __init__(self, config: AquilaConfig):
         super().__init__()
@@ -482,7 +474,6 @@ def forward(
     "The bare Aquila Model outputting raw hidden-states without any specific head on top.",
     AQUILA_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Aquila
 class AquilaPreTrainedModel(PreTrainedModel):
     config_class = AquilaConfig
     base_model_prefix = "model"
@@ -574,7 +565,6 @@ def _set_gradient_checkpointing(self, module, value=False):
     "The bare Aquila Model outputting raw hidden-states without any specific head on top.",
     AQUILA_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaModel with LLAMA->AQUILA,Llama->Aquila
 class AquilaModel(AquilaPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AquilaDecoderLayer`]
@@ -752,7 +742,6 @@ def custom_forward(*inputs):
             attentions=all_self_attns,
         )

-# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->AQUILA,Llama->Aquila
 class AquilaForCausalLM(AquilaPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]

@@ -1025,7 +1014,7 @@ def predict(self, text, tokenizer=None,

 @add_start_docstrings(
     """
-    The LLaMa Model transformer with a sequence classification head on top (linear layer).
+    The Aquila Model transformer with a sequence classification head on top (linear layer).
     [`AquilaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
@@ -1038,7 +1027,6 @@ def predict(self, text, tokenizer=None,
     """,
     AQUILA_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->AQUILA,Llama->Aquila
 class AquilaForSequenceClassification(AquilaPreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

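The docstring edited above describes last-token pooling: `AquilaForSequenceClassification` scores every position with a linear head and then keeps only the hidden state of each sequence's final real token. A hedged sketch of that pooling step; the pad-token bookkeeping follows the upstream LLaMA classification head this class was copied from and is illustrative, not the exact FlagAI code:

import torch

batch_size, seq_len, num_labels = 2, 8, 3
pad_token_id = 0

input_ids = torch.tensor([
    [5, 6, 7, 8, 0, 0, 0, 0],   # real length 4
    [9, 9, 9, 9, 9, 9, 9, 2],   # real length 8 (no padding)
])
logits = torch.randn(batch_size, seq_len, num_labels)  # per-token scores from the linear head

# Index of the last non-pad token: position of the first pad token, minus one.
sequence_lengths = (torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1) % seq_len
pooled_logits = logits[torch.arange(batch_size), sequence_lengths]  # shape (batch_size, num_labels)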
