Commit bf41402: 2024-11-22 nightly release (fc42a4e)
pytorchbot committed Nov 22, 2024 (1 parent: 18e7b87)
Showing 66 changed files with 1,596 additions and 677 deletions.
42 changes: 36 additions & 6 deletions .ci/scripts/test_llama.sh
@@ -9,11 +9,41 @@ set -exu
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

MODEL_NAME=$1 # stories110M
BUILD_TOOL=$2 # buck2 or cmake
DTYPE=$3 # fp16, bf16, or fp32
MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
UPLOAD_DIR=${5:-}
while [[ $# -gt 0 ]]; do
  case "$1" in
    -model)
      MODEL_NAME="$2" # stories110M
      shift 2
      ;;
    -build_tool)
      BUILD_TOOL="$2" # buck2 or cmake
      shift 2
      ;;
    -dtype)
      DTYPE="$2" # fp16, bf16, or fp32
      shift 2
      ;;
    -mode)
      MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
      shift 2
      ;;
    -upload)
      UPLOAD_DIR="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1"
      usage
      ;;
  esac
done

# Default mode to xnnpack+custom if not set
MODE=${MODE:-"xnnpack+custom"}

# Default UPLOAD_DIR to empty string if not set
UPLOAD_DIR="${UPLOAD_DIR:-}"

if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
echo "Expecting atleast 4 positional arguments"
echo "Usage: [...]"
@@ -150,7 +180,7 @@ cleanup_files() {
}

prepare_artifacts_upload() {
if [ -n "$UPLOAD_DIR" ]; then
if [ -n "${UPLOAD_DIR}" ]; then
echo "Preparing for uploading generated artifacs"
zip -j model.zip "${EXPORTED_MODEL_NAME}" tokenizer.bin
mkdir -p "${UPLOAD_DIR}"
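For reference, the updated script is driven by named flags rather than positional arguments. A minimal sketch of a local run, assuming the llama example requirements are already installed and using a hypothetical upload directory (the CI invocations below use the same flags):

```bash
# Sketch of a local invocation of the updated script; flag names mirror the parser above.
# -upload is optional: when omitted, UPLOAD_DIR stays empty and no artifact zip is prepared.
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
  -model stories110M \
  -build_tool cmake \
  -dtype fp32 \
  -mode xnnpack+custom \
  -upload /tmp/llama_artifacts  # hypothetical output directory
```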
4 changes: 2 additions & 2 deletions .github/workflows/pull.yml
@@ -117,7 +117,7 @@ jobs:
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" "${ARTIFACTS_DIR_NAME}"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}"
test-llama-runner-linux-android:
name: test-llama-runner-linux-android
@@ -393,7 +393,7 @@ jobs:
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}"
test-phi-3-mini-runner-linux:
name: test-phi-3-mini-runner-linux
2 changes: 1 addition & 1 deletion .github/workflows/trunk.yml
@@ -261,7 +261,7 @@ jobs:
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}"
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}"
# # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
# test-llava-runner-macos:
13 changes: 9 additions & 4 deletions CONTRIBUTING.md
@@ -283,10 +283,15 @@ for basics.
- If the reviewers have requests or questions, follow up with them.
- The goal of the reviewer is to ensure that the code in the `main` branch of
the repo is consistent, maintainable, and of high quality.
1. Once approved, your reviewer will import the PR into Meta's internal system
and merge it from there.
- If the PR is approved and not merged within a few business days, please
comment on the PR to ask about its status.
1. Once the PR has been approved,
- If you have write permission in this repo, you can merge it yourself
by clicking the "Squash and merge" button once it is green and all CI
signals are passing.
- If you don't have write permission in this repo, the reviewer will take
care of the PR. The reviewer may import the PR into Meta's internal system
to validate it against internal CI.
- If the PR is approved but not merged within 5 business days, please comment
on the PR to ask about its status.
- Note that if the `main` [CI](#continuous-integration) jobs are broken, we
will only merge PRs that fix the broken jobs until all critical jobs are
fixed.
3 changes: 3 additions & 0 deletions backends/arm/test/common.py
@@ -29,6 +29,7 @@ class arm_test_options(Enum):
corstone300 = auto()
dump_path = auto()
date_format = auto()
fast_fvp = auto()


_test_options: dict[arm_test_options, Any] = {}
@@ -41,6 +42,7 @@ def pytest_addoption(parser):
parser.addoption("--arm_run_corstone300", action="store_true")
parser.addoption("--default_dump_path", default=None)
parser.addoption("--date_format", default="%d-%b-%H:%M:%S")
parser.addoption("--fast_fvp", action="store_true")


def pytest_configure(config):
@@ -63,6 +65,7 @@ def pytest_configure(config):
f"Supplied argument 'default_dump_path={dump_path}' that does not exist or is not a directory."
)
_test_options[arm_test_options.date_format] = config.option.date_format
_test_options[arm_test_options.fast_fvp] = config.option.fast_fvp
logging.basicConfig(level=logging.INFO, stream=sys.stdout)


12 changes: 12 additions & 0 deletions backends/arm/test/runner_utils.py
@@ -17,6 +17,8 @@
import numpy as np
import torch

from executorch.backends.arm.test.common import arm_test_options, is_option_enabled

from torch.export import ExportedProgram
from torch.fx.node import Node

@@ -249,6 +251,10 @@ def run_corstone(
for input_path in input_paths:
cmd_line += f" -i {input_path}"

ethos_u_extra_args = ""
if is_option_enabled(arm_test_options.fast_fvp):
ethos_u_extra_args = ethos_u_extra_args + "--fast"

command_args = {
"corstone-300": [
"FVP_Corstone_SSE-300_Ethos-U55",
@@ -267,6 +273,8 @@
"-C",
"cpu0.semihosting-stack_base=0",
"-C",
f"ethosu.extra_args='{ethos_u_extra_args}'",
"-C",
"cpu0.semihosting-heap_limit=0",
"-C",
f"cpu0.semihosting-cmd_line='{cmd_line}'",
@@ -282,6 +290,8 @@
"-C",
"mps4_board.visualisation.disable-visualisation=1",
"-C",
"vis_hdlcd.disable_visualisation=1",
"-C",
"mps4_board.telnetterminal0.start_telnet=0",
"-C",
"mps4_board.uart0.out_file='-'",
@@ -296,6 +306,8 @@
"-C",
"mps4_board.subsystem.cpu0.semihosting-heap_limit=0",
"-C",
f"mps4_board.subsystem.ethosu.extra_args='{ethos_u_extra_args}'",
"-C",
f"mps4_board.subsystem.cpu0.semihosting-cmd_line='{cmd_line}'",
"-a",
elf_path,
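Together with the new --fast_fvp option registered in common.py above, the fast simulation mode can be toggled from the pytest command line. A hedged sketch, assuming the Arm Corstone FVP binaries and toolchain are installed and on PATH, and that the suite lives under backends/arm/test:

```bash
# Sketch: run the Arm Corstone FVP tests with fast simulation enabled.
# When --fast_fvp is set, run_corstone passes ethosu.extra_args='--fast' to the
# FVP, which presumably trades timing fidelity for shorter test runs.
pytest backends/arm/test -v \
  --arm_run_corstone300 \
  --fast_fvp
```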
10 changes: 10 additions & 0 deletions backends/cadence/aot/functions.yaml
@@ -77,6 +77,16 @@
- arg_meta: null
kernel_name: torch::executor::gelu_out

- op: hardtanh.out
kernels:
- arg_meta: null
kernel_name: torch::executor::hardtanh_out

- op: max_pool2d_with_indices.out
kernels:
- arg_meta: null
kernel_name: torch::executor::max_pool2d_with_indices_out

- op: mean.out
kernels:
- arg_meta: null
19 changes: 17 additions & 2 deletions backends/cadence/aot/functions_hifi.yaml
@@ -62,11 +62,26 @@
- arg_meta: null
kernel_name: torch::executor::full_out

- op: gelu.out
kernels:
- arg_meta: null
kernel_name: torch::executor::gelu_out

- op: hardtanh.out
kernels:
- arg_meta: null
kernel_name: torch::executor::hardtanh_out

- op: max_pool2d_with_indices.out
kernels:
- arg_meta: null
kernel_name: torch::executor::max_pool2d_with_indices_out

- op: mean.out
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::mean_dim_out
kernel_name: cadence::impl::HiFi::mean_dim_out

- op: mul.out
kernels:
- arg_meta: null
3 changes: 3 additions & 0 deletions backends/cadence/aot/fuse_ops.py
@@ -426,6 +426,9 @@ def fuse_quantized_batch_norm_with_conv(
# Note: there is a quantized.conv2d.new operator in the resulting graph
# that takes a torch.classes.quantized.Conv2dPackedParamsBase as one of the inputs;
# this prevents us from directly calling graph_module.recompile().
# pyre-fixme[16]: `GraphModule` has no attribute `_code`.
# pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute
# `python_code`.
graph_module._code = graph_module._graph.python_code(root_module="self").src

def __init__(self):
8 changes: 8 additions & 0 deletions backends/cadence/aot/quantizer/patterns.py
@@ -75,6 +75,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
addmm_node = fused_partition[0].nodes[-1]

bias_qspec = DerivedQuantizationSpec(
@@ -107,6 +108,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
bmm_node = fused_partition[0].nodes[-1]

return PartitionAnchors(
@@ -127,6 +129,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
conv1d_node = fused_partition[0].nodes[-1]

bias_qspec = DerivedQuantizationSpec(
@@ -165,6 +168,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
conv2d_node = fused_partition[0].nodes[-1]

bias_qspec = DerivedQuantizationSpec(
@@ -203,6 +207,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
layer_norm_node = fused_partition[0].nodes[-1]

others = [(layer_norm_node, 1)]
@@ -237,6 +242,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
linear_node = fused_partition[0].nodes[-1]

bias_qspec = DerivedQuantizationSpec(
@@ -275,6 +281,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
matmul_node = fused_partition[0].nodes[-1]

return PartitionAnchors(
@@ -297,6 +304,7 @@ def partition_types(self) -> List[OpOverload]:
def get_anchors(
self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
) -> PartitionAnchors:
# pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
relu_node = fused_partition[0].nodes[-1]

return PartitionAnchors(
3 changes: 3 additions & 0 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -33,6 +33,9 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
2 changes: 2 additions & 0 deletions backends/cadence/reference/operators/CMakeLists.txt
@@ -39,6 +39,8 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
@@ -73,7 +73,11 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
)
out_to_dim_node_user.args = tuple(out_to_dim_node_user_new_args)

# pyre-fixme[29]: `Union[torch._tensor.Tensor,
# torch.nn.modules.module.Module]` is not a function.
graph_module.erase_node(out_to_dim_node)
# pyre-fixme[29]: `Union[torch._tensor.Tensor,
# torch.nn.modules.module.Module]` is not a function.
graph_module.erase_node(node)
# TODO: Handle other merging rules, including 1->N, N->1, N->N
return PassResult(graph_module, True)
2 changes: 2 additions & 0 deletions backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
@@ -90,6 +90,8 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
}
}
add_cached_delegate(signature, qnn_manager);
// This backend does not need its processed data after Init.
processed->Free();
return qnn_manager;
}

1 change: 1 addition & 0 deletions backends/qualcomm/serialization/qc_compiler_spec.fbs
@@ -36,6 +36,7 @@ enum QcomChipset: int {
SM8550 = 43,
SSG2115P = 46,
SM8650 = 57,
SA8295 = 39,
}

/// Indicate the information of the specified SoC.
2 changes: 2 additions & 0 deletions backends/qualcomm/serialization/qc_schema.py
@@ -41,6 +41,7 @@ class QcomChipset(IntEnum):
SM8550 = 43 # v73
SSG2115P = 46 # v73
SM8650 = 57 # v75
SA8295 = 39 # v68


@dataclass
@@ -55,6 +56,7 @@ class SocInfo:
QcomChipset.SM8550: SocInfo(QcomChipset.SM8550, HtpInfo(HtpArch.V73, 8)),
QcomChipset.SM8650: SocInfo(QcomChipset.SM8650, HtpInfo(HtpArch.V75, 8)),
QcomChipset.SSG2115P: SocInfo(QcomChipset.SSG2115P, HtpInfo(HtpArch.V73, 2)),
QcomChipset.SA8295: SocInfo(QcomChipset.SA8295, HtpInfo(HtpArch.V68, 8)),
}


2 changes: 2 additions & 0 deletions backends/qualcomm/utils/utils.py
@@ -967,6 +967,7 @@ def get_soc_to_arch_map():
"SM8550": HtpArch.V73,
"SM8475": HtpArch.V69,
"SM8450": HtpArch.V69,
"SA8295": HtpArch.V68,
}


@@ -977,6 +978,7 @@ def get_soc_to_chipset_map():
"SM8550": QcomChipset.SM8550,
"SM8475": QcomChipset.SM8475,
"SM8450": QcomChipset.SM8450,
"SA8295": QcomChipset.SA8295,
}


1 change: 1 addition & 0 deletions backends/vulkan/_passes/int4_weight_only_quantizer.py
@@ -136,6 +136,7 @@ def _vk_replace_linear_int4(
scales_precision=scales_precision,
)
if copy_weights and child.weight.device != torch.device("meta"):
# pyre-fixme[16]: `Module` has no attribute `weight`.
new_linear.weight = child.weight
setattr(module, name, new_linear)
else:
8 changes: 4 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
@@ -117,10 +117,10 @@ VEC4_T q_8w_linear(const u16vec3 out_pos, const uint16_t K) {

void main() {
const u16vec3 out_pos = u16vec3(
gl_GlobalInvocationID.x / (out_limits.y * out_limits.z),
(gl_GlobalInvocationID.x / out_limits.z) % out_limits.y,
gl_GlobalInvocationID.x % out_limits.z);
if (any(greaterThanEqual(out_pos, out_limits))) {
gl_GlobalInvocationID.x / out_limits.y,
gl_GlobalInvocationID.x % out_limits.y,
0);
if (out_pos.x >= out_limits.x) {
return;
}
