# aten/src/ATen/native/native_functions.yaml

# See README.md in this directory for more guidance


# Temporary type cast operators. These are needed to trace type-casts now since
# Types are not supported in the IR. Instead, we call down to these
# specialized operators for each datatype.
# TODO: remove when we have Type support in the IR
#
# One entry per target dtype (Byte, Char, Double, Float, Int, Long, Short,
# Half). All eight share the signature `(Tensor self, bool non_blocking=false)`
# and declare the `function` variant only.
- func: _cast_Byte(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

- func: _cast_Char(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

- func: _cast_Double(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

- func: _cast_Float(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

- func: _cast_Int(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

- func: _cast_Long(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

- func: _cast_Short(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

- func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor
  variants: function

# cuDNN-only helpers: every entry in this group names a single CUDA dispatch
# target (identical to the function name) and lists no CPU implementation.
- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor)
  dispatch:
    CUDA: _cudnn_ctc_loss

- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
  dispatch:
    CUDA: _cudnn_rnn_flatten_weight

# `weight_buf`, `cx` and `dropout_state` are optional here (`Tensor?` /
# `BoolTensor?`); five tensors are returned.
- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _cudnn_rnn

# Unlike the forward entry, `weight_buf` is a required `Tensor` here.
# `output_mask` has four entries; the last return slot is a TensorList.
- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state, Tensor reserve, std::array<bool,4> output_mask) -> (Tensor, Tensor, Tensor, TensorList)
  dispatch:
    CUDA: _cudnn_rnn_backward

- func: _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, TensorOptions options) -> Tensor
  dispatch:
    CUDA: _cudnn_init_dropout_state

# Fused dropout helpers. Both entries declare the `function` variant only and
# name a CUDA dispatch target; no CPU kernel is listed.
# `_fused_dropout` returns a pair of tensors; `_masked_scale` takes a `mask`
# tensor and a `scale` factor and returns a single tensor.
- func: _fused_dropout(Tensor self, double p, Generator* generator=nullptr) -> (Tensor, Tensor)
  variants: function
  dispatch:
    CUDA: fused_dropout_cuda

- func: _masked_scale(Tensor self, Tensor mask, double scale) -> Tensor
  variants: function
  dispatch:
    CUDA: masked_scale_cuda

# Shape-as-tensor helpers; neither entry declares `variants` or `dispatch`.
# NOTE(review): presumably `_reshape_from_tensor` consumes the kind of tensor
# that `_shape_as_tensor` produces -- confirm against the implementation.
- func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor

- func: _shape_as_tensor(Tensor self) -> Tensor

# Dropout family. Each flavour (plain, feature, alpha, feature_alpha) is
# declared twice: once taking `input`, and once with a trailing underscore
# taking `self`. All share the `(double p, bool train)` tail and none lists an
# explicit `variants` or `dispatch` key.
- func: dropout(Tensor input, double p, bool train) -> Tensor

- func: dropout_(Tensor self, double p, bool train) -> Tensor

- func: feature_dropout(Tensor input, double p, bool train) -> Tensor

- func: feature_dropout_(Tensor self, double p, bool train) -> Tensor

- func: alpha_dropout(Tensor input, double p, bool train) -> Tensor

- func: alpha_dropout_(Tensor self, double p, bool train) -> Tensor

- func: feature_alpha_dropout(Tensor input, double p, bool train) -> Tensor

- func: feature_alpha_dropout_(Tensor self, double p, bool train) -> Tensor

# Unary ops in this file follow a three-entry pattern: `op`, `op_` and
# `op_out`. Only `op_` and `op_out` name explicit CPU/CUDA dispatch targets;
# `op_out` takes the destination as its leading `result` argument and declares
# no `variants` key.
- func: abs(Tensor self) -> Tensor
  variants: function, method

- func: abs_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _abs__cpu
    CUDA: _abs__cuda

- func: abs_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _abs_out_cpu
    CUDA: _abs_out_cuda

# acos: same three-entry pattern as abs.
- func: acos(Tensor self) -> Tensor
  variants: function, method

- func: acos_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _acos__cpu
    CUDA: _acos__cuda

- func: acos_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _acos_out_cpu
    CUDA: _acos_out_cuda

# 1-D pooling. Size-like arguments are typed `IntList[1]`. In `avg_pool1d`,
# `stride` defaults to `{}` and `padding` to 0.
# `adaptive_max_pool1d` returns a pair of tensors.
- func: avg_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, bool ceil_mode=false, bool count_include_pad=true) -> Tensor

- func: adaptive_avg_pool1d(Tensor self, IntList[1] output_size) -> Tensor

- func: adaptive_max_pool1d(Tensor self, IntList[1] output_size) -> (Tensor, Tensor)

# Tensor + Tensor overloads. `alpha` follows the `*` separator in all three
# entries; the `add_` form is method-only and `add_out` declares no variants.
- func: add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  variants: function, method

- func: add_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  variants: method

- func: add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor

# For C++ only, until we have conversion from C++ numbers to Tensor
# NOTE(review): unlike the Tensor overloads above, `alpha` is NOT preceded by
# `*` in these two Scalar overloads -- confirm the difference is intended.
- func: add(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  variants: function, method

- func: add_(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  variants: method

# addmv / addr: both take `beta` and `alpha` scalars (default 1) after the `*`
# separator. addmv combines `self` with a `mat` and a `vec`; addr combines
# `self` with `vec1` and `vec2`.
# NOTE(review): `addmv_` declares `function, method` whereas `addr_` declares
# `method` only -- confirm whether this asymmetry is intentional.
- func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function, method

- func: addmv_(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function, method

- func: addmv_out(Tensor result, Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor

- func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function, method

- func: addr_(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: method

- func: addr_out(Tensor result, Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor

# `all` along a single `dim`; `keepdim` defaults to false. Only the
# dim-taking form is declared here, plus its `_out` counterpart.
- func: all(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function, method

- func: all_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor

# Returns a plain `bool` rather than a Tensor. Tolerances default to
# rtol=1e-5 and atol=1e-8; `equal_nan` defaults to false.
# The boolean default is spelled in lowercase to match the other boolean
# defaults in this file (e.g. `keepdim=false`).
- func: allclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=false) -> bool
  variants: function, method

# `any` along a single `dim`; `keepdim` defaults to false. Only the
# dim-taking form is declared here, plus its `_out` counterpart.
- func: any(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function, method

- func: any_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor

# arange overloads: (start, end), (start, end, step) and (end). The factory
# forms take a trailing `TensorOptions options={}`; the `_out` forms take a
# leading `result` tensor instead and no options.
- func: arange(Scalar start, Scalar end, TensorOptions options={}) -> Tensor

- func: arange(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor

- func: arange_out(Tensor result, Scalar start, Scalar end) -> Tensor

- func: arange_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor

- func: arange(Scalar end, TensorOptions options={}) -> Tensor

- func: arange_out(Tensor result, Scalar end) -> Tensor

# This function is a temporary hack to allow tracing of arange-like constructs with dynamic
# bounds on arange.  Normal arange is not traceable because it does not take any tensor inputs;
# if the range you need is based on another tensor, calling this function directly will
# preserve tracing.  Get rid of this when arange can directly take tensors for bounds
# (so that it can be traced directly).
- func: _dim_arange(Tensor like, int64_t dim) -> Tensor

# `argmin` and `argmax` are exposed in C++ but not in Python, where we only
# expose `_argmin` and `_argmax` (which call the first versions). In Python, we
# then define our own `argmax` and `argmin` that handle passing `dim=None`,
# which gets the argmax/argmin of the flattened array.
#
# Each of argmax/argmin has a `(self, dim, keepdim=false)` overload and a
# `(self)`-only overload. The underscore-prefixed forms exist only in the
# dim-taking shape and declare the `function` variant only.

- func: argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function, method

- func: argmax(Tensor self) -> Tensor
  variants: function, method

- func: _argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function

- func: argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function, method

- func: argmin(Tensor self) -> Tensor
  variants: function, method

- func: _argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function

# as_strided / as_strided_: each is declared with and without an explicit
# `storage_offset`. All four entries set `device_guard: false`.
- func: as_strided(Tensor self, IntList size, IntList stride) -> Tensor
  variants: function, method
  device_guard: false

- func: as_strided_(Tensor self, IntList size, IntList stride) -> Tensor
  variants: function, method
  device_guard: false

# `python_default_init` supplies `self.storage_offset()` as the initialiser
# expression for `storage_offset`.
- func: as_strided(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor
  variants: function, method
  device_guard: false
  python_default_init:
    storage_offset: self.storage_offset()

- func: as_strided_(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor
  variants: function, method
  device_guard: false
  python_default_init:
    storage_offset: self.storage_offset()

# asin: `op` / `op_` / `op_out` triple. Only the `_` and `_out` entries name
# CPU/CUDA dispatch targets.
- func: asin(Tensor self) -> Tensor
  variants: function, method

- func: asin_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _asin__cpu
    CUDA: _asin__cuda

- func: asin_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _asin_out_cpu
    CUDA: _asin_out_cuda

# atan: same triple as asin.
- func: atan(Tensor self) -> Tensor
  variants: function, method

- func: atan_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _atan__cpu
    CUDA: _atan__cuda

- func: atan_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _atan_out_cpu
    CUDA: _atan_out_cuda

# baddbmm family. `baddbmm`, `baddbmm_` and `baddbmm_out` each name separate
# CPU and CUDA dispatch targets; `_baddbmm_mkl_` is `function`-only and lists
# no dispatch table. `beta` and `alpha` default to 1 and follow `*`.
- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function, method
  dispatch:
    CPU: baddbmm_cpu
    CUDA: baddbmm_cuda

- func: baddbmm_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: method
  dispatch:
    CPU: baddbmm__cpu
    CUDA: baddbmm__cuda

- func: _baddbmm_mkl_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function

- func: baddbmm_out(Tensor result, Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function
  dispatch:
    CPU: baddbmm_out_cpu
    CUDA: baddbmm_out_cuda

# bartlett_window: two overloads, with and without an explicit `periodic`
# flag; both accept trailing `TensorOptions`.
- func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor

- func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor

# batch_norm: `weight`, `bias`, `running_mean` and `running_var` are all
# optional tensors; no argument has a default value.
- func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double momentum, double eps, bool cudnn_enabled) -> Tensor

# Sample bernoulli with values in `self` as probability.
- func: bernoulli(Tensor self, *, Generator* generator=nullptr) -> Tensor
  variants: function, method

- func: bernoulli_out(Tensor result, Tensor self, *, Generator* generator=nullptr) -> Tensor
  variants: function

# The two `bernoulli_` overloads differ in the type of `p`: a Tensor here, a
# double (default 0.5) below. Each has its own CPU/CUDA dispatch targets.
- func: bernoulli_(Tensor self, Tensor p, *, Generator* generator=nullptr) -> Tensor
  variants: method
  dispatch:
    CPU: bernoulli_tensor_cpu_
    CUDA: bernoulli_tensor_cuda_

- func: bernoulli_(Tensor self, double p=0.5, *, Generator* generator=nullptr) -> Tensor
  variants: method
  dispatch:
    CPU: bernoulli_scalar_cpu_
    CUDA: bernoulli_scalar_cuda_

# This out-of-place version isn't used explicitly, but is needed by the JIT.
# There is no default value for `p` here because it would introduce ambiguity
# with the `bernoulli(Tensor self, *, Generator* generator=nullptr)` declaration.
- func: bernoulli(Tensor self, double p, *, Generator* generator=nullptr) -> Tensor
  variants: function, method

# bilinear: `bias` is optional; no variants or dispatch are declared.
- func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor

# binary_cross_entropy_with_logits and its backward: `weight` and
# `pos_weight` are optional tensors, `reduction` is an int64_t with no
# default. The backward takes `grad_output` as an extra leading argument.
- func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight, Tensor? pos_weight, int64_t reduction) -> Tensor
  variants: function

- func: binary_cross_entropy_with_logits_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, Tensor? pos_weight, int64_t reduction) -> Tensor
  variants: function

# bincount: `weights` is optional (default `{}`), `minlength` defaults to 0.
- func: bincount(Tensor self, Tensor? weights={}, int64_t minlength=0) -> Tensor
  variants: function, method
  dispatch:
    CPU: _bincount_cpu
    CUDA: _bincount_cuda

# blackman_window: two overloads, with and without an explicit `periodic`.
- func: blackman_window(int64_t window_length, TensorOptions options={}) -> Tensor

- func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor

# bmm / bmm_out: both name separate CPU and CUDA dispatch targets.
- func: bmm(Tensor self, Tensor mat2) -> Tensor
  variants: function, method
  dispatch:
    CPU: bmm_cpu
    CUDA: bmm_cuda

- func: bmm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor
  variants: function
  dispatch:
    CPU: bmm_out_cpu
    CUDA: bmm_out_cuda

# broadcast_tensors maps a TensorList to a TensorList and disables the
# device guard.
- func: broadcast_tensors(TensorList tensors) -> TensorList
  device_guard: false

# cat / cat_out: `dim` defaults to 0.
- func: cat(TensorList tensors, int64_t dim=0) -> Tensor

- func: cat_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor

# ceil: `op` / `op_` / `op_out` triple; only `_` and `_out` name CPU/CUDA
# dispatch targets.
- func: ceil(Tensor self) -> Tensor
  variants: function, method

- func: ceil_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _ceil__cpu
    CUDA: _ceil__cuda

- func: ceil_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _ceil_out_cpu
    CUDA: _ceil_out_cuda

- func: chain_matmul(TensorList matrices) -> Tensor
  variants: function

# chunk returns a TensorList; `dim` defaults to 0 and the device guard is
# disabled.
- func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList
  variants: function, method
  device_guard: false

# clamp: both bounds are optional Scalars defaulting to None.
# As with other elementwise ops, only the `_` and `_out` entries name
# CPU/CUDA dispatch targets.
- func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
  variants: function, method

- func: clamp_(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
  variants: function, method
  dispatch:
    CPU: _clamp__cpu
    CUDA: _clamp__cuda

- func: clamp_out(Tensor result, Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
  dispatch:
    CPU: _clamp_out_cpu
    CUDA: _clamp_out_cuda

# clamp_max: single required `max` Scalar.
- func: clamp_max(Tensor self, Scalar max) -> Tensor
  variants: function, method

- func: clamp_max_(Tensor self, Scalar max) -> Tensor
  variants: function, method
  dispatch:
    CPU: _clamp_max__cpu
    CUDA: _clamp_max__cuda

- func: clamp_max_out(Tensor result, Tensor self, Scalar max) -> Tensor
  dispatch:
    CPU: _clamp_max_out_cpu
    CUDA: _clamp_max_out_cuda

# clamp_min: single required `min` Scalar.
- func: clamp_min(Tensor self, Scalar min) -> Tensor
  variants: function, method

- func: clamp_min_(Tensor self, Scalar min) -> Tensor
  variants: function, method
  dispatch:
    CPU: _clamp_min__cpu
    CUDA: _clamp_min__cuda

- func: clamp_min_out(Tensor result, Tensor self, Scalar min) -> Tensor
  dispatch:
    CPU: _clamp_min_out_cpu
    CUDA: _clamp_min_out_cuda

# Returns a plain `bool`; the device guard is disabled.
- func: cudnn_is_acceptable(Tensor self) -> bool
  device_guard: false

# `value` (the Scalar argument after `pad`) defaults to 0.
- func: constant_pad_nd(Tensor self, IntList pad, Scalar value=0) -> Tensor
  variants: function

# Method-only.
- func: contiguous(Tensor self) -> Tensor
  variants: method

# Generic convolution entry points. `_convolution` extends `convolution` with
# `benchmark`, `deterministic` and `cudnn_enabled` flags; `_convolution_nogroup`
# drops the `groups` argument. None of the arguments have defaults.
- func: convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) -> Tensor

- func: _convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor

- func: _convolution_nogroup(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) -> Tensor

# `ggI`, `ggW` and `ggb` are optional; `output_mask` has three entries, matching
# the three returned tensors.
- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)

# conv1d/2d/3d: `bias` is typed `Tensor` with a `{}` default; the list-typed
# arguments are sized `IntList[N]` for the N-d variant.
- func: conv1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] dilation=1, int64_t groups=1) -> Tensor

- func: conv2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1, int64_t groups=1) -> Tensor

- func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor

# conv_tbc: `bias` is required here; `pad` defaults to 0 in the forward only.
- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad=0) -> Tensor

- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor)

# NB: we inherit the goofy argument order from PyTorch torch.nn.functional
# (here `groups` comes before `dilation`, whereas conv1d/2d/3d above put
# `dilation` before `groups`).
- func: conv_transpose1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] output_padding=0, int64_t groups=1, IntList[1] dilation=1) -> Tensor

- func: conv_transpose2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] output_padding=0, int64_t groups=1, IntList[2] dilation=1) -> Tensor

- func: conv_transpose3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] output_padding=0, int64_t groups=1, IntList[3] dilation=1) -> Tensor

# cos: `op` / `op_` / `op_out` triple. Only the `_` and `_out` entries name
# CPU/CUDA dispatch targets.
- func: cos(Tensor self) -> Tensor
  variants: function, method

- func: cos_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _cos__cpu
    CUDA: _cos__cuda

- func: cos_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _cos_out_cpu
    CUDA: _cos_out_cuda

# cosh: same triple as cos.
- func: cosh(Tensor self) -> Tensor
  variants: function, method

- func: cosh_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _cosh__cpu
    CUDA: _cosh__cuda

- func: cosh_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _cosh_out_cpu
    CUDA: _cosh_out_cuda

# `margin` defaults to 0.0 and `reduction` to `Reduction::Mean`.
- func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::Mean) -> Tensor

# CUDA-only. The dispatch target carries a `_forward` suffix that the
# declared function name does not have. The `return:` block names the
# returned tensor `grid`.
# NOTE(review): this entry spells its return type twice (inline `-> Tensor`
# and the `return:` block), while the `_backward` entry below uses only the
# `return:` block -- confirm the parser accepts both forms together.
- func: cudnn_affine_grid_generator(Tensor theta, int64_t N, int64_t C, int64_t H, int64_t W) -> Tensor
  return:
    - type: Tensor
      name: grid
  dispatch:
    CUDA: cudnn_affine_grid_generator_forward

# TODO: Why do I have to call this grad?!
# CUDA-only; returns a single tensor named `grad_theta`.
- func: cudnn_affine_grid_generator_backward(Tensor grad, int64_t N, int64_t C, int64_t H, int64_t W)
  return:
    - type: Tensor
      name: grad_theta
  dispatch:
    CUDA: cudnn_affine_grid_generator_backward

# cuDNN batch norm and convolution entries. Every entry in this group lists
# only a CUDA dispatch target.
# In `cudnn_batch_norm`, `weight` is a required Tensor while `bias` and the
# running statistics are optional.
- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double exponential_average_factor, double epsilon) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: cudnn_batch_norm

# NB: You can only use this if you used cudnn_batch_norm training=True
- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, double epsilon) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: cudnn_batch_norm_backward

# Note the argument order: `padding` precedes `stride` in all cudnn_convolution*
# signatures, unlike `convolution` where `stride` comes first.
- func: cudnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution

- func: cudnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_backward_input

- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: cudnn_convolution_backward

- func: cudnn_convolution_backward_bias(Tensor grad_output) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_backward_bias

- func: cudnn_convolution_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_backward_weight

- func: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_transpose

# NB: output_padding not strictly needed here, but it's helpful for the double
# backwards
- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: cudnn_convolution_transpose_backward

# This entry dispatches to the same kernel name as the non-transposed
# `cudnn_convolution_backward_bias` above.
# NOTE(review): presumably intentional (same signature) -- confirm.
- func: cudnn_convolution_transpose_backward_bias(Tensor grad_output) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_backward_bias

- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_transpose_backward_input

- func: cudnn_convolution_transpose_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_transpose_backward_weight

# NB: input is special cased in a way I don't quite understand
# CUDA-only. The return value is described by the `return:` block (a single
# tensor named `output`) rather than an inline `->` annotation, and the
# dispatch target carries a `_forward` suffix.
- func: cudnn_grid_sampler(Tensor self, Tensor grid)
  return:
    - type: Tensor
      name: output
  dispatch:
    CUDA: cudnn_grid_sampler_forward

# CUDA-only; returns two tensors named `grad_self` and `grad_grid`.
- func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output)
  return:
    - type: Tensor
      name: grad_self
    - type: Tensor
      name: grad_grid
  dispatch:
    CUDA: cudnn_grid_sampler_backward

# FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
# Until then each of cumsum / cumsum_out is declared twice: once with a
# keyword-only `dtype` (after `*`) and once without.
- func: cumsum(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: cumsum(Tensor self, int64_t dim) -> Tensor
  variants: function, method

- func: cumsum_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor

- func: cumsum_out(Tensor result, Tensor self, int64_t dim) -> Tensor

# FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
# cumprod mirrors the cumsum overload set exactly.
- func: cumprod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: cumprod(Tensor self, int64_t dim) -> Tensor
  variants: function, method

- func: cumprod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor

- func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor

# ctc_loss with the lengths given as IntLists. `blank` defaults to 0 and
# `reduction` to `Reduction::Mean`.
- func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::Mean) -> Tensor

# convenience function that converts to intlists for you
# (here `input_lengths` and `target_lengths` are Tensors)
- func: ctc_loss(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int64_t blank=0, int64_t reduction=Reduction::Mean) -> Tensor

# Backend entry: no `reduction` argument, returns a pair of tensors, and names
# separate CPU and CUDA (`_gpu`-suffixed) dispatch targets.
- func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor)
  dispatch:
    CPU:  ctc_loss_cpu
    CUDA: ctc_loss_gpu

# Takes `neg_log_likelihood` and `log_alpha` tensors in addition to the
# forward inputs; `blank` has no default here.
- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor
  dispatch:
    CPU: ctc_loss_backward_cpu
    CUDA: ctc_loss_backward_gpu

- func: det(Tensor self) -> Tensor
  variants: function, method

# `offset` defaults to 0.
- func: diagflat(Tensor self, int64_t offset=0) -> Tensor
  variants: function, method

# `offset` defaults to 0, and the two dims default to 0 and 1.
- func: diagonal(Tensor self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) -> Tensor
  variants: function, method

# Tensor / Tensor overloads; `div_` is method-only and `div_out` declares no
# variants.
- func: div(Tensor self, Tensor other) -> Tensor
  variants: function, method

- func: div_(Tensor self, Tensor other) -> Tensor
  variants: method

- func: div_out(Tensor result, Tensor self, Tensor other) -> Tensor

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: div(Tensor self, Scalar other) -> Tensor
  variants: function, method

- func: div_(Tensor self, Scalar other) -> Tensor
  variants: method

# dot: the second operand is named `tensor`.
- func: dot(Tensor self, Tensor tensor) -> Tensor
  variants: function, method

- func: dot_out(Tensor result, Tensor self, Tensor tensor) -> Tensor

# The equation is passed as a `std::string`, followed by the operand list.
- func: einsum(std::string equation, TensorList tensors) -> Tensor

# embedding family: `indices` is typed `IndexTensor` throughout.
# Only `embedding_dense_backward` and `embedding_renorm_` name CPU/CUDA
# dispatch targets; `embedding`, `embedding_backward` and
# `embedding_sparse_backward` list none.
- func: embedding(Tensor weight, IndexTensor indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) -> Tensor

- func: embedding_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor

- func: embedding_dense_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor
  dispatch:
    CPU: embedding_dense_backward_cpu
    CUDA: embedding_dense_backward_cuda

- func: embedding_renorm_(Tensor self, IndexTensor indices, double max_norm, double norm_type) -> Tensor
  dispatch:
    CPU: embedding_renorm_cpu_
    CUDA: embedding_renorm_cuda_

- func: embedding_sparse_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor

# NOTE [ embedding_bag Native Functions ]
# The `_embedding_bag.*` variants assume that input tensors except for `weight`,
# e.g. `indices` and `offsets` (and `offset2bag`), are contiguous.
# We really only need to enforce this for `_embedding_bag` (the forward) because
# the backward inputs are the same as the forward ones.
# The `embedding_bag` wrapper declared below is created to achieve this, e.g.,
# by applying indices = indices.contiguous().
# The backward functions apply a check that these input tensors are contiguous.

# Wrapper: same signature as `_embedding_bag` but with no dispatch table.
- func: embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor)

- func: _embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CPU: _embedding_bag_cpu
    CUDA: _embedding_bag_cuda

# The generic backward carries a `sparse` flag; the sparse/dense backwards
# below do not. The sparse backward also omits `maximum_indices`.
- func: _embedding_bag_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) -> Tensor

- func: _embedding_bag_sparse_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor

- func: _embedding_bag_dense_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor
  dispatch:
    CPU: _embedding_bag_dense_backward_cpu
    CUDA: _embedding_bag_dense_backward_cuda

# empty: the only entry in this part of the file that lists sparse backends;
# both SparseCPU and SparseCUDA map to the same `empty_sparse` target.
# YAML booleans are written in lowercase to match the `device_guard: false`
# spelling used by the other entries in this file.
- func: empty(IntList size, TensorOptions options={}) -> Tensor
  cpu_half: true
  dispatch:
    CPU: empty_cpu
    CUDA: empty_cuda
    SparseCPU: empty_sparse
    SparseCUDA: empty_sparse

# resize_: method-only, with the device guard disabled.
- func: resize_(Tensor self, IntList size) -> Tensor
  variants: method
  cpu_half: true
  device_guard: false
  dispatch:
    CPU: resize_cpu_
    CUDA: resize_cuda_

# None of these entries lists a dispatch table.
- func: empty_out(Tensor result, IntList size) -> Tensor

# empty_like: one overload without options and one with a required,
# keyword-only `options` (after `*`, no default).
- func: empty_like(Tensor self) -> Tensor

- func: empty_like(Tensor self, *, TensorOptions options) -> Tensor

# Here `options` is keyword-only and defaults to `{}`.
- func: empty_strided(IntList size, IntList stride, *, TensorOptions options={}) -> Tensor

# erf / erfc / exp / expm1 all follow the `op` / `op_` / `op_out` triple:
# only the `_` and `_out` entries name CPU/CUDA dispatch targets, and the
# `_out` entry declares no `variants` key.
- func: erf(Tensor self) -> Tensor
  variants: function, method

- func: erf_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _erf__cpu
    CUDA: _erf__cuda

- func: erf_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _erf_out_cpu
    CUDA: _erf_out_cuda

# erfc
- func: erfc(Tensor self) -> Tensor
  variants: function, method

- func: erfc_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _erfc__cpu
    CUDA: _erfc__cuda

- func: erfc_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _erfc_out_cpu
    CUDA: _erfc_out_cuda

# exp
- func: exp(Tensor self) -> Tensor
  variants: function, method

- func: exp_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _exp__cpu
    CUDA: _exp__cuda

- func: exp_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _exp_out_cpu
    CUDA: _exp_out_cuda

# expm1
- func: expm1(Tensor self) -> Tensor
  variants: function, method

- func: expm1_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _expm1__cpu
    CUDA: _expm1__cuda

- func: expm1_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _expm1_out_cpu
    CUDA: _expm1_out_cuda

# expand / expand_as: method-only, device guard disabled. `implicit` is
# keyword-only and defaults to false.
- func: expand(Tensor self, IntList size, *, bool implicit=false) -> Tensor
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
  device_guard: false

- func: expand_as(Tensor self, Tensor other) -> Tensor
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
  device_guard: false

# eye: overloads taking `n` alone or `n` and `m`. The factory forms list no
# dispatch table.
- func: eye(int64_t n, TensorOptions options={}) -> Tensor

- func: eye(int64_t n, int64_t m, TensorOptions options={}) -> Tensor

# Both `eye_out` overloads map to the same `eye_out_cpu` / `eye_out_cuda`
# target names.
- func: eye_out(Tensor result, int64_t n) -> Tensor
  dispatch:
    CPU: eye_out_cpu
    CUDA: eye_out_cuda

- func: eye_out(Tensor result, int64_t n, int64_t m) -> Tensor
  dispatch:
    CPU: eye_out_cpu
    CUDA: eye_out_cuda

# `start_dim` defaults to 0 and `end_dim` to -1.
- func: flatten(Tensor self, int64_t start_dim=0, int64_t end_dim=-1) -> Tensor
  variants: function, method

# fill_: two overloads, taking the fill value as a Scalar or as a Tensor.
- func: fill_(Tensor self, Scalar value) -> Tensor
  variants: function, method

- func: fill_(Tensor self, Tensor value) -> Tensor
  variants: function, method

# floor: `op` / `op_` / `op_out` triple; only `_` and `_out` name CPU/CUDA
# dispatch targets.
- func: floor(Tensor self) -> Tensor
  variants: function, method

- func: floor_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _floor__cpu
    CUDA: _floor__cuda

- func: floor_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _floor_out_cpu
    CUDA: _floor_out_cuda

# full / full_out / full_like: `fill_value` is a required Scalar in every
# overload. `full_like` has one overload without options and one with a
# required, keyword-only `options`.
- func: full(IntList size, Scalar fill_value, TensorOptions options={}) -> Tensor

- func: full_out(Tensor result, IntList size, Scalar fill_value) -> Tensor

- func: full_like(Tensor self, Scalar fill_value) -> Tensor

- func: full_like(Tensor self, Scalar fill_value, *, TensorOptions options) -> Tensor

# NOTE [ grid_sampler Native Functions ]
# `grid_sampler` does all the shape checking and then dispatches to one of
# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which
# has the corresponding backward defined as native functions as well. Therefore,
# in these functions and their backwards, no more shape checking is done.
#
# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to
# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in
# `interpolation_mode` because it only supports Bilinear interpolation mode.
#
# In the entries below, `grid_sampler` itself lists no dispatch table; the
# 2d/3d forwards and backwards each name separate CPU and CUDA targets, and
# each backward returns a pair of tensors.
- func: grid_sampler(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor

- func: grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor
  dispatch:
    CPU: grid_sampler_2d_cpu
    CUDA: grid_sampler_2d_cuda

- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor)
  dispatch:
    CPU: grid_sampler_2d_backward_cpu
    CUDA: grid_sampler_2d_backward_cuda

- func: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor
  dispatch:
    CPU: grid_sampler_3d_cpu
    CUDA: grid_sampler_3d_cuda

- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor)
  dispatch:
    CPU: grid_sampler_3d_backward_cpu
    CUDA: grid_sampler_3d_backward_cuda

# Window factories. Optional leading arguments are expressed as separate
# overloads rather than defaults: each overload appends one more required
# argument before the trailing `TensorOptions options={}`.
- func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor

- func: hann_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor

# hamming_window has four overloads, successively adding `periodic`, `alpha`
# and `beta`.
- func: hamming_window(int64_t window_length, TensorOptions options={}) -> Tensor

- func: hamming_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor

- func: hamming_window(int64_t window_length, bool periodic, double alpha, TensorOptions options={}) -> Tensor

- func: hamming_window(int64_t window_length, bool periodic, double alpha, double beta, TensorOptions options={}) -> Tensor

# `margin` defaults to 1.0 and `reduction` to `Reduction::Mean`.
- func: hinge_embedding_loss(Tensor self, Tensor target, double margin=1.0, int64_t reduction=Reduction::Mean) -> Tensor

- func: ger(Tensor self, Tensor vec2) -> Tensor
  variants: function, method

- func: ger_out(Tensor result, Tensor self, Tensor vec2) -> Tensor

- func: gesv(Tensor self, Tensor A) -> (Tensor, Tensor)
  variants: function, method

- func: gesv_out(Tensor solution, Tensor lu, Tensor self, Tensor A) -> (Tensor, Tensor)

# gesv handles broadcasting of arbitrary batch dims while _gesv_helper does not.
- func: _gesv_helper(Tensor self, Tensor A) -> (Tensor, Tensor)
  variants: function
  dispatch:
    CPU: _gesv_helper_cpu
    CUDA: _gesv_helper_cuda

- func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? bias={}, double eps=1e-5, bool cudnn_enabled=True) -> Tensor

# FFT

- func: fft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor
  variants: function, method

- func: ifft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor
  variants: function, method

- func: rfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true) -> Tensor
  variants: function, method

- func: irfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) -> Tensor
  variants: function, method

- func: _fft_with_size(Tensor self, int64_t signal_ndim, bool complex_input, bool complex_output, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) -> Tensor
  variants: function
  dispatch:
    CPU: _fft_mkl
    CUDA: _fft_cufft

# cuFFT plan cache accessors. None of these take a Tensor argument, and every
# entry sets `device_guard: false`. The setter and the clear entry declare no
# return type.
- func: _cufft_get_plan_cache_size() -> int64_t
  device_guard: false

- func: _cufft_get_plan_cache_max_size() -> int64_t
  device_guard: false

- func: _cufft_set_plan_cache_max_size(int64_t max_size)
  device_guard: false

- func: _cufft_clear_plan_cache()
  device_guard: false

- func: index(Tensor self, TensorList indices) -> Tensor
  variants: function, method
  # NB: This function is special-cased in tools/autograd/gen_variable_type.py

- func: index_copy_(Tensor self, int64_t dim, IndexTensor index, Tensor source) -> Tensor
  variants: method

- func: index_put(Tensor self, TensorList indices, Tensor values) -> Tensor
  variants: function, method

- func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor
  variants: function, method

- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) -> Tensor
  variants: function

- func: inverse(Tensor self) -> Tensor
  variants: function, method

- func: inverse_out(Tensor result, Tensor self) -> Tensor

- func: _inverse_helper(Tensor self) -> Tensor
  dispatch:
    CPU: _inverse_helper_cpu
    CUDA: _inverse_helper_cuda

- func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor
  variants: function, method

# Tensor property queries. Each entry below returns a plain `bool`, is exposed
# as both a function and a method, and sets `device_guard: false`.
- func: is_distributed(Tensor self) -> bool
  variants: function, method
  device_guard: false

- func: is_floating_point(Tensor self) -> bool
  variants: function, method
  device_guard: false

- func: is_complex(Tensor self) -> bool
  variants: function, method
  device_guard: false

- func: is_nonzero(Tensor self) -> bool
  variants: function, method
  device_guard: false

- func: is_same_size(Tensor self, Tensor other) -> bool
  variants: function, method
  device_guard: false

- func: is_signed(Tensor self) -> bool
  variants: function, method
  device_guard: false

- func: kl_div(Tensor self, Tensor target, int64_t reduction=Reduction::Mean) -> Tensor

- func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int64_t reduction=Reduction::Mean) -> Tensor
  dispatch:
    CPU: kl_div_backward_cpu
    CUDA: kl_div_backward_cuda

- func: kthvalue(Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor)
  variants: function, method

- func: kthvalue_out(Tensor values, Tensor indices, Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor)

# NOTE(review): the flag is spelled `cudnn_enable` here but `cudnn_enabled` in
# group_norm and instance_norm. Renaming it would change the argument name
# callers see, so the spelling is left as is.
- func: layer_norm(Tensor input, IntList normalized_shape, Tensor? weight={}, Tensor? bias={}, double eps=1e-5, bool cudnn_enable=True) -> Tensor

# `bias` may be omitted: its `{}` default is an undefined Tensor, so it is
# declared nullable (`Tensor?`), matching the optional weight/bias arguments of
# group_norm and layer_norm.
- func: linear(Tensor input, Tensor weight, Tensor? bias={}) -> Tensor

- func: linspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor

- func: linspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor

- func: linspace_out(Tensor result, Scalar start, Scalar end) -> Tensor

- func: linspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor

- func: log(Tensor self) -> Tensor
  variants: function, method

- func: log_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _log__cpu
    CUDA: _log__cuda

- func: log_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _log_out_cpu
    CUDA: _log_out_cuda

- func: log10(Tensor self) -> Tensor
  variants: function, method

- func: log10_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _log10__cpu
    CUDA: _log10__cuda

- func: log10_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _log10_out_cpu
    CUDA: _log10_out_cuda

- func: log1p(Tensor self) -> Tensor
  variants: function, method

- func: log1p_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _log1p__cpu
    CUDA: _log1p__cuda
    SparseCPU: log1p_sparse_
    SparseCUDA: log1p_sparse_

- func: log1p_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _log1p_out_cpu
    CUDA: _log1p_out_cuda
    SparseCPU: log1p_out_sparse
    SparseCUDA: log1p_out_sparse

- func: log2(Tensor self) -> Tensor
  variants: function, method

- func: log2_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _log2__cpu
    CUDA: _log2__cuda

- func: log2_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _log2_out_cpu
    CUDA: _log2_out_cuda

- func: logdet(Tensor self) -> Tensor
  variants: function, method

- func: logspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor

- func: logspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor

- func: logspace_out(Tensor result, Scalar start, Scalar end) -> Tensor

- func: logspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor

# FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
- func: log_softmax(Tensor self, int64_t dim, ScalarType dtype) -> Tensor
  variants: function, method

- func: log_softmax(Tensor self, int64_t dim) -> Tensor
  variants: function, method

- func: _log_softmax(Tensor self, int64_t dim, bool half_to_float) -> Tensor
  dispatch:
    CPU: log_softmax_cpu
    CUDA: log_softmax_cuda

- func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor
  dispatch:
    CPU: log_softmax_backward_cpu
    CUDA: log_softmax_backward_cuda

- func: logsumexp(Tensor self, int64_t dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: logsumexp_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor

- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::Mean) -> Tensor

- func: matmul(Tensor self, Tensor other) -> Tensor
  variants: function, method

- func: matmul_out(Tensor result, Tensor self, Tensor other) -> Tensor

- func: matrix_rank(Tensor self, double tol, bool symmetric=false) -> Tensor

- func: matrix_rank(Tensor self, bool symmetric=false) -> Tensor

- func: matrix_power(Tensor self, int64_t n) -> Tensor
  variants: function, method

- func: max(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor)
  variants: function, method

# NOTE(review): the second output is named `max_values` here, whereas min_out
# names its second output `min_indices`. Renaming would change the argument
# name callers see, so it is left as is; confirm which name is intended.
- func: max_out(Tensor max, Tensor max_values, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor)

- func: max_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function, method

# Max pooling entry points; all four share the same argument list and defaults.
# NOTE(review): max_pool2d and max_pool3d declare every list argument as
# `IntList[1]`, exactly like the 1d variants. Confirm whether `IntList[2]` /
# `IntList[3]` was intended before relying on single-int arguments here.
- func: max_pool1d_with_indices(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor)

- func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor

- func: max_pool2d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor

- func: max_pool3d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor

# FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
- func: mean(Tensor self, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: mean(Tensor self) -> Tensor
  variants: function, method

- func: mean(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: mean(Tensor self, int64_t dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: mean(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor

- func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor

- func: mean_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor

- func: median(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor)
  variants: function, method

- func: median_out(Tensor values, Tensor indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor)

- func: min(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor)
  variants: function, method

- func: min_out(Tensor min, Tensor min_indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor)

- func: min_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
  variants: function, method

- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation, int64_t groups) -> Tensor

- func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> Tensor

- func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> (Tensor, Tensor)

- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)

- func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double exponential_average_factor, double epsilon) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: miopen_batch_norm

- func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, double epsilon) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: miopen_batch_norm_backward

- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution

- func: miopen_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution_backward_input

- func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: miopen_convolution_backward

- func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor
  dispatch:
    CUDA: miopen_convolution_backward_bias

- func: miopen_convolution_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution_backward_weight

- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution_transpose

# NB: output_padding not strictly needed here, but it's helpful for the double
# backwards
- func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: miopen_convolution_transpose_backward

- func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution_transpose_backward_input

- func: miopen_convolution_transpose_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution_transpose_backward_weight

- func: mm(Tensor self, Tensor mat2) -> Tensor
  variants: function, method

- func: mm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor

- func: mode(Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor)
  variants: function, method

- func: mode_out(Tensor values, Tensor indices, Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor)

- func: mul(Tensor self, Tensor other) -> Tensor
  variants: function, method

- func: mul_(Tensor self, Tensor other) -> Tensor
  variants: method

- func: mul_out(Tensor result, Tensor self, Tensor other) -> Tensor

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: mul(Tensor self, Scalar other) -> Tensor
  variants: function, method

- func: mul_(Tensor self, Scalar other) -> Tensor
  variants: method

- func: mv(Tensor self, Tensor vec) -> Tensor
  variants: function, method

- func: mv_out(Tensor result, Tensor self, Tensor vec) -> Tensor

- func: mvlgamma(Tensor self, int64_t p) -> Tensor
  variants: function, method

- func: mvlgamma_(Tensor self, int64_t p) -> Tensor
  variants: method

- func: narrow_copy(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor
  variants: method
  dispatch:
    CPU: narrow_copy_dense
    CUDA: narrow_copy_dense
    SparseCPU: narrow_copy_sparse
    SparseCUDA: narrow_copy_sparse

- func: narrow(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor
  variants: function, method
  device_guard: false

- func: ones(IntList size, TensorOptions options={}) -> Tensor

- func: ones_out(Tensor result, IntList size) -> Tensor

- func: ones_like(Tensor self) -> Tensor

- func: ones_like(Tensor self, *, TensorOptions options) -> Tensor

- func: pairwise_distance(Tensor x1, Tensor x2, double p=2, double eps=1e-6, bool keepdim=false) -> Tensor

- func: pdist(Tensor self, double p=2) -> Tensor

- func: _pdist_forward(Tensor self, double p=2) -> Tensor

- func: _pdist_backward(Tensor grad, Tensor self, double p, Tensor pdist) -> Tensor

- func: permute(Tensor self, IntList dims) -> Tensor
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.

- func: pixel_shuffle(Tensor self, int64_t upscale_factor) -> Tensor

- func: pin_memory(Tensor self) -> Tensor
  variants: function, method

- func: pinverse(Tensor self, double rcond=1e-15) -> Tensor
  variants: function, method

- func: rand(IntList size, *, TensorOptions options={}) -> Tensor

- func: rand(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor

- func: rand_out(Tensor result, IntList size, *) -> Tensor

- func: rand_out(Tensor result, IntList size, *, Generator* generator) -> Tensor

- func: rand_like(Tensor self) -> Tensor

- func: rand_like(Tensor self, *, TensorOptions options) -> Tensor

- func: randint(int64_t high, IntList size, *, TensorOptions options={}) -> Tensor

- func: randint(int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor

- func: randint(int64_t low, int64_t high, IntList size, *, TensorOptions options={}) -> Tensor

- func: randint(int64_t low, int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor

- func: randint_out(Tensor result, int64_t high, IntList size, *) -> Tensor

- func: randint_out(Tensor result, int64_t high, IntList size, *, Generator* generator) -> Tensor

- func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *) -> Tensor

- func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *, Generator* generator) -> Tensor

- func: randint_like(Tensor self, int64_t high) -> Tensor

- func: randint_like(Tensor self, int64_t low, int64_t high) -> Tensor

- func: randint_like(Tensor self, int64_t high, *, TensorOptions options) -> Tensor

- func: randint_like(Tensor self, int64_t low, int64_t high, *, TensorOptions options) -> Tensor

- func: randn(IntList size, *, TensorOptions options={}) -> Tensor

- func: randn(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor

- func: randn_out(Tensor result, IntList size, *) -> Tensor

- func: randn_out(Tensor result, IntList size, *, Generator* generator) -> Tensor

- func: randn_like(Tensor self) -> Tensor

- func: randn_like(Tensor self, *, TensorOptions options) -> Tensor

- func: randperm(int64_t n, *, TensorOptions options={}) -> Tensor

- func: randperm(int64_t n, *, Generator* generator, TensorOptions options={}) -> Tensor

- func: randperm_out(Tensor result, int64_t n, *) -> Tensor

# Of the two randperm_out overloads, only this generator-taking one lists
# explicit CPU/CUDA kernels.
- func: randperm_out(Tensor result, int64_t n, *, Generator* generator) -> Tensor
  dispatch:
    CPU: randperm_out_cpu
    CUDA: randperm_out_cuda

- func: range(Scalar start, Scalar end, TensorOptions options={}) -> Tensor

- func: range(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor

- func: range_out(Tensor result, Scalar start, Scalar end) -> Tensor

- func: range_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor

- func: repeat(Tensor self, IntList repeats) -> Tensor
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.

- func: reshape(Tensor self, IntList shape) -> Tensor
  variants: function, method
  device_guard: false

- func: reshape_as(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

- func: RoiPooling2d_forward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) -> (Tensor, Tensor)
  dispatch:
    CPU: RoiPooling2d_forward_cpu
    CUDA: RoiPooling2d_forward_cuda

- func: RoiPooling2d_backward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, Tensor gradOutput, Tensor argmaxes) -> Tensor
  dispatch:
    CPU: RoiPooling2d_backward_cpu
    CUDA: RoiPooling2d_backward_cuda

- func: round(Tensor self) -> Tensor
  variants: function, method

- func: round_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _round__cpu
    CUDA: _round__cuda

- func: round_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _round_out_cpu
    CUDA: _round_out_cuda

# Default bounds: lower = 0.125 (1/8) and upper = 0.3333333333333333 (1/3
# written out as a decimal literal). Both overloads share the same defaults.
- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor

- func: rrelu_(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor

- func: relu(Tensor self) -> Tensor
  variants: function, method

- func: relu_(Tensor self) -> Tensor
  variants: function, method

- func: prelu(Tensor self, Tensor weight) -> Tensor
  variants: function, method
  dispatch:
    CPU: prelu_cpu
    CUDA: prelu_cuda

- func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
  variants: function, method
  dispatch:
    CPU: prelu_backward_cpu
    CUDA: prelu_backward_cuda

- func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
  variants: function, method
  dispatch:
    CPU: hardshrink_cpu
    CUDA: hardshrink_cuda

- func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
  variants: function, method
  dispatch:
    CPU: hardshrink_backward_cpu
    CUDA: hardshrink_backward_cuda

- func: rsqrt(Tensor self) -> Tensor
  variants: function, method

- func: rsqrt_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _rsqrt__cpu
    CUDA: _rsqrt__cuda

- func: rsqrt_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _rsqrt_out_cpu
    CUDA: _rsqrt_out_cuda

- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor
  variants: function, method
  device_guard: false

- func: selu(Tensor self) -> Tensor

- func: selu_(Tensor self) -> Tensor

- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor

- func: celu_(Tensor self, Scalar alpha=1.0) -> Tensor

- func: sigmoid(Tensor self) -> Tensor
  variants: function, method

- func: sigmoid_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _sigmoid__cpu
    CUDA: _sigmoid__cuda

- func: sigmoid_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _sigmoid_out_cpu
    CUDA: _sigmoid_out_cuda

- func: sin(Tensor self) -> Tensor
  variants: function, method

- func: sin_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _sin__cpu
    CUDA: _sin__cuda

- func: sin_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _sin_out_cpu
    CUDA: _sin_out_cuda

- func: sinh(Tensor self) -> Tensor
  variants: function, method

- func: sinh_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _sinh__cpu
    CUDA: _sinh__cuda

- func: sinh_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _sinh_out_cpu
    CUDA: _sinh_out_cuda

- func: detach(Tensor self) -> Tensor
  variants: function, method

- func: detach_(Tensor self) -> Tensor
  variants: function, method

- func: size(Tensor self, int64_t dim) -> int64_t
  variants: function, method
  device_guard: false

# The default `end` of 9223372036854775807 is 2^63 - 1, the largest int64_t
# value; presumably it stands for "through the end of the dimension" (confirm
# in the native implementation).
- func: slice(Tensor self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) -> Tensor
  variants: function, method
  device_guard: false

- func: slogdet(Tensor self) -> (Tensor, Tensor)
  variants: function, method

- func: smm(Tensor self, Tensor mat2) -> Tensor
  variants: function, method

# FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
- func: softmax(Tensor self, int64_t dim, ScalarType dtype) -> Tensor
  variants: function, method

- func: softmax(Tensor self, int64_t dim) -> Tensor
  variants: function, method

- func: _softmax(Tensor self, int64_t dim, bool half_to_float) -> Tensor
  dispatch:
    CPU: softmax_cpu
    CUDA: softmax_cuda

- func: _softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor
  dispatch:
    CPU: softmax_backward_cpu
    CUDA: softmax_backward_cuda

- func: _sparse_add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  dispatch:
    SparseCPU: add_out_sparse_cpu
    SparseCUDA: add_out_sparse_cuda

# Unlike the neighbouring `_sparse_*_out` helpers, this entry dispatches on the
# dense CPU/CUDA keys; its `other` argument is typed `SparseTensorRef`.
- func: _sparse_dense_add_out(Tensor result, Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor
  dispatch:
    CPU: add_out_dense_sparse_cpu
    CUDA: add_out_dense_sparse_cuda

- func: _sparse_div_zerodim_out(Tensor result, Tensor self, Tensor other) -> Tensor
  dispatch:
    SparseCPU: div_out_sparse_zerodim
    SparseCUDA: div_out_sparse_zerodim

- func: _sparse_div_scalar_out(Tensor result, Tensor self, Scalar other) -> Tensor
  dispatch:
    SparseCPU: div_out_sparse_scalar
    SparseCUDA: div_out_sparse_scalar

- func: _sparse_mul_out(Tensor result, Tensor self, Tensor other) -> Tensor
  dispatch:
    SparseCPU: mul_out_sparse_cpu
    SparseCUDA: mul_out_sparse_cuda

- func: _sparse_mul_zerodim_out(Tensor result, Tensor self, Tensor other) -> Tensor
  dispatch:
    SparseCPU: mul_out_sparse_zerodim
    SparseCUDA: mul_out_sparse_zerodim

- func: _sparse_mul_scalar_out(Tensor result, Tensor self, Scalar other) -> Tensor
  dispatch:
    SparseCPU: mul_out_sparse_scalar
    SparseCUDA: mul_out_sparse_scalar

- func: split(Tensor self, int64_t split_size, int64_t dim=0) -> TensorList
  variants: function, method
  device_guard: false

- func: split_with_sizes(Tensor self, IntList split_sizes, int64_t dim=0) -> TensorList
  variants: function, method
  device_guard: false

- func: squeeze(Tensor self) -> Tensor
  variants: function, method
  device_guard: false

- func: squeeze(Tensor self, int64_t dim) -> Tensor
  variants: function, method
  device_guard: false

- func: squeeze_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: squeeze_(Tensor self, int64_t dim) -> Tensor
  variants: method
  device_guard: false

- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function, method

# Registered for all four backends. The dense CPU/CUDA keys map to the
# `_sspaddmm_out_only_sparse*` functions; judging by their names these exist to
# reject dense use -- TODO confirm against the implementation.
- func: sspaddmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  dispatch:
    CPU: _sspaddmm_out_only_sparse
    CUDA: _sspaddmm_out_only_sparse_cuda
    SparseCPU: _sspaddmm_out_cpu
    SparseCUDA: _sspaddmm_out_cuda

- func: stack(TensorList tensors, int64_t dim=0) -> Tensor

- func: stack_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor

# The signature is designed to be consistent with librosa, except that it
# omits the `pad_mode` and `center` arguments, which are handled in
# `torch/functional.py`. They should be moved here once codegen has a mapping
# between Python strings and C++ enums.
- func: stft(Tensor self, int64_t n_fft, int64_t hop_length, int64_t win_length, Tensor? window={}, bool normalized=false, bool onesided=true) -> Tensor
  variants: function, method
  # Default expressions for the Python binding (per the key name), both derived
  # from `n_fft`: hop_length = n_fft >> 2, i.e. floor(n_fft / 4), and
  # win_length = n_fft.
  python_default_init:
    hop_length: n_fft >> 2
    win_length: n_fft

- func: stride(Tensor self, int64_t dim) -> int64_t
  variants: function, method
  device_guard: false

# FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
- func: sum(Tensor self, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: sum(Tensor self) -> Tensor
  variants: function, method

- func: sum(Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: sum(Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: sum(Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor

- func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor

- func: sum_out(Tensor result, Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor

- func: sqrt(Tensor self) -> Tensor
  variants: function, method

- func: sqrt_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _sqrt__cpu
    CUDA: _sqrt__cuda

- func: sqrt_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _sqrt_out_cpu
    CUDA: _sqrt_out_cuda

- func: std(Tensor self, bool unbiased=true) -> Tensor
  variants: function, method

- func: std(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor
  variants: function, method

- func: std_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor

# FIXME: These could be combined as optional<ScalarType> but for https://github.com/pytorch/pytorch/issues/6593.
- func: prod(Tensor self, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: prod(Tensor self) -> Tensor
  variants: function, method

- func: prod(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: prod(Tensor self, int64_t dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: prod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor

- func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor

- func: prod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor

- func: t(Tensor self) -> Tensor
  variants: function, method

- func: t_(Tensor self) -> Tensor
  variants: method

- func: tan(Tensor self) -> Tensor
  variants: function, method

- func: tan_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _tan__cpu
    CUDA: _tan__cuda

- func: tan_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _tan_out_cpu
    CUDA: _tan_out_cuda

- func: tanh(Tensor self) -> Tensor
  variants: function, method

- func: tanh_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _tanh__cpu
    CUDA: _tanh__cuda

- func: tanh_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _tanh_out_cpu
    CUDA: _tanh_out_cuda

- func: tensordot(Tensor self, Tensor other, IntList dims_self, IntList dims_other) -> Tensor
  variants: function

- func: transpose(Tensor self, int64_t dim0, int64_t dim1) -> Tensor
  variants: function, method
  device_guard: false

- func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor
  variants: method
  device_guard: false

- func: flip(Tensor self, IntList dims) -> Tensor
  variants: function, method
  dispatch:
    CPU: flip_cpu
    CUDA: flip_cuda

# The default IntList value must be written {0,1} with no space after the
# comma, because native_parse.py uses ', ' to split args.
- func: rot90(Tensor self, int64_t k=1, IntList dims={0,1}) -> Tensor
  variants: function, method

- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim=1) -> Tensor

- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, int64_t reduction=Reduction::Mean) -> Tensor

- func: trunc(Tensor self) -> Tensor
  variants: function, method

- func: trunc_(Tensor self) -> Tensor
  variants: function, method
  dispatch:
    CPU: _trunc__cpu
    CUDA: _trunc__cuda

- func: trunc_out(Tensor result, Tensor self) -> Tensor
  dispatch:
    CPU: _trunc_out_cpu
    CUDA: _trunc_out_cuda

# type_as is method-only.
- func: type_as(Tensor self, Tensor other) -> Tensor
  variants: method

# _unique and _unique_dim each return a pair of Tensors and are function-only;
# _unique_dim additionally takes an explicit `dim`.
- func: _unique(Tensor self, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor)
  variants: function
  dispatch:
    CPU: _unique_cpu
    CUDA: _unique_cuda

- func: _unique_dim(Tensor self, int64_t dim, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor)
  variants: function
  dispatch:
    CPU: _unique_dim_cpu
    CUDA: _unique_dim_cuda

# _unsafe_view declares no `variants` or `dispatch`.
- func: _unsafe_view(Tensor self, IntList size) -> Tensor

# unsqueeze and unsqueeze_ both set device_guard: false; the in-place form is
# method-only.
- func: unsqueeze(Tensor self, int64_t dim) -> Tensor
  variants: function, method
  device_guard: false

- func: unsqueeze_(Tensor self, int64_t dim) -> Tensor
  variants: method
  device_guard: false

# var has two overloads (with and without `dim`) plus an _out form that takes
# `dim`. `unbiased` defaults to true in all three.
- func: var(Tensor self, bool unbiased=true) -> Tensor
  variants: function, method

- func: var(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor
  variants: function, method

- func: var_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor

# view_as is method-only and sets device_guard: false.
- func: view_as(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

# We define both of these because 'where' does the broadcast and '_s_where'
# doesn't; this allows us to implicitly calculate the broadcast derivative,
# while only dealing with the _s_where derivative.
- func: where(BoolTensor condition, Tensor self, Tensor other) -> Tensor
  variants: function, method

# Only _s_where names per-backend kernels.
- func: _s_where(BoolTensor condition, Tensor self, Tensor other) -> Tensor
  variants: function
  dispatch:
    CPU: _s_where_cpu
    CUDA: _s_where_cuda

# Weight-norm family. All entries are function-only.
- func: norm_except_dim(Tensor v, int64_t pow=2, int64_t dim=0) -> Tensor
  variants: function

# VariableType::_weight_norm does not want to be given a gap in the autograd graph,
# so we don't define "dispatch" variants for it.
- func: _weight_norm(Tensor v, Tensor g, int64_t dim=0) -> Tensor
  variants: function

# The two *_cuda_interface entries list a CUDA kernel only; no CPU kernel is named.
- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int64_t dim=0) -> (Tensor, Tensor)
  variants: function
  dispatch:
    CUDA: weight_norm_cuda

- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor)
  variants: function
  dispatch:
    CUDA: weight_norm_cuda_backward

# Same signature as _weight_norm_cuda_interface_backward, but without a dispatch table.
- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor)
  variants: function

# zeros family. `options` defaults to {} for zeros; the TensorOptions overload
# of zeros_like takes it as a keyword-only argument with no default.
- func: zeros(IntList size, TensorOptions options={}) -> Tensor

- func: zeros_out(Tensor result, IntList size) -> Tensor

- func: zeros_like(Tensor self) -> Tensor

- func: zeros_like(Tensor self, *, TensorOptions options) -> Tensor

# Gamma / Poisson entries, each with separate CPU and CUDA kernels.
# Note that _standard_gamma and poisson dispatch to kernels named _s_gamma_*
# and _s_poisson_*, i.e. the kernel names do not mirror the func names.
- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
  variants: function
  dispatch:
    CPU: _standard_gamma_grad_cpu
    CUDA: _standard_gamma_grad_cuda

# `generator` defaults to nullptr.
- func: _standard_gamma(Tensor self, Generator* generator=nullptr) -> Tensor
  variants: function
  dispatch:
    CPU: _s_gamma_cpu
    CUDA: _s_gamma_cuda

- func: poisson(Tensor self, Generator* generator=nullptr) -> Tensor
  dispatch:
    CPU: _s_poisson_cpu
    CUDA: _s_poisson_cuda

# When more variants get ported to native, this dispatch will get more
# complicated

# native_norm only lists sparse backends, and both use the same kernel.
- func: native_norm(Tensor self, Scalar p=2) -> Tensor
  dispatch:
    SparseCPU: norm_sparse
    SparseCUDA: norm_sparse

- func: norm(Tensor self, Scalar p=2) -> Tensor
  variants: function, method

# In the `dim` overloads `p` has no default in the signature; a value of 2 is
# given separately under python_default_init.
- func: norm(Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor
  variants: function, method
  python_default_init:
    p: 2

- func: norm_out(Tensor result, Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor
  python_default_init:
    p: 2

# Frobenius and nuclear norms. All entries, including the _out forms, are
# declared function-only.
- func: frobenius_norm(Tensor self) -> Tensor
  variants: function

- func: frobenius_norm(Tensor self, IntList[1] dim, bool keepdim=false) -> Tensor
  variants: function

- func: frobenius_norm_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=false) -> Tensor
  variants: function

- func: nuclear_norm(Tensor self, bool keepdim=false) -> Tensor
  variants: function

- func: nuclear_norm_out(Tensor result, Tensor self, bool keepdim=false) -> Tensor
  variants: function

# clone / resize_as_: each has a native_* entry that lists only sparse
# backends (same kernel for SparseCPU and SparseCUDA), followed by the
# user-facing entry, which has no dispatch table.
- func: native_clone(Tensor self) -> Tensor
  dispatch:
    SparseCPU: clone_sparse
    SparseCUDA: clone_sparse

- func: clone(Tensor self) -> Tensor
  variants: function, method

- func: native_resize_as_(Tensor self, Tensor the_template) -> Tensor
  dispatch:
    SparseCPU: resize_as_sparse_
    SparseCUDA: resize_as_sparse_

- func: resize_as_(Tensor self, Tensor the_template) -> Tensor
  variants: function, method

# Scalar-exponent pow: the native_* entries list only sparse backends (same
# kernel for SparseCPU and SparseCUDA); pow_out has no dispatch table.
- func: native_pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor
  dispatch:
    SparseCPU: pow_out_sparse_scalar
    SparseCUDA: pow_out_sparse_scalar

- func: native_pow(Tensor self, Scalar exponent) -> Tensor
  dispatch:
    SparseCPU: pow_sparse_scalar
    SparseCUDA: pow_sparse_scalar

- func: pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor

# Scalar-exponent pow, exposed both as a free function and as a Tensor method.
# `variants` must appear exactly once per entry: duplicate mapping keys are
# invalid YAML and loaders silently keep only one of them.
- func: pow(Tensor self, Scalar exponent) -> Tensor
  variants: function, method

# zero_: the native_* entry lists only sparse backends (same kernel for both);
# the user-facing entry has no dispatch table.
- func: native_zero_(Tensor self) -> Tensor
  dispatch:
    SparseCPU: zero_sparse_
    SparseCUDA: zero_sparse_

- func: zero_(Tensor self) -> Tensor
  variants: method, function

# sub / rsub. In the Tensor-`other` overloads `alpha` is keyword-only (after
# `*`); in the Scalar-`other` overloads it is a regular positional argument.
- func: sub_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor

- func: sub(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  variants: function, method

- func: sub_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  variants: method

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: sub(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  variants: function, method

- func: sub_(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  variants: method

# rsub is function-only in both overloads.
- func: rsub(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  variants: function

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: rsub(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  variants: function

# addmm family. The s_native_* entries dispatch on the dense CPU/CUDA keys to
# kernels named *_sparse_dense_*; the plain addmm entries have no dispatch
# table. `beta` and `alpha` are keyword-only and default to 1 throughout.
- func: s_native_addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  dispatch:
    CPU: s_addmm_out_sparse_dense_cpu
    CUDA: s_addmm_out_sparse_dense_cuda

- func: s_native_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  dispatch:
    CPU: s_addmm_sparse_dense_cpu
    CUDA: s_addmm_sparse_dense_cuda

- func: s_native_addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  dispatch:
    CPU: s_addmm_sparse_dense_cpu_
    CUDA: s_addmm_sparse_dense_cuda_

- func: addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor

- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: function, method

- func: addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: method


# NOTE [ Sparse: autograd and API ]
#
#
# Sparse Tensor Constructors
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The API entry points to sparse tensor construction should be
# `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`. Depending on whether the
# indices and values tensors are given, they eventually dispatch to either
# `_sparse_coo_tensor_with_dims` or `_sparse_coo_tensor_with_dims_and_tensors`.
#
# The autograd support for the ctor is implemented on `_sparse_coo_tensor_with_dims_and_tensors`.
#
# The API methods `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`
# **must not** have specific type dispatches because otherwise codegen will
# consider them as abstract methods (see Note [Abstract ATen methods]), dispatch
# using **Tensor** type, and thus lose autograd tracking on the actual method
# they dispatch to, e.g., `_sparse_coo_tensor_with_dims_and_tensors`.
#
# The actual ctors `_sparse_coo_tensor_with_dims` and `_sparse_coo_tensor_with_dims_and_tensors`,
# on the other hand, need to create `SparseTensorImpl` and know nothing about
# how `VariableType`s work. So they need to be dispatched using Tensor types.
# We thus put `requires_tensor: True` to ensure that `VariableType` will unwrap
# the given variables and call with the Tensor type.
#
#
# Sparse Methods API Design
# ~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Goals: 1. Flexible API for users to write custom sparse ops
#        2. ctor and member accessor with autograd support
#
# To achieve 1, we need to provide a set of *dangerous* APIs (dangerous in the
# sense that misusing them will break sparse tensor invariant and may result in
# unexpected behavior, e.g., crash). These methods are all prefixed with
# underscore "_" to indicate that they should be used with care. We provide:
#
#   + `_indices()`: returns the *raw* indices within the sparse tensor (not just
#                   sharing storage). Any inplace operation will change the
#                   actual indices, including t_, set_, as_strided_, resize_,
#                   etc.
#   + `_values()`: returns the *raw* values within the sparse tensor. Similar
#                  semantics as `_indices()`
#   + `_nnz()`: returns the number of non-zero entries. This will always be
#               determined by the shapes of indices and values.
#   + `_coalesced_(bool)`: inplace sets whether the tensor is coalesced, and
#                          returns itself.
#
# These methods are very useful in writing new operations, e.g., a custom
# autograd Function.
#
# We also provide other public *safe* APIs:
#   + `indices()`: returns a **view** of the indices tensor if the sparse tensor
#                  is **coalesced**.
#   + `values()`: returns a **view** of the values tensor if the containing
#                 sparse tensor is **coalesced**.
#   + `sparse_dim()`: number of sparse dimensions
#   + `dense_dim()`: number of dense dimensions
#   + `is_coalesced()`: whether the sparse tensor is coalesced
#
# `_indices()` and `_values()` should return the raw indices and values dense
# tensors within a sparse tensor. They can be quite unsafe with inplace
# operations like `t_()`, and expose uncoalesced indices and values. The public
# recommended API is `indices()` and `values()`, both of which first check that
# the tensor is coalesced and return views on those tensors.
#
#
# Autograd Support
# ~~~~~~~~~~~~~~~~
#
# Autograd is supported on `values()` and sparse tensor ctor with indices and
# values tensors. E.g., `torch.sparse_coo_tensor(i, v).values().sum()` is
# differentiable w.r.t. `v`.
#
# NB: The `values()` and `_values()` operators are special in that they are
# layout-aware, i.e., the output depends not just on the data it represents, but
# also on the input layout details (in this case, the `indices` tensor). See
# NOTE [ as_strided Backward and layout-aware/agnostic autograd ] in Functions.cpp
# for discussion on layout-aware vs layout-agnostic autograd. Since PyTorch ops
# operate in the layout-agnostic mode, similar to `as_strided`, backward of
# these two operators need to consider them in a layout-agnostic way:
#   + `values()`:
#     Input is coalesced.
#     We just pretend having `input.indices()` as an additional argument
#     `input_indices`, then forward is similar to
#     `input.to(kStrided).index_select(input_indices)` regardless of the layout.
#     Note that `values()` normally is layout-aware even if we constrain
#     ourselves on sparse inputs since it may include all zeros values entries
#     as "present" entries.
#   + `_values()`:
#     Input may be uncoalesced.
#     It is not straightforward to construct a layout-agnostic version because
#     duplicate indices entries may exist and additional parameterization is
#     needed to distribute the value into different values entries. Furthermore,
#     this op is intended to provide ways to write custom sparse ops, rather
#     than being used in autograd graph, so it is marked as *non-differentiable*
#     in derivatives.yaml.
#
# Before reading the following, see NOTE [ Autograd Variable Views ] in
# variable.h for details on views that are tracked by autograd, and views that
# are not.
#
# Moreover, these methods return tensors that share storage with inputs, so we
# mark these methods as view ops to support autograd history tracking.
# The sparse tensor ctor output should technically be view of both input indices
# and values tensors, but currently we only support setting as view of a single
# Variable, so it is only view of the values tensor.
# TODO: clone indices in sparse tensor ctor.
#
# For other methods that return outputs that share storage with inputs, i.e.,
# `indices()` and `_indices()`, we mark their outputs as non-differentiable, so
# the view relation is not tracked by autograd, but the version counter is still
# shared. In other words, their outputs are non-differentiable views of the
# sparse tensor.


# FIXME: it would be nicer if TensorOptions were optional-based. No default is
# given for `options` in the size-only overload because a default would never
# make sense there.
- func: sparse_coo_tensor(IntList size, *, TensorOptions options) -> Tensor

# The overloads that take indices and values default `options` to {}.
- func: sparse_coo_tensor(IndexTensor indices, Tensor values, *, TensorOptions options={}) -> Tensor

- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size, *, TensorOptions options={}) -> Tensor

# Same signature as the three-argument sparse_coo_tensor overload above.
- func: _sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size, *, TensorOptions options={}) -> Tensor


# Sparse constructors with explicit sparse_dim/dense_dim. Both dispatch only on
# the sparse backends (same kernel for SparseCPU and SparseCUDA) and set
# requires_tensor: True. `options` is keyword-only with no default.
- func: _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, IntList size, *, TensorOptions options) -> Tensor
  dispatch:
    SparseCPU: new_with_dims_sparse
    SparseCUDA: new_with_dims_sparse
  requires_tensor: True

- func: _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, IntList size, Tensor indices, Tensor values, *, TensorOptions options) -> Tensor
  dispatch:
    SparseCPU: new_with_dims_and_tensor_sparse
    SparseCUDA: new_with_dims_and_tensor_sparse
  requires_tensor: True


# In-place sparse resize methods. Both are method-only, share one signature,
# dispatch only on the sparse backends and set requires_tensor: True.
- func: sparse_resize_(Tensor self, IntList size, int64_t sparse_dim, int64_t dense_dim) -> Tensor
  variants: method
  dispatch:
    SparseCPU: sparse_resize_
    SparseCUDA: sparse_resize_
  requires_tensor: True

- func: sparse_resize_and_clear_(Tensor self, IntList size, int64_t sparse_dim, int64_t dense_dim) -> Tensor
  variants: method
  dispatch:
    SparseCPU: sparse_resize_and_clear_
    SparseCUDA: sparse_resize_and_clear_
  requires_tensor: True


# sparse_mask dispatches on the dense CPU/CUDA keys and takes the mask as a
# SparseTensorRef.
- func: sparse_mask(Tensor self, SparseTensorRef mask) -> Tensor
  variants: method
  dispatch:
    CPU: sparse_mask_cpu
    CUDA: sparse_mask_cuda
  requires_tensor: True


# to_dense dispatches on the sparse backends only (same kernel for both).
- func: to_dense(Tensor self) -> Tensor
  variants: method
  dispatch:
    SparseCPU: sparse_to_dense
    SparseCUDA: sparse_to_dense
  requires_tensor: True


# sparse_dim / dense_dim and their legacy aliases _dimI / _dimV. All four are
# method-only, return int64_t, and set requires_tensor: True and
# device_guard: False. Each legacy alias names the same kernel as its
# replacement, but gives `dispatch` as a single kernel name instead of a
# per-backend table.
- func: sparse_dim(Tensor self) -> int64_t
  variants: method
  dispatch:
    SparseCPU: sparse_dim_sparse
    SparseCUDA: sparse_dim_sparse
  requires_tensor: True
  device_guard: False

# legacy method
- func: _dimI(Tensor self) -> int64_t
  variants: method
  dispatch: sparse_dim_sparse
  requires_tensor: True
  device_guard: False


- func: dense_dim(Tensor self) -> int64_t
  variants: method
  dispatch:
    SparseCPU: dense_dim_sparse
    SparseCUDA: dense_dim_sparse
  requires_tensor: True
  device_guard: False

# legacy method
- func: _dimV(Tensor self) -> int64_t
  variants: method
  dispatch: dense_dim_sparse
  requires_tensor: True
  device_guard: False


# _nnz and is_coalesced use one kernel for both sparse backends and set
# device_guard: False.
- func: _nnz(Tensor self) -> int64_t
  variants: method
  dispatch:
    SparseCPU: _nnz_sparse
    SparseCUDA: _nnz_sparse
  requires_tensor: True
  device_guard: False


# coalesce has distinct CPU and CUDA kernels and does not set device_guard.
- func: coalesce(Tensor self) -> Tensor
  variants: method
  dispatch:
    SparseCPU: coalesce_sparse_cpu
    SparseCUDA: coalesce_sparse_cuda
  requires_tensor: True


- func: is_coalesced(Tensor self) -> bool
  variants: method
  dispatch:
    SparseCPU: is_coalesced_sparse
    SparseCUDA: is_coalesced_sparse
  requires_tensor: True
  device_guard: False


# Sparse accessors. Every entry here is method-only, uses one kernel for both
# sparse backends, and sets requires_tensor: True and device_guard: False.
- func: _indices(Tensor self) -> Tensor
  variants: method
  dispatch:
    SparseCPU: _indices_sparse
    SparseCUDA: _indices_sparse
  requires_tensor: True
  device_guard: False

- func: _values(Tensor self) -> Tensor
  variants: method
  dispatch:
    SparseCPU: _values_sparse
    SparseCUDA: _values_sparse
  requires_tensor: True
  device_guard: False

# This method doesn't do any check but only directly sets the flag. So it can be
# a bit unsafe. Similar to _indices and _values, this is useful for implementing
# custom sparse operations in Python/C++ extension.
- func: _coalesced_(Tensor self, bool coalesced) -> Tensor
  variants: method
  dispatch:
    SparseCPU: _coalesced_sparse_
    SparseCUDA: _coalesced_sparse_
  requires_tensor: True
  device_guard: False

# Non-underscore counterparts of _indices / _values, with their own kernels.
- func: indices(Tensor self) -> Tensor
  variants: method
  dispatch:
    SparseCPU: indices_sparse
    SparseCUDA: indices_sparse
  requires_tensor: True
  device_guard: False

- func: values(Tensor self) -> Tensor
  variants: method
  dispatch:
    SparseCPU: values_sparse
    SparseCUDA: values_sparse
  requires_tensor: True
  device_guard: False


# hspmm / hspmm_out have distinct CPU and CUDA kernels on the sparse backends.
- func: hspmm_out(Tensor result, Tensor mat1, Tensor mat2) -> Tensor
  dispatch:
    SparseCPU: hspmm_out_sparse_cpu
    SparseCUDA: hspmm_out_sparse_cuda
  requires_tensor: True

- func: hspmm(Tensor mat1, Tensor mat2) -> Tensor
  dispatch:
    SparseCPU: hspmm_sparse_cpu
    SparseCUDA: hspmm_sparse_cuda
  requires_tensor: True

# Function-only; one kernel (copy_sparse_) for both sparse backends.
- func: copy_sparse_to_sparse_(Tensor self, Tensor src, bool non_blocking=false) -> Tensor
  variants: function
  dispatch:
    SparseCPU: copy_sparse_
    SparseCUDA: copy_sparse_
  requires_tensor: True

# numel returns int64_t and sets device_guard: False.
- func: numel(Tensor self) -> int64_t
  variants: function, method
  device_guard: False

# unbind returns a TensorList; `dim` defaults to 0.
- func: unbind(Tensor self, int64_t dim=0) -> TensorList
  variants: function, method

# Both to_sparse overloads are method-only, dispatch on the dense CPU/CUDA
# keys, and name the same kernel (dense_to_sparse).
- func: to_sparse(Tensor self, int64_t sparse_dim) -> Tensor
  variants: method
  dispatch:
    CPU: dense_to_sparse
    CUDA: dense_to_sparse

- func: to_sparse(Tensor self) -> Tensor
  variants: method
  dispatch:
    CPU: dense_to_sparse
    CUDA: dense_to_sparse

# to(Device) must not exist because all constructors of Device also work for
# TensorOptions. Otherwise, an ambiguity error is thrown.
# See NOTE [ TensorOptions Constructors ].
#
# All four overloads are method-only, set device_guard: False, and end with
# `bool non_blocking=false, bool copy=false`.
- func: to(Tensor self, TensorOptions options, bool non_blocking=false, bool copy=false) -> Tensor
  variants: method
  device_guard: False

- func: to(Tensor self, Device device, ScalarType dtype, bool non_blocking=false, bool copy=false) -> Tensor
  variants: method
  device_guard: False

- func: to(Tensor self, ScalarType dtype, bool non_blocking=false, bool copy=false) -> Tensor
  variants: method
  device_guard: False

- func: to(Tensor self, Tensor other, bool non_blocking=false, bool copy=false) -> Tensor
  variants: method
  device_guard: False

# meshgrid takes and returns a TensorList; no variants or dispatch declared.
- func: meshgrid(TensorList tensors) -> TensorList

# This has a method dispatch to work around circular include problems
- func: _local_scalar(Tensor self) -> Scalar
  variants: function, method

# NB: Does NOT check precondition that numel == 1
# WARNING: Use of cpu_half here is generally not supported; please
# don't use it.
# Function-only, with separate CPU and CUDA kernels.
- func: _local_scalar_dense(Tensor self) -> Scalar
  cpu_half: True
  dispatch:
    CPU: _local_scalar_dense_cpu
    CUDA: _local_scalar_dense_cuda
  variants: function

# Fused RNN kernels
# Every entry in this group lists a CUDA kernel only; no CPU kernel is named.
# The optional bias arguments of the forward cells default to {}.
- func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias={}, Tensor? hidden_bias={}) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _thnn_fused_lstm_cell_cuda

- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _thnn_fused_lstm_cell_backward_cuda

- func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias={}, Tensor? hidden_bias={}) -> (Tensor, Tensor)
  dispatch:
    CUDA: _thnn_fused_gru_cell_cuda

- func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _thnn_fused_gru_cell_backward_cuda

# RNN cells and layers
# Each layer op (lstm, gru, rnn_tanh, rnn_relu) has two overloads: one taking
# `input` with a trailing `batch_first` flag, and one taking `data` plus
# `batch_sizes` with no `batch_first`. lstm takes `hx` as a TensorList and
# returns three Tensors; the others take a single `hx` Tensor and return two.
- func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)

- func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)

- func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)

- func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor)

- func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)

- func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor)

- func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)

- func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor)

# Single-step cells. The bias arguments b_ih / b_hh are optional and default to {}.
- func: lstm_cell(Tensor input, TensorList hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> (Tensor, Tensor)

- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> Tensor

- func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> Tensor

- func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> Tensor

# PackedSequence utilities
# None of these entries declares `variants` or `dispatch`, and none of their
# arguments has a default value.
- func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)

- func: _pack_padded_sequence_backward(Tensor grad, IntList input_size, Tensor batch_sizes, bool batch_first) -> Tensor

- func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int64_t total_length) -> (Tensor, Tensor)

# wrappers for legacy TH methods
# The entries from ndimension through is_set_to are all method-only and set
# device_guard: false.

- func: ndimension(Tensor self) -> int64_t
  variants: method
  device_guard: false

- func: data_ptr(Tensor self) -> void*
  variants: method
  device_guard: false

# set_ has four overloads: from a Storage, from a Storage with explicit
# offset/size/stride (stride defaults to {}), from a Tensor, and with no source.
- func: set_(Tensor self, Storage source) -> Tensor
  variants: method
  device_guard: false

- func: set_(Tensor self, Storage source, int64_t storage_offset, IntList size, IntList stride={}) -> Tensor
  variants: method
  device_guard: false

- func: set_(Tensor self, Tensor source) -> Tensor
  variants: method
  device_guard: false

- func: set_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: is_contiguous(Tensor self) -> bool
  variants: method
  device_guard: false

- func: is_set_to(Tensor self, Tensor tensor) -> bool
  variants: method
  device_guard: false

# masked_* / view / put_ / index_* / scatter_* methods. All are method-only
# with device_guard: false. masked_fill_, index_fill_ and scatter_ each have
# one overload taking a Scalar and one taking a Tensor.
- func: masked_fill_(Tensor self, Tensor mask, Scalar value) -> Tensor
  variants: method
  device_guard: false

- func: masked_fill_(Tensor self, Tensor mask, Tensor value) -> Tensor
  variants: method
  device_guard: false

- func: masked_scatter_(Tensor self, Tensor mask, Tensor source) -> Tensor
  variants: method
  device_guard: false

- func: view(Tensor self, IntList size) -> Tensor
  variants: method
  device_guard: false

# `accumulate` defaults to false.
- func: put_(Tensor self, Tensor index, Tensor source, bool accumulate=false) -> Tensor
  variants: method
  device_guard: false

- func: index_add_(Tensor self, int64_t dim, Tensor index, Tensor source) -> Tensor
  variants: method
  device_guard: false

- func: index_fill_(Tensor self, int64_t dim, Tensor index, Scalar value) -> Tensor
  variants: method
  device_guard: false

- func: index_fill_(Tensor self, int64_t dim, Tensor index, Tensor value) -> Tensor
  variants: method
  device_guard: false

- func: scatter_(Tensor self, int64_t dim, Tensor index, Tensor src) -> Tensor
  variants: method
  device_guard: false

- func: scatter_(Tensor self, int64_t dim, Tensor index, Scalar value) -> Tensor
  variants: method
  device_guard: false

- func: scatter_add_(Tensor self, int64_t dim, Tensor index, Tensor src) -> Tensor
  variants: method
  device_guard: false

# lt_ / gt_ / le_ / ge_ / eq_ / ne_: each is declared twice, once with a Scalar
# `other` and once with a Tensor `other`. All are method-only with
# device_guard: false.
- func: lt_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: lt_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

- func: gt_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: gt_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

- func: le_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: le_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

- func: ge_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: ge_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

- func: eq_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: eq_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

- func: ne_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: ne_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

# lgamma_, atan2_ and tril_ are method-only with device_guard: false.
- func: lgamma_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: atan2_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

# `diagonal` defaults to 0.
- func: tril_(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method
  device_guard: false

# In-place triu; method-only, `diagonal` defaults to 0. Arguments are separated
# by exactly ', ' because native_parse.py splits the argument list on that
# two-character string.
- func: triu_(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method
  device_guard: false

# In-place methods from digamma_ through remainder_. All are method-only with
# device_guard: false. pow_, fmod_ and remainder_ each have a Scalar overload
# and a Tensor overload for their second argument.
- func: digamma_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: polygamma_(Tensor self, int64_t n) -> Tensor
  variants: method
  device_guard: false

- func: erfinv_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: frac_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: renorm_(Tensor self, Scalar p, int64_t dim, Scalar maxnorm) -> Tensor
  variants: method
  device_guard: false

- func: reciprocal_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: neg_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: pow_(Tensor self, Scalar exponent) -> Tensor
  variants: method
  device_guard: false

- func: pow_(Tensor self, Tensor exponent) -> Tensor
  variants: method
  device_guard: false

- func: lerp_(Tensor self, Tensor end, Scalar weight) -> Tensor
  variants: method
  device_guard: false

- func: sign_(Tensor self) -> Tensor
  variants: method
  device_guard: false

- func: fmod_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: fmod_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

- func: remainder_(Tensor self, Scalar other) -> Tensor
  variants: method
  device_guard: false

- func: remainder_(Tensor self, Tensor other) -> Tensor
  variants: method
  device_guard: false

# In-place and explicit-output addbmm. `beta` and `alpha` are keyword-only and
# default to 1. Both entries set device_guard: false; the _out form declares
# no `variants`.
- func: addbmm_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: method
  device_guard: false

- func: addbmm_out(Tensor result, Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  device_guard: false

# Out-of-place addbmm, exposed as both a method and a free function. `beta`
# and `alpha` are keyword-only and default to 1. Opts out of the device guard,
# matching addbmm_ and addbmm_out.
- func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  variants: method, function
  device_guard: false

# In-place addcmul / addcdiv: method-only, device_guard: false; `value` is
# keyword-only and defaults to 1.
- func: addcmul_(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  variants: method
  device_guard: false

- func: addcdiv_(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  variants: method
  device_guard: false

# In-place methods that take a keyword-only `Generator* generator=nullptr`.
# All are method-only with device_guard: false.
# random_ has three overloads: (from, to), (to) and no bounds.
- func: random_(Tensor self, int64_t from, int64_t to, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

- func: random_(Tensor self, int64_t to, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

- func: random_(Tensor self, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

- func: uniform_(Tensor self, double from=0, double to=1, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

- func: normal_(Tensor self, double mean=0, double std=1, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

- func: cauchy_(Tensor self, double median=0, double sigma=1, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

# Note the defaults differ from normal_: mean=1, std=2.
- func: log_normal_(Tensor self, double mean=1, double std=2, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

- func: exponential_(Tensor self, double lambd=1, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

# `p` has no default.
- func: geometric_(Tensor self, double p, *, Generator* generator=nullptr) -> Tensor
  variants: method
  device_guard: false

# wrappers for TH functions
# In the entries from diag_out through trace, every entry sets
# device_guard: false; the *_out forms declare no `variants`, and the others
# are exposed as both method and function.

- func: diag_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
  device_guard: false

- func: diag(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method, function
  device_guard: false

# `dim` defaults to -1 for cross.
- func: cross_out(Tensor result, Tensor self, Tensor other, int64_t dim=-1) -> Tensor
  device_guard: false

- func: cross(Tensor self, Tensor other, int64_t dim=-1) -> Tensor
  variants: method, function
  device_guard: false

- func: triu_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
  device_guard: false

- func: triu(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method, function
  device_guard: false

- func: tril_out(Tensor result, Tensor self, int64_t diagonal=0) -> Tensor
  device_guard: false

- func: tril(Tensor self, int64_t diagonal=0) -> Tensor
  variants: method, function
  device_guard: false

- func: trace(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# ne / eq / ge / le / gt / lt. Each op is declared four times: an _out form and
# a method+function form, for a Scalar `other` and for a Tensor `other`.
# Every entry sets device_guard: false.
- func: ne_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: ne(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: ne_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: ne(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: eq_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: eq(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: eq_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: eq(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: ge_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: ge(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: ge_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: ge(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: le_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: le(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: le_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: le(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: gt_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: gt(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: gt_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: gt(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: lt_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: lt(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: lt_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: lt(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

# Index- and mask-driven selection: take, index_select, masked_select,
# nonzero, gather.
# Each op has an `_out` form with a leading `result` tensor and a form
# exposed as both a Tensor method and a free function.
# index_select and gather take `dim` before `index`; take has no `dim`.
# NOTE(review): `index` and `mask` are declared as plain `Tensor`, so any
# dtype requirement on them is not expressed here -- confirm in the kernels.
- func: take_out(Tensor result, Tensor self, Tensor index) -> Tensor
  device_guard: false

- func: take(Tensor self, Tensor index) -> Tensor
  variants: method, function
  device_guard: false

- func: index_select_out(Tensor result, Tensor self, int64_t dim, Tensor index) -> Tensor
  device_guard: false

- func: index_select(Tensor self, int64_t dim, Tensor index) -> Tensor
  variants: method, function
  device_guard: false

- func: masked_select_out(Tensor result, Tensor self, Tensor mask) -> Tensor
  device_guard: false

- func: masked_select(Tensor self, Tensor mask) -> Tensor
  variants: method, function
  device_guard: false

- func: nonzero_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: nonzero(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: gather_out(Tensor result, Tensor self, int64_t dim, Tensor index) -> Tensor
  device_guard: false

- func: gather(Tensor self, int64_t dim, Tensor index) -> Tensor
  variants: method, function
  device_guard: false

# addcmul / addcdiv: combine `self` with `tensor1` and `tensor2`.
# `value` is declared after the `*` marker and defaults to 1.
# NOTE(review): presumably self + value * (tensor1 * tensor2) and
# self + value * (tensor1 / tensor2) respectively -- confirm in the kernels.
- func: addcmul_out(Tensor result, Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  device_guard: false

- func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  variants: method, function
  device_guard: false

- func: addcdiv_out(Tensor result, Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  device_guard: false

- func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  variants: method, function
  device_guard: false

# gels / trtrs: both take `self` and a matrix `A` and return a pair of Tensors.
# The `_out` forms list the two output tensors first (`X`, `qr` for gels;
# `X`, `M` for trtrs), ahead of `self`.
# trtrs defaults: upper=true, transpose=false, unitriangular=false.
# NOTE(review): the names match the LAPACK routines (least-squares solve and
# triangular solve) -- presumably thin bindings; confirm in the kernels.
- func: gels_out(Tensor X, Tensor qr, Tensor self, Tensor A) -> (Tensor, Tensor)
  device_guard: false

- func: gels(Tensor self, Tensor A) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: trtrs_out(Tensor X, Tensor M, Tensor self, Tensor A, bool upper=true, bool transpose=false, bool unitriangular=false) -> (Tensor, Tensor)
  device_guard: false

- func: trtrs(Tensor self, Tensor A, bool upper=true, bool transpose=false, bool unitriangular=false) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

# Decompositions: symeig, eig, svd.
# symeig and eig return two Tensors (the `_out` forms name them `e` and
# `V`/`v`); svd returns three (`U`, `S`, `V`).
# `eigenvectors` defaults to false for both symeig and eig, so by default the
# caller has not asked for eigenvectors; svd defaults to some=true,
# compute_uv=true.
# NOTE(review): what the second output holds when eigenvectors=false is not
# visible from the schema -- confirm in the kernels.
- func: symeig_out(Tensor e, Tensor V, Tensor self, bool eigenvectors=false, bool upper=true) -> (Tensor, Tensor)
  device_guard: false

- func: symeig(Tensor self, bool eigenvectors=false, bool upper=true) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: eig_out(Tensor e, Tensor v, Tensor self, bool eigenvectors=false) -> (Tensor, Tensor)
  device_guard: false

- func: eig(Tensor self, bool eigenvectors=false) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: svd_out(Tensor U, Tensor S, Tensor V, Tensor self, bool some=true, bool compute_uv=true) -> (Tensor, Tensor, Tensor)
  device_guard: false

- func: svd(Tensor self, bool some=true, bool compute_uv=true) -> (Tensor, Tensor, Tensor)
  variants: method, function
  device_guard: false

# cholesky, potrs, potri, pstrf.
# Watch the `upper` default: cholesky declares upper=false, while potrs,
# potri and pstrf all declare upper=true. A factor produced by cholesky with
# its default therefore does not match the default expected by the others.
# pstrf returns two Tensors (`u`, `piv` in the `_out` form) and takes a
# Scalar `tol` defaulting to -1.
# NOTE(review): tol=-1 is presumably a sentinel for "use a built-in
# tolerance" -- confirm in the kernels.
- func: cholesky_out(Tensor result, Tensor self, bool upper=false) -> Tensor
  device_guard: false

- func: cholesky(Tensor self, bool upper=false) -> Tensor
  variants: method, function
  device_guard: false

- func: potrs_out(Tensor result, Tensor self, Tensor input2, bool upper=true) -> Tensor
  device_guard: false

- func: potrs(Tensor self, Tensor input2, bool upper=true) -> Tensor
  variants: method, function
  device_guard: false

- func: potri_out(Tensor result, Tensor self, bool upper=true) -> Tensor
  device_guard: false

- func: potri(Tensor self, bool upper=true) -> Tensor
  variants: method, function
  device_guard: false

- func: pstrf_out(Tensor u, Tensor piv, Tensor self, bool upper=true, Scalar tol=-1) -> (Tensor, Tensor)
  device_guard: false

- func: pstrf(Tensor self, bool upper=true, Scalar tol=-1) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

# QR-related ops: qr, geqrf, orgqr, ormqr.
# qr and geqrf each return two Tensors; qr_out names its outputs `Q` and `R`,
# whereas geqrf_out uses the generic names `result0` and `result1`.
# orgqr and ormqr return a single Tensor and take additional inputs named
# `input2` (and `input3` for ormqr). ormqr defaults: left=true,
# transpose=false.
# NOTE(review): presumably `input2`/`input3` are the outputs of geqrf, as in
# the LAPACK routines of the same names -- confirm in the kernels.
- func: qr_out(Tensor Q, Tensor R, Tensor self) -> (Tensor, Tensor)
  device_guard: false

- func: qr(Tensor self) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: geqrf_out(Tensor result0, Tensor result1, Tensor self) -> (Tensor, Tensor)
  device_guard: false

- func: geqrf(Tensor self) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: orgqr_out(Tensor result, Tensor self, Tensor input2) -> Tensor
  device_guard: false

- func: orgqr(Tensor self, Tensor input2) -> Tensor
  variants: method, function
  device_guard: false

- func: ormqr_out(Tensor result, Tensor self, Tensor input2, Tensor input3, bool left=true, bool transpose=false) -> Tensor
  device_guard: false

- func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=true, bool transpose=false) -> Tensor
  variants: method, function
  device_guard: false

# btrifact, btrifact_with_info, btrisolve.
# btrifact returns two Tensors (`A_LU`, `pivots` in the `_out` form);
# btrifact_with_info returns the same two plus a third, `info`.
# `pivot` is declared after the `*` marker and defaults to true.
# btrisolve takes `LU_data` and `LU_pivots` and returns a single Tensor.
# NOTE(review): presumably `LU_data`/`LU_pivots` are the `A_LU`/`pivots`
# outputs of btrifact -- confirm against the callers.
- func: btrifact_out(Tensor A_LU, Tensor pivots, Tensor self, *, bool pivot=true) -> (Tensor, Tensor)
  device_guard: false

- func: btrifact(Tensor self, *, bool pivot=true) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: btrifact_with_info_out(Tensor A_LU, Tensor pivots, Tensor info, Tensor self, *, bool pivot=true) -> (Tensor, Tensor, Tensor)
  device_guard: false

- func: btrifact_with_info(Tensor self, *, bool pivot=true) -> (Tensor, Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: btrisolve_out(Tensor result, Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
  device_guard: false

- func: btrisolve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
  variants: method, function
  device_guard: false

# multinomial: takes `num_samples` and a `replacement` flag (default false).
# `generator` is declared after the `*` marker and defaults to nullptr.
# NOTE(review): presumably nullptr selects the default random generator and
# `self` holds the sampling weights -- confirm in the kernels.
- func: multinomial_out(Tensor result, Tensor self, int64_t num_samples, bool replacement=false, *, Generator* generator=nullptr) -> Tensor
  device_guard: false

- func: multinomial(Tensor self, int64_t num_samples, bool replacement=false, *, Generator* generator=nullptr) -> Tensor
  variants: method, function
  device_guard: false

# Single-tensor math ops: lgamma, digamma, polygamma, erfinv, frac.
# Each has an `_out` form with a leading `result` tensor and a form exposed
# as both a Tensor method and a free function.
# polygamma is the odd one out: its order `n` (int64_t) comes BEFORE `self`
# in the signature, yet it is still declared with a method variant.
- func: lgamma_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: lgamma(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: digamma_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: digamma(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# Note the argument order: `n` precedes `self`.
- func: polygamma_out(Tensor result, int64_t n, Tensor self) -> Tensor
  device_guard: false

- func: polygamma(int64_t n, Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: erfinv_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: erfinv(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: frac_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: frac(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# dist: takes two Tensors and a Scalar `p` defaulting to 2; returns a Tensor.
# No `dist_out` form is declared next to it.
# NOTE(review): presumably the p-norm of (self - other) -- confirm in the kernel.
- func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
  variants: method, function
  device_guard: false

# reciprocal / neg: single-tensor ops, each with an `_out` form (leading
# `result` tensor) and a form exposed as both a Tensor method and a free
# function.
- func: reciprocal_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: reciprocal(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: neg_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: neg(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# atan2, lerp, histc.
# atan2 takes a second Tensor `other`; lerp takes a Tensor `end` and a
# Scalar `weight` (no default); histc takes bins=100, min=0, max=0.
# NOTE(review): histc's `min` and `max` both default to 0; presumably equal
# bounds mean "use the data's own range" -- confirm in the kernel.
- func: atan2_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: atan2(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: lerp_out(Tensor result, Tensor self, Tensor end, Scalar weight) -> Tensor
  device_guard: false

- func: lerp(Tensor self, Tensor end, Scalar weight) -> Tensor
  variants: method, function
  device_guard: false

- func: histc_out(Tensor result, Tensor self, int64_t bins=100, Scalar min=0, Scalar max=0) -> Tensor
  device_guard: false

- func: histc(Tensor self, int64_t bins=100, Scalar min=0, Scalar max=0) -> Tensor
  variants: method, function
  device_guard: false

# sign, fmod, remainder.
# sign is a single-tensor op. fmod and remainder are each declared four
# times: `_out` and non-`_out` forms, overloaded on a Scalar or Tensor `other`.
# NOTE(review): fmod and remainder have identical signatures; how their
# results differ (e.g. sign handling for negative operands) is not visible
# from the schema -- confirm in the kernels.
- func: sign_out(Tensor result, Tensor self) -> Tensor
  device_guard: false

- func: sign(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# fmod (Scalar and Tensor overloads)
- func: fmod_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: fmod(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: fmod_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: fmod(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

# remainder (Scalar and Tensor overloads)
- func: remainder_out(Tensor result, Tensor self, Scalar other) -> Tensor
  device_guard: false

- func: remainder(Tensor self, Scalar other) -> Tensor
  variants: method, function
  device_guard: false

- func: remainder_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: remainder(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

# min, max, median.
# min and max each have a two-tensor form (with a matching `_out`) and a
# one-tensor form that has no `_out` counterpart here. median is declared
# only in its one-tensor form.
# NOTE(review): presumably the two-tensor forms are element-wise and the
# one-tensor forms reduce over all elements -- confirm in the kernels.
- func: min_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: min(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: min(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: max_out(Tensor result, Tensor self, Tensor other) -> Tensor
  device_guard: false

- func: max(Tensor self, Tensor other) -> Tensor
  variants: method, function
  device_guard: false

- func: max(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: median(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# sort / topk: both return a pair of Tensors, named `values` and `indices`
# in the `_out` forms. `dim` defaults to -1 for both.
# sort defaults to descending=false; topk requires `k` and defaults to
# largest=true, sorted=true.
- func: sort_out(Tensor values, Tensor indices, Tensor self, int64_t dim=-1, bool descending=false) -> (Tensor, Tensor)
  device_guard: false

- func: sort(Tensor self, int64_t dim=-1, bool descending=false) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

- func: topk_out(Tensor values, Tensor indices, Tensor self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) -> (Tensor, Tensor)
  device_guard: false

- func: topk(Tensor self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) -> (Tensor, Tensor)
  variants: method, function
  device_guard: false

# all / any: one-tensor forms returning a Tensor (not a bool); no `_out`
# forms are declared next to them.
# NOTE(review): presumably reductions over every element of `self` --
# confirm in the kernels.
- func: all(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

- func: any(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# renorm: takes a Scalar `p`, an int64_t `dim` and a Scalar `maxnorm`, none
# of which has a default.
# NOTE(review): presumably rescales sub-tensors along `dim` whose p-norm
# exceeds `maxnorm` -- confirm in the kernel.
- func: renorm_out(Tensor result, Tensor self, Scalar p, int64_t dim, Scalar maxnorm) -> Tensor
  device_guard: false

- func: renorm(Tensor self, Scalar p, int64_t dim, Scalar maxnorm) -> Tensor
  variants: method, function
  device_guard: false

# unfold: declared with `variants: method` only, so unlike its neighbours it
# is not exposed as a free function. No `_out` form is declared.
- func: unfold(Tensor self, int64_t dimension, int64_t size, int64_t step) -> Tensor
  variants: method
  device_guard: false

# equal: returns a plain `bool` rather than a Tensor.
# Not to be confused with `eq`, whose declared return type is `Tensor`.
- func: equal(Tensor self, Tensor other) -> bool
  variants: method, function
  device_guard: false

# pow: two overload pairs, distinguished by the type of `self`.
#   * Tensor self, Tensor exponent -- method and function variants.
#   * Scalar self, Tensor exponent -- no `variants` key.
# NOTE(review): the Scalar-base form presumably omits the method variant
# because `self` is not a Tensor -- confirm against the README's description
# of the default `variants` value.
- func: pow_out(Tensor result, Tensor self, Tensor exponent) -> Tensor
  device_guard: false

- func: pow(Tensor self, Tensor exponent) -> Tensor
  variants: method, function
  device_guard: false

- func: pow_out(Tensor result, Scalar self, Tensor exponent) -> Tensor
  device_guard: false

- func: pow(Scalar self, Tensor exponent) -> Tensor
  device_guard: false

# normal: three overload pairs, distinguished by which of `mean` / `std` is
# a Tensor:
#   * Tensor mean, double std (std defaults to 1)
#   * double mean, Tensor std
#   * Tensor mean, Tensor std
# None of the entries has a `self` argument or a `variants` key. The `_out`
# forms name their destination `output` rather than `result`. `generator`
# is declared after the `*` marker and defaults to nullptr.
- func: normal_out(Tensor output, Tensor mean, double std=1, *, Generator* generator=nullptr) -> Tensor
  device_guard: false

- func: normal(Tensor mean, double std=1, *, Generator* generator=nullptr) -> Tensor
  device_guard: false

- func: normal_out(Tensor output, double mean, Tensor std, *, Generator* generator=nullptr) -> Tensor
  device_guard: false

- func: normal(double mean, Tensor std, *, Generator* generator=nullptr) -> Tensor
  device_guard: false

- func: normal_out(Tensor output, Tensor mean, Tensor std, *, Generator* generator=nullptr) -> Tensor
  device_guard: false

- func: normal(Tensor mean, Tensor std, *, Generator* generator=nullptr) -> Tensor
  device_guard: false

# alias: one-tensor op returning a Tensor; no `_out` form is declared.
# NOTE(review): presumably the result shares storage with `self` -- the
# schema does not say so; confirm in the kernel.
- func: alias(Tensor self) -> Tensor
  variants: method, function
  device_guard: false

# _dirichlet_grad: takes three Tensors (`x`, `alpha`, `total`) and has no
# `self` argument and no `variants` key. The `_out` form names its
# destination `output` rather than `result`.
# NOTE(review): the leading underscore presumably marks this as an internal
# helper -- confirm against the callers.
- func: _dirichlet_grad_out(Tensor output, Tensor x, Tensor alpha, Tensor total) -> Tensor
  device_guard: false

- func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor
  device_guard: false