Multi gpu #729

Closed · wants to merge 2 commits

2 changes: 1 addition & 1 deletion examples/benchmarks/lstm_tagger.py
@@ -112,7 +112,7 @@ def set_backend(name, gpu_id):
     if gpu_id == -1:
         set_current_ops(NumpyOps(use_blis=True))
     else:
-        set_current_ops(CupyOps())
+        set_current_ops(CupyOps(device_id=gpu_id))
     if name == "pytorch":
         import torch

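Note (not part of the diff): a minimal usage sketch of the backend selection this change enables. It assumes a machine with at least two CUDA devices and the device_id argument added in this PR.

from thinc.api import CupyOps, set_current_ops

# Place Thinc's global ops on the second GPU (device 1). With this PR,
# passing None or a negative id falls back to the currently active device.
set_current_ops(CupyOps(device_id=1))
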
2 changes: 1 addition & 1 deletion thinc/backends/__init__.py
@@ -112,7 +112,7 @@ def get_ops(name: str, **kwargs) -> Ops:
 def get_array_ops(arr):
     """Return CupyOps for a cupy array, NumpyOps otherwise."""
     if is_cupy_array(arr):
-        return CupyOps()
+        return CupyOps(device_id=arr.device)
     else:
         return NumpyOps()

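Note (not part of the diff): a hedged sketch of the dispatch this change affects; cupy and a visible CUDA device are assumed.

import cupy
import numpy
from thinc.backends import get_array_ops

get_array_ops(numpy.zeros(3))  # -> NumpyOps()
get_array_ops(cupy.zeros(3))   # -> CupyOps bound to the array's device
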
55 changes: 52 additions & 3 deletions thinc/backends/cupy_ops.py
@@ -1,6 +1,9 @@
+from contextlib import contextmanager
+from typing import Optional
+
 import numpy
 from .. import registry
-from .ops import Ops
+from .ops import Ops, contextmethod
 from .numpy_ops import NumpyOps
 from . import _custom_kernels
 from ..types import DeviceTypes
@@ -17,11 +20,28 @@ class CupyOps(Ops):
     _xp2 = cupyx

     def __init__(
-        self, device_type: DeviceTypes = "gpu", device_id: int = 0, **kwargs
+        self,
+        device_type: DeviceTypes = "gpu",
+        device_id: Optional[int] = None,
+        **kwargs
     ) -> None:
         self.device_type = device_type
+        if device_id is None or device_id < 0:
+            device_id = self.xp.cuda.runtime.getDevice()
         self.device_id = device_id

+    @contextmanager
+    def context(self):
+        """Create a context placing CuPy operations on a specific GPU.
+        Used as:
+
+        >>> ops = CupyOps(device_id=1)
+        >>> with ops.context():
+        ...     x = ops.xp.zeros((4, 10)) + ops.xp.arange(10)
+        """
+        with self.xp.cuda.Device(self.device_id) as ctxt:
+            yield ctxt
+
     def to_numpy(self, data, *, byte_order=None):
         if not isinstance(data, numpy.ndarray):
             data = data.get()
@@ -30,24 +50,28 @@ def to_numpy(self, data, *, byte_order=None):
             data = numpy.asarray(data, dtype=dtype)
         return data

+    @contextmethod
     def gather_add(self, table, indices):
         if table.dtype in ("float32", "float64"):
             return _custom_kernels.gather_add(table, indices)
         else:
             return super().gather_add(table, indices)

+    @contextmethod
     def gelu(self, X, inplace=False):
         if X.dtype in ("float32", "float64"):
             return _custom_kernels.gelu(X, inplace=inplace, threshold=6.0)
         else:
             return super().gelu(X, inplace=inplace)

+    @contextmethod
     def backprop_gelu(self, dY, X, inplace=False):
         if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
             return _custom_kernels.backprop_gelu(dY, X, inplace=inplace, threshold=6.0)
         else:
             return super().backprop_gelu(dY, X, inplace=inplace)

+    @contextmethod
     def gemm(self, x, y, out=None, trans1=False, trans2=False):
         if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
             raise ValueError(
@@ -64,10 +88,11 @@ def gemm(self, x, y, out=None, trans1=False, trans2=False):
             self.xp.dot(x, y, out=out)
             return out

+    @contextmethod
     def asarray(self, data, dtype=None):
         # We'll try to perform a zero-copy conversion if possible.
         if is_cupy_array(data):
-            array = data
+            return self.xp.asarray(data, dtype)
         elif is_torch_cuda_array(data):
             array = torch2xp(data)
         elif is_tensorflow_gpu_array(data):
@@ -82,31 +107,36 @@ def asarray(self, data, dtype=None):

         return array

+    @contextmethod
     def maxout(self, X):
         if X.dtype in ("float32", "float64"):
             return _custom_kernels.maxout(X)
         else:
             return super().maxout(X)

+    @contextmethod
     def backprop_maxout(self, dY, which, P):
         if dY.dtype in ("float32", "float64") and which.dtype == "int32":
             return _custom_kernels.backprop_maxout(dY, which, P)
         else:
             return super().backprop_maxout(dY, which, P)

+    @contextmethod
     def relu(self, X, inplace=False):
         if not inplace:
             return X * (X > 0)
         else:
             X *= X > 0
             return X

+    @contextmethod
     def backprop_relu(self, dY, Y, inplace=False):
         if not inplace:
             return dY * (Y > 0)
         dY *= Y > 0
         return dY

+    @contextmethod
     def clipped_linear(
         self,
         X,
@@ -135,6 +165,7 @@ def clipped_linear(
                 max_val=max_val,
             )

+    @contextmethod
     def backprop_clipped_linear(
         self,
         dY,
@@ -166,24 +197,28 @@ def backprop_clipped_linear(
                 inplace=inplace,
             )

+    @contextmethod
     def backprop_hard_swish(self, dY, X, inplace: bool = False):
         if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
             return _custom_kernels.backprop_hard_swish(dY, X, inplace=inplace)
         else:
             return super().backprop_hard_swish(dY, X, inplace=inplace)

+    @contextmethod
     def backprop_hard_swish_mobilenet(self, dY, X, inplace: bool = False):
         if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
             return _custom_kernels.backprop_hard_swish_mobilenet(dY, X, inplace=inplace)
         else:
             return super().backprop_hard_swish_mobilenet(dY, X, inplace=inplace)

+    @contextmethod
     def mish(self, X, threshold=20.0, inplace=False):
         if X.dtype in ("float32", "float64"):
             return _custom_kernels.mish(X, inplace=inplace, threshold=threshold)
         else:
             return super().mish(X, threshold, inplace)

+    @contextmethod
     def backprop_mish(self, dY, X, threshold=20.0, inplace=False):
         if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
             return _custom_kernels.backprop_mish(
@@ -192,12 +227,14 @@ def backprop_mish(self, dY, X, threshold=20.0, inplace=False):
         else:
             return super().backprop_mish(dY, X, threshold, inplace)

+    @contextmethod
     def swish(self, X, inplace=False):
         if X.dtype in ("float32", "float64"):
             return _custom_kernels.swish(X, inplace=inplace, threshold=17.0)
         else:
             return super().swish(X, inplace=inplace)

+    @contextmethod
     def backprop_swish(self, dY, X, Y, inplace=False):
         if X.dtype == dY.dtype == Y.dtype and X.dtype in ("float32", "float64"):
             return _custom_kernels.backprop_swish(
@@ -206,6 +243,7 @@ def backprop_swish(self, dY, X, Y, inplace=False):
         else:
             return super().backprop_swish(dY, X, Y, inplace=inplace)

+    @contextmethod
     def clip_gradient(self, gradient, threshold):
         # We do not use CuPy's linalg.norm, since it uses scalar reductions
         # using one CUDA block. This is a lot slower than the cuBLAS
@@ -218,6 +256,7 @@ def frobenius_norm(X):
         gradient *= cupy.minimum(threshold, grad_norm) / grad_norm
         return gradient

+    @contextmethod
     def seq2col(self, seq, nW, *, lengths=None):
         """Given an (M, N) sequence of vectors, return an (M, N*(nW*2+1)) sequence.
         The new sequence is constructed by concatenating nW preceding and succeeding
@@ -230,6 +269,7 @@ def seq2col(self, seq, nW, *, lengths=None):
         else:
             return super().seq2col(seq, nW, lengths=lengths)

+    @contextmethod
     def backprop_seq2col(self, dY, nW, *, lengths=None):
         if dY.dtype in ("float32", "float64") and (
             lengths is None or lengths.dtype == "int32"
@@ -238,24 +278,28 @@ def backprop_seq2col(self, dY, nW, *, lengths=None):
         else:
             return super().backprop_seq2col(dY, nW, lengths=lengths)

+    @contextmethod
     def reduce_mean(self, X, lengths):
         if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
             return _custom_kernels.reduce_mean(X, lengths=lengths)
         else:
             super().reduce_mean(X, lengths)

+    @contextmethod
     def backprop_reduce_mean(self, d_means, lengths):
         if d_means.dtype in ("float32", "float64") and lengths.dtype == "int32":
             return _custom_kernels.backprop_reduce_mean(d_means, lengths)
         else:
             super().backprop_reduce_mean(d_means, lengths)

+    @contextmethod
     def reduce_max(self, X, lengths):
         if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
             return _custom_kernels.reduce_max(X, lengths)
         else:
             super().reduce_max(X, lengths)

+    @contextmethod
     def backprop_reduce_max(self, d_maxes, which, lengths):
         if (
             d_maxes.dtype in ("float32", "float64")
@@ -266,24 +310,29 @@ def backprop_reduce_max(self, d_maxes, which, lengths):
         else:
             super().backprop_reduce_max(d_maxes, which, lengths)

+    @contextmethod
     def reduce_sum(self, X, lengths):
         if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
             return _custom_kernels.reduce_sum(X, lengths)
         else:
             return super().reduce_sum(X, lengths)

+    @contextmethod
     def backprop_reduce_sum(self, d_sums, lengths):
         if d_sums.dtype in ("float32", "float64") and lengths.dtype == "int32":
             return _custom_kernels.backprop_reduce_sum(d_sums, lengths)
         else:
             return super().backprop_reduce_sum(d_sums, lengths)

+    @contextmethod
     def hash(self, ids, seed):
         return _custom_kernels.hash(ids, seed)

+    @contextmethod
     def scatter_add(self, table, indices, values):
         self._xp2.scatter_add(table, indices, values)

+    @contextmethod
     def adam(
         self, weights, gradient, mom1, mom2, beta1, beta2, eps, learn_rate, mod_rate=1.0
     ):
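Note (not part of the diff): contextmethod is imported from .ops above, but its definition is not included in this changeset. A minimal sketch of what such a decorator could look like, assuming it only wraps the call in self.context() so that CuPy kernels launch on the configured device; the body is an assumption, only the name and import location come from the diff.

import functools

def contextmethod(method):
    # Assumed implementation: run the wrapped Ops method inside the backend's
    # device context (CupyOps.context pins it to device_id, NumpyOps.context
    # is a no-op).
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        with self.context():
            return method(self, *args, **kwargs)
    return wrapper
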
6 changes: 6 additions & 0 deletions thinc/backends/numpy_ops.pyx
@@ -1,6 +1,7 @@
 # cython: cdivision=True
 # cython: infer_types=True
 # cython: profile=True
+from contextlib import contextmanager
 from typing import Optional
 from collections.abc import Sized
 import numpy
@@ -61,6 +62,11 @@ class NumpyOps(Ops):
         if self.use_blis and not has_blis:
             raise ValueError("BLIS support requires blis: pip install blis")

+    @contextmanager
+    def context(self):
+        # Dummy context for compatibility with CupyOps.context()
+        yield self
+
     def asarray(self, data, dtype=None):
         if isinstance(data, self.xp.ndarray):
             array = data
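Note (not part of the diff): with the dummy NumpyOps.context() above, the same with-block runs unchanged on CPU and GPU. A hedged sketch of backend-agnostic code, assuming only the context() methods added in this PR.

from thinc.api import CupyOps, NumpyOps

def make_batch(ops):
    # Identical code path for both backends: NumpyOps.context() is a no-op,
    # CupyOps.context() places the work on ops.device_id.
    with ops.context():
        return ops.xp.zeros((4, 10)) + ops.xp.arange(10)

make_batch(NumpyOps())              # runs on the CPU
# make_batch(CupyOps(device_id=1))  # runs on GPU 1 (requires cupy and 2+ devices)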