@@ -99,6 +99,7 @@ def sgd_train_linear_model(
     This will return the final training loss (averaged with
     `running_loss_window`)
     """
+
     loss_window: List[torch.Tensor] = []
     min_avg_loss = None
     convergence_counter = 0
@@ -144,77 +145,77 @@ def get_point(datapoint):
     if model.linear.bias is not None:
         model.linear.bias.zero_()

-    with torch.enable_grad():
-        optim = torch.optim.SGD(model.parameters(), lr=initial_lr)
-        if reduce_lr:
-            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
-                optim, factor=0.5, patience=patience, threshold=threshold
-            )
-
-        t1 = time.time()
-        epoch = 0
-        i = 0
-        while epoch < max_epoch:
-            while True:  # for x, y, w in dataloader
-                if running_loss_window is None:
-                    running_loss_window = x.shape[0] * len(dataloader)
-
-                y = y.view(x.shape[0], -1)
-                if w is not None:
-                    w = w.view(x.shape[0], -1)
-
-                i += 1
-
-                out = model(x)
-
-                loss = loss_fn(y, out, w)
-                if reg_term is not None:
-                    reg = torch.norm(model.linear.weight, p=reg_term)
-                    loss += reg.sum() * alpha
-
-                if len(loss_window) >= running_loss_window:
-                    loss_window = loss_window[1:]
-                loss_window.append(loss.clone().detach())
-                assert len(loss_window) <= running_loss_window
-
-                average_loss = torch.mean(torch.stack(loss_window))
-                if min_avg_loss is not None:
-                    # if we haven't improved by at least `threshold`
-                    if average_loss > min_avg_loss or torch.isclose(
-                        min_avg_loss, average_loss, atol=threshold
-                    ):
-                        convergence_counter += 1
-                        if convergence_counter >= patience:
-                            converged = True
-                            break
-                    else:
-                        convergence_counter = 0
-                if min_avg_loss is None or min_avg_loss >= average_loss:
-                    min_avg_loss = average_loss.clone()
-
-                if debug:
-                    print(
-                        f"lr={optim.param_groups[0]['lr']}, Loss={loss}, "
-                        + f"Aloss={average_loss}, min_avg_loss={min_avg_loss}"
-                    )
-
-                loss.backward()
-                optim.step()
-                model.zero_grad()
-                if scheduler:
-                    scheduler.step(average_loss)
-
-                temp = next(data_iter, None)
-                if temp is None:
-                    break
-                x, y, w = get_point(temp)
-
-            if converged:
+    optim = torch.optim.SGD(model.parameters(), lr=initial_lr)
+    if reduce_lr:
+        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            optim, factor=0.5, patience=patience, threshold=threshold
+        )
+
+    t1 = time.time()
+    epoch = 0
+    i = 0
+    while epoch < max_epoch:
+        while True:  # for x, y, w in dataloader
+            if running_loss_window is None:
+                running_loss_window = x.shape[0] * len(dataloader)
+
+            y = y.view(x.shape[0], -1)
+            if w is not None:
+                w = w.view(x.shape[0], -1)
+
+            i += 1
+
+            out = model(x)
+
+            loss = loss_fn(y, out, w)
+            if reg_term is not None:
+                reg = torch.norm(model.linear.weight, p=reg_term)
+                loss += reg.sum() * alpha
+
+            if len(loss_window) >= running_loss_window:
+                loss_window = loss_window[1:]
+            loss_window.append(loss.clone().detach())
+            assert len(loss_window) <= running_loss_window
+
+            average_loss = torch.mean(torch.stack(loss_window))
+            if min_avg_loss is not None:
+                # if we haven't improved by at least `threshold`
+                if average_loss > min_avg_loss or torch.isclose(
+                    min_avg_loss, average_loss, atol=threshold
+                ):
+                    convergence_counter += 1
+                    if convergence_counter >= patience:
+                        converged = True
+                        break
+                else:
+                    convergence_counter = 0
+            if min_avg_loss is None or min_avg_loss >= average_loss:
+                min_avg_loss = average_loss.clone()
+
+            if debug:
+                print(
+                    f"lr={optim.param_groups[0]['lr']}, Loss={loss}, "
+                    + f"Aloss={average_loss}, min_avg_loss={min_avg_loss}"
+                )
+
+            loss.backward()
+
+            optim.step()
+            model.zero_grad()
+            if scheduler:
+                scheduler.step(average_loss)
+
+            temp = next(data_iter, None)
+            if temp is None:
                 break
+            x, y, w = get_point(temp)
+
+        if converged:
+            break

-            epoch += 1
-            data_iter = iter(dataloader)
-            x, y, w = get_point(next(data_iter))
+        epoch += 1
+        data_iter = iter(dataloader)
+        x, y, w = get_point(next(data_iter))

     t2 = time.time()
     return {
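
The substantive change in this diff is dropping the `with torch.enable_grad():` wrapper and dedenting the training loop one level. Gradient tracking is PyTorch's default mode, so the wrapper only changes behavior when the caller has disabled it with `torch.no_grad()`. A minimal standalone sketch (not part of this patch) illustrating that distinction:

import torch

w = torch.ones(3, requires_grad=True)

# Default grad mode: operations on `w` are tracked, backward() works.
loss = (w * 2).sum()
print(loss.requires_grad)  # True

# Under no_grad, the same computation builds no graph.
with torch.no_grad():
    loss = (w * 2).sum()
    print(loss.requires_grad)  # False: backward() here would raise

# enable_grad re-enables tracking even inside a no_grad block; this is
# the only situation the removed wrapper guarded against.
with torch.no_grad():
    with torch.enable_grad():
        loss = (w * 2).sum()
        print(loss.requires_grad)  # True

One consequence worth noting: after this change, invoking the trainer from inside a `torch.no_grad()` block would presumably fail at `loss.backward()`, so callers are assumed to run it in default grad mode.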
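The early-stopping rule embedded in the loop is easier to follow in isolation: losses are averaged over a sliding window of size `running_loss_window`, and training is declared converged once the windowed average has failed to improve on its best value by at least `threshold` for `patience` consecutive steps. The following restatement is a sketch under assumed names (`update_convergence_state` and `window_size` are illustrative, not part of the PR):

from typing import List, Optional, Tuple

import torch

def update_convergence_state(
    loss: torch.Tensor,
    loss_window: List[torch.Tensor],
    window_size: int,
    min_avg_loss: Optional[torch.Tensor],
    convergence_counter: int,
    patience: int,
    threshold: float,
) -> Tuple[List[torch.Tensor], Optional[torch.Tensor], int, bool]:
    # Keep only the most recent `window_size` losses.
    if len(loss_window) >= window_size:
        loss_window = loss_window[1:]
    loss_window.append(loss.clone().detach())

    average_loss = torch.mean(torch.stack(loss_window))

    if min_avg_loss is not None and (
        average_loss > min_avg_loss
        or torch.isclose(min_avg_loss, average_loss, atol=threshold)
    ):
        # No improvement of at least `threshold`: bump the counter.
        convergence_counter += 1
        if convergence_counter >= patience:
            # Matches the original loop, which breaks out here before
            # updating the best windowed average.
            return loss_window, min_avg_loss, convergence_counter, True
    else:
        convergence_counter = 0

    # Track the best windowed average seen so far.
    if min_avg_loss is None or min_avg_loss >= average_loss:
        min_avg_loss = average_loss.clone()

    return loss_window, min_avg_loss, convergence_counter, False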