diff --git a/examples/dna/dna_trainer.py b/examples/dna/dna_trainer.py
new file mode 100644
index 00000000..d08167b4
--- /dev/null
+++ b/examples/dna/dna_trainer.py
@@ -0,0 +1,144 @@
+import os
+# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+# os.environ['TL_BACKEND'] = 'torch'
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+# 0:Output all; 1:Filter out INFO; 2:Filter out INFO and WARNING; 3:Filter out INFO, WARNING, and ERROR
+
+import argparse
+import tensorlayerx as tlx
+from gammagl.datasets import Planetoid
+from gammagl.utils import add_self_loops, mask_to_index
+from tensorlayerx.model import TrainOneStep, WithLoss
+from gammagl.models import DNAModel
+from sklearn.model_selection import StratifiedKFold
+import numpy as np
+
+
+class SemiSpvzLoss(WithLoss):
+    def __init__(self, net, loss_fn):
+        super(SemiSpvzLoss, self).__init__(backbone=net, loss_fn=loss_fn)
+
+    def forward(self, data, y):
+        logits = self.backbone_network(data['x'], data['edge_index'])
+        train_logits = tlx.gather(logits, data['train_idx'])
+        train_y = tlx.gather(data['y'], data['train_idx'])
+        loss = self._loss_fn(train_logits, train_y)
+        return loss
+
+
+def calculate_acc(logits, y, metrics):
+    """
+    Args:
+        logits: node logits
+        y: node labels
+        metrics: tensorlayerx.metrics
+
+    Returns:
+        rst
+    """
+
+    metrics.update(logits, y)
+    rst = metrics.result()
+    metrics.reset()
+    return rst
+
+
+def gen_uniform_20_20_60_split(data):
+    skf = StratifiedKFold(5, shuffle=True, random_state=55)
+    data.y = tlx.convert_to_numpy(data.y)
+    idx = [tlx.convert_to_tensor(i) for _, i in skf.split(data.y, data.y)]
+    data.train_idx = tlx.convert_to_tensor(idx[0], dtype=tlx.int64)
+    data.val_idx = tlx.convert_to_tensor(idx[1], dtype=tlx.int64)
+    data.test_idx = tlx.convert_to_tensor(tlx.concat(idx[2:], axis=0), dtype=tlx.int64)
+    data.y = tlx.convert_to_tensor(data.y)
+    return data
+
+
+def main(args):
+    # load datasets
+    if str.lower(args.dataset) not in ['cora', 'pubmed', 'citeseer']:
+        raise ValueError('Unknown dataset: {}'.format(args.dataset))
+    dataset = Planetoid(args.dataset_path, args.dataset)
+    graph = dataset[0]
+    graph = gen_uniform_20_20_60_split(graph)
+
+    net = DNAModel(in_channels=dataset.num_node_features,
+                   hidden_channels=args.hidden_dim,
+                   out_channels=dataset.num_classes,
+                   num_layers=args.num_layers,
+                   drop_rate_conv=args.drop_rate_conv,
+                   drop_rate_model=args.drop_rate_model,
+                   heads=args.heads,
+                   groups=args.groups,
+                   name="DNA")
+
+    optimizer = tlx.optimizers.Adam(lr=args.lr, weight_decay=args.l2_coef)
+    metrics = tlx.metrics.Accuracy()
+    train_weights = net.trainable_weights
+
+    loss_func = SemiSpvzLoss(net, tlx.losses.softmax_cross_entropy_with_logits)
+    train_one_step = TrainOneStep(loss_func, optimizer, train_weights)
+
+    data = {
+        "x": graph.x,
+        "y": graph.y,
+        "edge_index": graph.edge_index,
+        # "edge_weight": edge_weight,
+        "train_idx": graph.train_idx,
+        "test_idx": graph.test_idx,
+        "val_idx": graph.val_idx,
+        "num_nodes": graph.num_nodes,
+    }
+
+    best_val_acc = 0
+    for epoch in range(args.n_epoch):
+        net.set_train()
+        train_loss = train_one_step(data, graph.y)
+        net.set_eval()
+        logits = net(data['x'], data['edge_index'])
+        val_logits = tlx.gather(logits, data['val_idx'])
+        val_y = tlx.gather(data['y'], data['val_idx'])
+        val_acc = calculate_acc(val_logits, val_y, metrics)
+
+        print("Epoch [{:0>3d}] ".format(epoch + 1)
+              + " train loss: {:.4f}".format(train_loss.item())
+              + " val acc: {:.4f}".format(val_acc))
+
+        # save best model on evaluation set
+        if val_acc > best_val_acc:
+            best_val_acc = val_acc
+            net.save_weights(args.best_model_path+net.name+".npz", format='npz_dict')
+
+    net.load_weights(args.best_model_path+net.name+".npz", format='npz_dict')
+    net.set_eval()
+    logits = net(data['x'], data['edge_index'])
+    test_logits = tlx.gather(logits, data['test_idx'])
+    test_y = tlx.gather(data['y'], data['test_idx'])
+    test_acc = calculate_acc(test_logits, test_y, metrics)
+    print("Test acc: {:.4f}".format(test_acc))
+
+
+if __name__ == '__main__':
+    # parameters setting
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lr", type=float, default=0.005, help="learning rate")
+    parser.add_argument("--n_epoch", type=int, default=200, help="number of epochs")
+    parser.add_argument("--hidden_dim", type=int, default=128, help="dimension of hidden layers")
+    parser.add_argument("--drop_rate_conv", type=float, default=0.8, help="dropout rate of attention coefficients in DNAConv")
+    parser.add_argument("--drop_rate_model", type=float, default=0.8, help="dropout rate applied in the model")
+    parser.add_argument("--num_layers", type=int, default=4, help="number of layers")
+    parser.add_argument("--heads", type=int, default=8, help="number of attention heads")
+    parser.add_argument("--groups", type=int, default=16, help="number of groups")
+    parser.add_argument("--l2_coef", type=float, default=5e-5, help="l2 loss coefficient")
+    parser.add_argument('--dataset', type=str, default='cora', help='dataset')
+    parser.add_argument("--dataset_path", type=str, default=r'', help="path to save dataset")
+    parser.add_argument("--best_model_path", type=str, default=r'./', help="path to save best model")
+    parser.add_argument("--self_loops", type=int, default=1, help="number of graph self-loops")
+    parser.add_argument("--gpu", type=int, default=6)
+
+    args = parser.parse_args()
+    if args.gpu >= 0:
+        tlx.set_device("GPU", args.gpu)
+    else:
+        tlx.set_device("CPU")
+
+    main(args)
diff --git a/examples/dna/readme.md b/examples/dna/readme.md
new file mode 100644
index 00000000..e672b552
--- /dev/null
+++ b/examples/dna/readme.md
@@ -0,0 +1,45 @@
+# JUST JUMP: DYNAMIC NEIGHBORHOOD AGGREGATION IN GRAPH NEURAL NETWORKS (DNA)
+
+- Paper link: [https://arxiv.org/abs/1904.04849](https://arxiv.org/abs/1904.04849)
+- PyG implementation: [https://github.com/pyg-team/pytorch_geometric/blob/master/examples/dna.py](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/dna.py)
+
+# Dataset Statistics
+
+| Dataset  | # Nodes | # Edges | # Classes |
+|----------|---------|---------|-----------|
+| Cora     | 2,708   | 10,556  | 7         |
+| Citeseer | 3,327   | 9,228   | 6         |
+| Pubmed   | 19,717  | 88,651  | 3         |
+
+Refer to [Planetoid](https://gammagl.readthedocs.io/en/latest/api/gammagl.datasets.html#gammagl.datasets.Planetoid).
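+
+The snippet below is a minimal usage sketch (it is not part of the benchmark scripts): it builds the `DNAModel` added in this PR on Cora with the `groups=16` configuration from the commands below and runs one forward pass. The empty dataset path mirrors the default of `dna_trainer.py`.
+
+```python
+import tensorlayerx as tlx
+from gammagl.datasets import Planetoid
+from gammagl.models import DNAModel
+
+# Load Cora with the trainer's default (empty) dataset path.
+dataset = Planetoid('', 'cora')
+graph = dataset[0]
+
+# Hyperparameters follow the Cora / groups=16 command in the Results section.
+model = DNAModel(in_channels=dataset.num_node_features,
+                 hidden_channels=128,
+                 out_channels=dataset.num_classes,
+                 num_layers=4,
+                 drop_rate_conv=0.1,
+                 drop_rate_model=0.8,
+                 heads=8,
+                 groups=16,
+                 name="DNA")
+
+model.set_eval()
+log_probs = model(graph.x, graph.edge_index)   # [num_nodes, num_classes] log-probabilities
+pred = tlx.argmax(log_probs, axis=-1)          # predicted class per node
+```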
+
+Results
+-------
+
+```bash
+# available dataset: "cora", "citeseer", "pubmed"
+
+TL_BACKEND="torch" python dna_trainer.py --dataset cora --lr 0.01 --drop_rate_conv 0.2 --drop_rate_model 0.8 --num_layers 3 --heads 64 --groups 1 --l2_coef 5e-5 --hidden_dim 256
+TL_BACKEND="torch" python dna_trainer.py --dataset cora --lr 0.01 --drop_rate_conv 0.2 --drop_rate_model 0.9 --num_layers 3 --heads 32 --groups 8 --l2_coef 5e-5 --hidden_dim 256
+TL_BACKEND="torch" python dna_trainer.py --dataset cora --lr 0.01 --drop_rate_conv 0.1 --drop_rate_model 0.8 --num_layers 4 --heads 8 --groups 16 --l2_coef 5e-5 --hidden_dim 128
+TL_BACKEND="torch" python dna_trainer.py --dataset citeseer --lr 0.01 --drop_rate_conv 0.2 --drop_rate_model 0.8 --num_layers 3 --heads 32 --groups 1 --l2_coef 5e-5 --hidden_dim 128
+TL_BACKEND="torch" python dna_trainer.py --dataset citeseer --lr 0.01 --drop_rate_conv 0.1 --drop_rate_model 0.8 --num_layers 4 --heads 8 --groups 8 --l2_coef 5e-5 --hidden_dim 128
+TL_BACKEND="torch" python dna_trainer.py --dataset citeseer --lr 0.01 --drop_rate_conv 0.1 --drop_rate_model 0.8 --num_layers 4 --heads 8 --groups 16 --l2_coef 5e-5 --hidden_dim 128
+TL_BACKEND="torch" python dna_trainer.py --dataset pubmed --lr 0.01 --drop_rate_conv 0.1 --drop_rate_model 0.8 --num_layers 4 --heads 8 --groups 1 --l2_coef 5e-5 --hidden_dim 128
+TL_BACKEND="torch" python dna_trainer.py --dataset pubmed --lr 0.01 --drop_rate_conv 0.1 --drop_rate_model 0.8 --num_layers 4 --heads 8 --groups 8 --l2_coef 5e-5 --hidden_dim 128
+TL_BACKEND="torch" python dna_trainer.py --dataset pubmed --lr 0.01 --drop_rate_conv 0.1 --drop_rate_model 0.8 --num_layers 4 --heads 8 --groups 16 --l2_coef 5e-5 --hidden_dim 128
+```
+
+| Dataset  | group | Paper      | Our(th)    |
+| -------- | ----- | ---------- | ---------- |
+| cora     | 1     | 83.88±0.50 | 80.50±0.81 |
+| cora     | 8     | 85.86±0.45 | 81.22±0.18 |
+| cora     | 16    | 86.15±0.57 | 82.25±0.40 |
+| citeseer | 1     | 73.37±0.83 | 71.41±1.02 |
+| citeseer | 8     | 74.19±0.66 | 72.29±0.59 |
+| citeseer | 16    | 74.50±0.62 | 72.99±0.68 |
+| pubmed   | 1     | 87.80±0.25 | 87.32±0.52 |
+| pubmed   | 8     | 88.04±0.17 | 87.46±0.25 |
+| pubmed   | 16    | 88.04±0.22 | 87.49±0.11 |
+
+![image-20240703225836573](readme.assets/image-20240703225836573.png)
diff --git a/gammagl/layers/conv/__init__.py b/gammagl/layers/conv/__init__.py
index 0162a6cb..84c89697 100644
--- a/gammagl/layers/conv/__init__.py
+++ b/gammagl/layers/conv/__init__.py
@@ -33,6 +33,7 @@ from .magcl_conv import MAGCLConv
 from .fusedgat_conv import FusedGATConv
 from .hid_conv import Hid_conv
+from .dna_conv import DNAConv
 
 __all__ = [
     'MessagePassing',
     'GCNConv',
@@ -68,7 +69,8 @@
     'MAGCLConv',
     'FusedGATConv',
     'Hid_conv',
-    'HEATlayer'
+    'HEATlayer',
+    'DNAConv'
 ]
 
 classes = __all__
diff --git a/gammagl/layers/conv/dna_conv.py b/gammagl/layers/conv/dna_conv.py
new file mode 100644
index 00000000..968cd36c
--- /dev/null
+++ b/gammagl/layers/conv/dna_conv.py
@@ -0,0 +1,213 @@
+import tensorlayerx as tlx
+from gammagl.layers.conv import MessagePassing
+from gammagl.utils import calc_gcn_norm, add_self_loops
+from gammagl.utils.num_nodes import maybe_num_nodes
+import math
+import numpy as np
+
+
+class Linear(tlx.nn.Module):
+    def __init__(self, in_channels, out_channels, groups=1, bias=True):
+        super(Linear, self).__init__()
+        assert in_channels % groups == 0 and out_channels % groups == 0
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.groups = groups
+
+        shape = (groups, in_channels // groups, out_channels // groups)[::-1]
+        self.weight = self._get_weights('weights', shape=shape, init=tlx.initializers.he_uniform(a=math.sqrt(5)))
+
+        if bias:
+            initor = tlx.initializers.RandomUniform(0, 1)
+            self.bias = tlx.nn.Parameter(tlx.random_uniform(shape=(self.out_channels,)))
+        else:
+            self.bias = None
+
+    def forward(self, src):
+        if self.groups > 1:
+            src_shape = tlx.get_tensor_shape(src)[:-1]
+            src = tlx.reshape(src, (-1, self.groups, self.in_channels // self.groups))
+            src = tlx.transpose(src, (1, 0, 2))
+            out = tlx.matmul(src, self.weight)
+            out = tlx.transpose(out, (1, 0, 2))
+            out = tlx.reshape(out, tuple(src_shape) + (self.out_channels,))
+        else:
+            out = tlx.matmul(src, tlx.squeeze(self.weight, axis=0))
+
+        if self.bias is not None:
+            out += self.bias
+
+        return out
+
+
+def restricted_softmax(src, dim: int = -1, margin: float = 0.):
+    src_max = tlx.reduce_max(src, axis=dim, keepdims=True)
+    src_max = np.clip(tlx.convert_to_numpy(src_max), 0, None)
+    src_max = tlx.convert_to_tensor(src_max)
+    out = tlx.exp(src - src_max)
+    out = out / (tlx.reduce_sum(out, axis=dim, keepdims=True) + tlx.exp(margin - src_max))
+
+    return out
+
+
+class Attention(tlx.nn.Module):
+    def __init__(self, dropout=0):
+        super(Attention, self).__init__()
+        self.dropout = tlx.nn.Dropout(p=dropout)
+
+    def forward(self, query, key, value):
+        return self.compute_attention(query, key, value)
+
+    def compute_attention(self, query, key, value):
+        # query: [*, query_entries, dim_k]
+        # key: [*, key_entries, dim_k]
+        # value: [*, key_entries, dim_v]
+        # Output: [*, query_entries, dim_v]
+
+        assert query.ndim == key.ndim == value.ndim >= 2
+        assert query.shape[-1] == key.shape[-1]
+        assert key.shape[-2] == value.shape[-2]
+
+        score = tlx.matmul(query, tlx.ops.transpose(key, (0, 1, 3, 2)))
+        score = score / math.sqrt(key.shape[-1])
+        score = restricted_softmax(score, dim=-1)
+        score = self.dropout(score)
+
+        return tlx.matmul(score, value)
+
+
+class MultiHead(Attention):
+    def __init__(self, in_channels, out_channels, heads=1, groups=1, dropout=0, bias=True):
+        super().__init__(dropout=dropout)
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.heads = heads
+        self.groups = groups
+        self.bias = bias
+
+        assert in_channels % heads == 0 and out_channels % heads == 0
+        assert in_channels % groups == 0 and out_channels % groups == 0
+        assert max(groups, heads) % min(groups, heads) == 0
+
+        self.lin_q = Linear(in_channels, out_channels, groups=groups, bias=bias)
+        self.lin_k = Linear(in_channels, out_channels, groups=groups, bias=bias)
+        self.lin_v = Linear(in_channels, out_channels, groups=groups, bias=bias)
+
+    def forward(self, query, key, value):
+        query = self.lin_q(query)
+        key = self.lin_k(key)
+        value = self.lin_v(value)
+
+        batch_size = tlx.get_tensor_shape(query)[:-2]
+
+        out_channels_per_head = self.out_channels // self.heads
+
+        query_size = tuple(batch_size) + (query.shape[-2], self.heads, out_channels_per_head)
+        query = tlx.ops.reshape(query, query_size)
+        query = tlx.ops.transpose(query, (0, 2, 1, 3))
+
+        key_size = tuple(batch_size) + (key.shape[-2], self.heads, out_channels_per_head)
+        key = tlx.ops.reshape(key, key_size)
+        key = tlx.ops.transpose(key, (0, 2, 1, 3))
+
+        value_size = tuple(batch_size) + (value.shape[-2], self.heads, out_channels_per_head)
+        value = tlx.ops.reshape(value, value_size)
+        value = tlx.ops.transpose(value, (0, 2, 1, 3))
+
+        out = self.compute_attention(query, key, value)
+
+        out = tlx.transpose(out, (0, 2, 1, 3))
+        out = tlx.reshape(out, tuple(batch_size) + (query.shape[-2], self.out_channels))
+
+        return out
+
+
+class DNAConv(MessagePassing):
+    r"""The dynamic neighborhood aggregation operator from the `"Just Jump:
+    Towards Dynamic Neighborhood Aggregation in Graph Neural Networks"
+    <https://arxiv.org/abs/1904.04849>`_ paper.
+
+    .. math::
+        \mathbf{x}_v^{(t)} = h_{\mathbf{\Theta}}^{(t)} \left( \mathbf{x}_{v
+        \leftarrow v}^{(t)}, \left\{ \mathbf{x}_{v \leftarrow w}^{(t)} : w \in
+        \mathcal{N}(v) \right\} \right)
+
+    based on (multi-head) dot-product attention
+
+    .. math::
+        \mathbf{x}_{v \leftarrow w}^{(t)} = \textrm{Attention} \left(
+        \mathbf{x}^{(t-1)}_v \, \mathbf{\Theta}_Q^{(t)}, [\mathbf{x}_w^{(1)},
+        \ldots, \mathbf{x}_w^{(t-1)}] \, \mathbf{\Theta}_K^{(t)}, \,
+        [\mathbf{x}_w^{(1)}, \ldots, \mathbf{x}_w^{(t-1)}] \,
+        \mathbf{\Theta}_V^{(t)} \right)
+
+    with :math:`\mathbf{\Theta}_Q^{(t)}, \mathbf{\Theta}_K^{(t)},
+    \mathbf{\Theta}_V^{(t)}` denoting (grouped) projection matrices for query,
+    key and value information, respectively.
+    :math:`h^{(t)}_{\mathbf{\Theta}}` is implemented as a non-trainable
+    version of :class:`gammagl.layers.conv.GCNConv`.
+
+    .. note::
+        In contrast to other layers, this operator expects node features as
+        shape :obj:`[num_nodes, num_layers, channels]`.
+
+    Parameters
+    ----------
+    channels: int
+        Size of each input/output sample.
+    heads: int, optional
+        Number of multi-head-attentions.
+        (default: :obj:`1`)
+    groups: int, optional
+        Number of groups to use for all linear projections.
+        (default: :obj:`1`)
+    dropout: float, optional
+        Dropout probability of attention coefficients.
+        (default: :obj:`0.`)
+    normalize: bool, optional
+        Whether to add self-loops and apply symmetric normalization.
+        (default: :obj:`True`)
+    add_self_loops: bool, optional
+        If set to :obj:`False`, will not add self-loops to the input graph.
+        (default: :obj:`True`)
+    bias: bool, optional
+        If set to :obj:`False`, the layer will not learn an additive bias.
+        (default: :obj:`True`)
+
+    Shapes:
+        - **input:**
+          node features :math:`(|\mathcal{V}|, L, F)` where :math:`L` is the
+          number of layers,
+          edge indices :math:`(2, |\mathcal{E}|)`
+        - **output:** node features :math:`(|\mathcal{V}|, F)`
+    """
+
+    def __init__(self, channels: int, heads: int = 1, groups: int = 1,
+                 dropout: float = 0., normalize: bool = True, add_self_loops: bool = True,
+                 bias: bool = True):
+        super().__init__()
+
+        self.bias = bias
+        self.normalize = normalize
+        self.add_self_loops = add_self_loops
+
+        self.multi_head = MultiHead(channels, channels, heads, groups, dropout, bias)
+
+    def forward(self, x, edge_index, edge_weight=None):
+        if self.normalize and edge_weight is None:
+            edge_index, edge_weight = add_self_loops(edge_index)
+            edge_weight = calc_gcn_norm(edge_index=edge_index, num_nodes=maybe_num_nodes(edge_index), edge_weight=edge_weight)
+        else:
+            edge_weight = tlx.ones((edge_index.shape[1],))
+
+        return self.propagate(edge_index=edge_index, x=x, edge_weight=edge_weight)
+
+    def message(self, x, edge_index, edge_weight=None):
+        x_i = tlx.gather(x, edge_index[0, :])
+        x_j = tlx.gather(x, edge_index[1, :])
+
+        x_i = x_i[:, -1:]
+        out = self.multi_head(x_i, x_j, x_j)
+        return tlx.reshape(edge_weight, (-1, 1)) * tlx.squeeze(out, axis=1)
diff --git a/gammagl/models/__init__.py b/gammagl/models/__init__.py
index c08b85c0..9d564bfb 100644
--- a/gammagl/models/__init__.py
+++ b/gammagl/models/__init__.py
@@ -57,6 +57,7 @@ from .fusedgat import FusedGATModel
 from .hid_net import Hid_net
 from .gnnlfhf import GNNLFHFModel
+from .dna import DNAModel
 
 
 __all__ = [
     'HeCo',
@@ -117,7 +118,8 @@
     'FusedGATModel',
     'hid_net',
     'HEAT',
-    'GNNLFHFModel'
+    'GNNLFHFModel',
+    'DNAModel'
 ]
 
 classes = __all__
diff --git a/gammagl/models/dna.py b/gammagl/models/dna.py
new file mode 100644
index 00000000..448eed3d
--- /dev/null
+++ b/gammagl/models/dna.py
@@ -0,0 +1,32 @@
+import tensorlayerx as tlx
+from gammagl.layers.conv import DNAConv
+import math
+
+
+class DNAModel(tlx.nn.Module):
+    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, drop_rate_conv=0.2, drop_rate_model=0.8,
+                 heads=1, groups=1, name=None):
+        super().__init__(name=name)
+        self.hidden_channels = hidden_channels
+        self.lin1 = tlx.nn.Linear(in_features=in_channels, out_features=hidden_channels, W_init=tlx.nn.he_uniform(a=math.sqrt(5)))
+        self.convs = tlx.nn.ModuleList()
+        for i in range(num_layers):
+            self.convs.append(
+                DNAConv(hidden_channels, heads, groups, dropout=drop_rate_conv))
+        self.lin2 = tlx.nn.Linear(in_features=hidden_channels, out_features=out_channels)
+        self.relu = tlx.nn.ReLU()
+        self.dropout = tlx.nn.Dropout(p=drop_rate_model)
+
+    def forward(self, x, edge_index):
+        x = self.relu(self.lin1(x))
+        x = self.dropout(x)
+        x_all = tlx.reshape(x, (-1, 1, self.hidden_channels))
+        for conv in self.convs:
+            x = self.relu(conv(x_all, edge_index))
+            x = tlx.reshape(x, (-1, 1, self.hidden_channels))
+            x_all = tlx.concat([x_all, x], axis=1)
+        x = x_all[:, -1]
+        x = self.dropout(x)
+        x = self.lin2(x)
+
+        return tlx.logsoftmax(x, dim=1)