Skip to content

Commit

Permalink
[Mellanox] Fix issue: watchdogutil command does not work (sonic-net#1…
Browse files Browse the repository at this point in the history
…6091)

- Why I did it
watchdogutil uses platform API watchdog instance to control/query watchdog status. In Nvidia watchdog status, it caches "armed" status in a object member "WatchdogImplBase.armed". This is not working for CLI infrastructure because each CLI will create a new watchdog instance, the status cached in previous instance will totally lose. Consider following commands:

admin@sonic:~$ sudo watchdogutil arm -s 100      =====> watchdog instance1, armed=True
Watchdog armed for 100 seconds
admin@sonic:~$ sudo watchdogutil status             ======> watchdog instance2, armed=False
Status: Unarmed
admin@sonic:~$ sudo watchdogutil disarm            =======> watchdog instance3, armed=False
Failed to disarm Watchdog

- How I did it
Use sysfs to query watchdog status

- How to verify it
Manual test
Unit test
Conflicts:
	platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py
	platform/mellanox/mlnx-platform-api/tests/test_watchdog.py
  • Loading branch information
Junchao-Mellanox committed Aug 24, 2023
1 parent 611449d commit b83e462
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 25 deletions.
49 changes: 24 additions & 25 deletions platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import time

from sonic_platform_base.watchdog_base import WatchdogBase
from . import utils

""" ioctl constants """
IO_WRITE = 0x40000000
Expand Down Expand Up @@ -80,15 +81,17 @@ def __init__(self, wd_device_path):
super(WatchdogImplBase, self).__init__()

self.watchdog_path = wd_device_path
self.watchdog = os.open(self.watchdog_path, os.O_WRONLY)
self._watchdog = None
self.timeout = self._gettimeout()

# Opening a watchdog descriptor starts
# watchdog timer;
# by default it should be stopped
self._disablecard()
self.armed = False
@property
def watchdog(self):
if self._watchdog is None:
self._watchdog = self.open_handle()
return self._watchdog

self.timeout = self._gettimeout()
def open_handle(self):
return os.open(self.watchdog_path, os.O_WRONLY)

def _enablecard(self):
"""
Expand Down Expand Up @@ -131,21 +134,15 @@ def _gettimeout(self):
@return watchdog timeout
"""

req = array.array('I', [0])
fcntl.ioctl(self.watchdog, WDIOC_GETTIMEOUT, req, True)

return int(req[0])
return utils.read_int_from_file('/run/hw-management/watchdog/main/timeout')

def _gettimeleft(self):
"""
Get time left before watchdog timer expires
@return time left in seconds
"""

req = array.array('I', [0])
fcntl.ioctl(self.watchdog, WDIOC_GETTIMELEFT, req, True)

return int(req[0])
return utils.read_int_from_file('/run/hw-management/watchdog/main/timeleft')

def arm(self, seconds):
"""
Expand All @@ -159,11 +156,10 @@ def arm(self, seconds):
try:
if self.timeout != seconds:
self.timeout = self._settimeout(seconds)
if self.armed:
if self.is_armed():
self._keepalive()
else:
self._enablecard()
self.armed = True
ret = self.timeout
except IOError:
pass
Expand All @@ -176,10 +172,9 @@ def disarm(self):
"""

disarmed = False
if self.armed:
if self.is_armed():
try:
self._disablecard()
self.armed = False
disarmed = True
except IOError:
pass
Expand All @@ -191,7 +186,7 @@ def is_armed(self):
Implements is_armed WatchdogBase API
"""

return self.armed
return utils.read_str_from_file('/run/hw-management/watchdog/main/state') == 'active'

def get_remaining_time(self):
"""
Expand All @@ -200,7 +195,7 @@ def get_remaining_time(self):

timeleft = WD_COMMON_ERROR

if self.armed:
if self.is_armed():
try:
timeleft = self._gettimeleft()
except IOError:
Expand All @@ -213,13 +208,15 @@ def __del__(self):
Close watchdog
"""

os.close(self.watchdog)
if self._watchdog is not None:
os.close(self._watchdog)


class WatchdogType1(WatchdogImplBase):
"""
Watchdog type 1
"""
TIMESTAMP_FILE = '/tmp/nvidia/watchdog_timestamp'

def arm(self, seconds):
"""
Expand All @@ -230,7 +227,8 @@ def arm(self, seconds):
ret = WatchdogImplBase.arm(self, seconds)
# Save the watchdog arm timestamp
# requiered for get_remaining_time()
self.arm_timestamp = time.time()
os.makedirs('/tmp/nvidia', exist_ok=True)
utils.write_file(self.TIMESTAMP_FILE, str(time.time()))

return ret

Expand All @@ -243,8 +241,9 @@ def get_remaining_time(self):

timeleft = WD_COMMON_ERROR

if self.armed:
timeleft = int(self.timeout - (time.time() - self.arm_timestamp))
if self.is_armed():
arm_timestamp = utils.read_float_from_file(self.TIMESTAMP_FILE)
timeleft = int(self.timeout - (time.time() - arm_timestamp))

return timeleft

Expand Down
141 changes: 141 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_watchdog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import pytest
import sys
if sys.version_info.major == 3:
from unittest import mock
else:
import mock

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform.chassis import Chassis
from sonic_platform.watchdog import get_watchdog, \
WatchdogType2, \
WatchdogType1, \
is_mlnx_wd_main, \
is_wd_type2


class TestWatchdog:
@mock.patch('sonic_platform.watchdog.is_mlnx_wd_main')
@mock.patch('sonic_platform.watchdog.os.listdir')
def test_get_watchdog_no_device(self, mock_listdir, mock_is_main):
mock_listdir.return_value = []
assert get_watchdog() is None

mock_listdir.return_value = ['invalid']
mock_is_main.return_value = True
assert get_watchdog() is None

mock_listdir.return_value = ['watchdog1']
mock_is_main.return_value = False
assert get_watchdog() is None

@mock.patch('sonic_platform.watchdog.is_mlnx_wd_main')
@mock.patch('sonic_platform.watchdog.is_wd_type2')
@mock.patch('sonic_platform.watchdog.os.listdir', mock.MagicMock(return_value=['watchdog1', 'watchdog2']))
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@pytest.mark.parametrize('test_para',
[(True, WatchdogType2), (False, WatchdogType1)])
def test_get_watchdog(self, mock_is_type2, mock_is_main, test_para):
mock_is_main.side_effect = lambda dev: dev == 'watchdog2'
mock_is_type2.return_value = test_para[0]
chassis = Chassis()
watchdog = chassis.get_watchdog()
assert isinstance(watchdog, test_para[1])
assert watchdog.watchdog_path == '/dev/watchdog2'

def test_is_mlnx_wd_main(self):
mock_os_open = mock.mock_open(read_data='mlx-wdt-main')
with mock.patch('sonic_platform.watchdog.open', mock_os_open):
assert is_mlnx_wd_main('')

mock_os_open = mock.mock_open(read_data='invalid')
with mock.patch('sonic_platform.watchdog.open', mock_os_open):
assert not is_mlnx_wd_main('')
mock_os_open.side_effect = IOError
with mock.patch('sonic_platform.watchdog.open', mock_os_open):
assert not is_mlnx_wd_main('')

@mock.patch('sonic_platform.watchdog.os.path.exists')
@pytest.mark.parametrize('test_para',
[True, False])
def test_is_wd_type2(self, mock_exists, test_para):
mock_exists.return_value = test_para
assert is_wd_type2('') is test_para

@mock.patch('sonic_platform.utils.read_str_from_file')
def test_is_armed(self, mock_read):
watchdog = WatchdogType2('watchdog2')
mock_read.return_value = 'inactive'
assert not watchdog.is_armed()
mock_read.return_value = 'active'
assert watchdog.is_armed()

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_arm_disarm_watchdog2(self, mock_is_armed):
watchdog = WatchdogType2('watchdog2')
assert watchdog.arm(-1) == -1
mock_is_armed.return_value = False
watchdog.arm(10)
mock_is_armed.return_value = True
watchdog.arm(5)
watchdog.disarm()

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_arm_disarm_watchdog1(self, mock_is_armed):
watchdog = WatchdogType1('watchdog1')
assert watchdog.arm(-1) == -1
mock_is_armed.return_value = False
watchdog.arm(10)
mock_is_armed.return_value = True
watchdog.arm(5)
watchdog.disarm()

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10))
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_get_remaining_time_watchdog2(self, mock_is_armed):
watchdog = WatchdogType2('watchdog2')
mock_is_armed.return_value = False
assert watchdog.get_remaining_time() == -1
watchdog.arm(10)
mock_is_armed.return_value = True
assert watchdog.get_remaining_time() == 10

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10))
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_get_remaining_time_watchdog1(self, mock_is_armed):
watchdog = WatchdogType1('watchdog2')
mock_is_armed.return_value = False
assert watchdog.get_remaining_time() == -1
watchdog.arm(10)
mock_is_armed.return_value = True
assert watchdog.get_remaining_time() > 0

0 comments on commit b83e462

Please sign in to comment.