Skip to content

Commit

Permalink
Merge pull request #34 from cfreehill/master
Browse files Browse the repository at this point in the history
Added rsmi_dev_pci_replay_counter_get()
  • Loading branch information
cfreehill authored May 6, 2019
2 parents d4af9e4 + 34c977b commit caf2748
Show file tree
Hide file tree
Showing 11 changed files with 88 additions and 21 deletions.
18 changes: 18 additions & 0 deletions include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,24 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid);
rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
uint64_t *received, uint64_t *max_pkt_sz);

/**
* @brief Get PCIe replay counter
*
* @details Given a device index @p dv_ind and a pointer to a uint64_t @p
* counter, this function will write the sum of the number of NAKs received
* by the GPU and the number of NAKs generated by the GPU to the memory
* pointed to by @p counter.
*
* @param[in] dv_ind a device index
*
* @param[inout] counter a pointer to uint64_t to which the sum of the NAKs
* received and generated by the GPU is written
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
* @retval ::RSMI_STATUS_NOT_SUPPORTED may be returned when the replay counter
* is not available on this machine; callers should treat this status as
* "not supported" rather than as a hard failure.
*/
rsmi_status_t rsmi_dev_pci_replay_counter_get(uint32_t dv_ind,
uint64_t *counter);

/** @} */ // end of PCIeQuer
/*****************************************************************************/
/** @defgroup PCIeCont PCIe Control
Expand Down
4 changes: 3 additions & 1 deletion include/rocm_smi/rocm_smi_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
extern "C" {
#include "shared_mutex.h"
#include "shared_mutex.h" // NOLINT
};

namespace amd {
Expand Down Expand Up @@ -90,6 +90,7 @@ enum DevInfoTypes {
kDevMemUsedGTT,
kDevMemUsedVisVRAM,
kDevMemUsedVRAM,
kDevPCIEReplayCount,
};

class Device {
Expand All @@ -116,6 +117,7 @@ class Device {
void set_bdfid(uint64_t val) {bdfid_ = val;}
uint64_t get_bdfid(void) const {return bdfid_;}
pthread_mutex_t *mutex(void) {return mutex_.ptr;}

private:
std::shared_ptr<Monitor> monitor_;
std::shared_ptr<PowerMon> power_monitor_;
Expand Down
12 changes: 6 additions & 6 deletions include/rocm_smi/rocm_smi_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,23 @@ int ReadSysfsStr(std::string path, std::string *retStr);
int WriteSysfsStr(std::string path, std::string val);

struct pthread_wrap {
public:
pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {}
public:
explicit pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {}

void Acquire() { pthread_mutex_lock(&mutex_); }
void Release() { pthread_mutex_unlock(&mutex_); }
private:
private:
pthread_mutex_t& mutex_;
};
struct ScopedPthread {
ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) {
explicit ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) {
pthrd_ref_.Acquire();
};
}

~ScopedPthread() {
pthrd_ref_.Release();
}
private:
private:
ScopedPthread(const ScopedPthread&);

pthread_wrap& pthrd_ref_;
Expand Down
17 changes: 15 additions & 2 deletions src/rocm_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ static rsmi_status_t errno_to_rsmi_status(uint32_t err) {
switch (err) {
case 0: return RSMI_STATUS_SUCCESS;
case EACCES: return RSMI_STATUS_PERMISSION;
case EPERM: return RSMI_STATUS_NOT_SUPPORTED;
case ENOENT:
case EPERM:
case ENOENT: return RSMI_STATUS_NOT_SUPPORTED;
case EISDIR: return RSMI_STATUS_FILE_ERROR;
default: return RSMI_STATUS_UNKNOWN_ERROR;
}
Expand Down Expand Up @@ -2044,3 +2044,16 @@ rsmi_version_str_get(rsmi_sw_component_t component, char *ver_str,

CATCH
}

// Fetches the PCIe replay counter of device dv_ind by reading the
// kDevPCIEReplayCount device attribute as an integer into *counter.
// The status produced by get_dev_value_int() is returned unchanged.
rsmi_status_t
rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) {
  TRY

  // NOTE(review): DEVICE_MUTEX is a macro defined elsewhere in this file;
  // presumably it holds the per-device mutex for the rest of this scope --
  // confirm against its definition.
  DEVICE_MUTEX

  return get_dev_value_int(amd::smi::kDevPCIEReplayCount, dv_ind, counter);

  CATCH
}
5 changes: 4 additions & 1 deletion src/rocm_smi_device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ static const char *kDevMemTotVRAMFName = "mem_info_vram_total";
static const char *kDevMemUsedGTTFName = "mem_info_gtt_used";
static const char *kDevMemUsedVisVRAMFName = "mem_info_vis_vram_used";
static const char *kDevMemUsedVRAMFName = "mem_info_vram_used";
static const char *kDevPCIEReplayCountFName = "pcie_replay_count";

// Strings that are found within sysfs files
static const char *kDevPerfLevelAutoStr = "auto";
Expand Down Expand Up @@ -136,6 +137,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevMemUsedGTT, kDevMemUsedGTTFName},
{kDevMemUsedVisVRAM, kDevMemUsedVisVRAMFName},
{kDevMemUsedVRAM, kDevMemUsedVRAMFName},
{kDevPCIEReplayCount, kDevPCIEReplayCountFName},
};

static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
Expand Down Expand Up @@ -202,7 +204,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {

DBG_FILE_ERROR(sysfs_path, str);
if (!isRegularFile(sysfs_path)) {
return EISDIR;
return ENOENT;
}

fs->open(sysfs_path);
Expand Down Expand Up @@ -367,6 +369,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
case kDevMemUsedGTT:
case kDevMemUsedVisVRAM:
case kDevMemUsedVRAM:
case kDevPCIEReplayCount:
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
*val = std::stoul(tempStr, 0);
Expand Down
18 changes: 16 additions & 2 deletions src/shared_mutex/shared_mutex.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <stdio.h> // perror
#include <stdlib.h> // malloc, free
#include <string.h> // strcpy
#include <time.h> // clock_gettime

shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
shared_mutex_t mutex = {NULL, 0, NULL, 0};
Expand Down Expand Up @@ -51,7 +52,17 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
return mutex;
}

if (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL) {
pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr;

// Make sure the mutex wasn't left in a locked state. If we can't
// acquire it in 3 sec., re-do everything.
struct timespec expireTime;
clock_gettime(CLOCK_REALTIME, &expireTime);
expireTime.tv_sec += 3;

int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);

if (ret || (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL)) {
// Something is out of sync. Unlink shm and start over.
if (shm_unlink(name)) {
mutex.shm_fd = 0;
Expand All @@ -60,9 +71,12 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
free(mutex.name);

return shared_mutex_init(name, mode);
} else {
if (pthread_mutex_unlock(mutex_ptr)) {
perror("pthread_mutex_unlock");
}
}

pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr;

if (mutex.created) {
pthread_mutexattr_t attr;
Expand Down
3 changes: 3 additions & 0 deletions src/shared_mutex/shared_mutex.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// NOLINT(legal/copyright)
// See LICENSE file

#ifndef SRC_SHARED_MUTEX_SHARED_MUTEX_H_
#define SRC_SHARED_MUTEX_SHARED_MUTEX_H_

Expand Down
2 changes: 1 addition & 1 deletion tests/rocm_smi_test/functional/err_cnt_read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ void TestErrCntRead::Run(void) {
err = rsmi_dev_ecc_enabled_get(i, &enabled_mask);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout <<
"\t**Error Count Enabled Mask for is not supported on this machine"
"\t**Error Count Enabled Mask get is not supported on this machine"
<< std::endl;
} else {
CHK_ERR_ASRT(err)
Expand Down
26 changes: 20 additions & 6 deletions tests/rocm_smi_test/functional/pci_read_write.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,26 @@ void TestPciReadWrite::Run(void) {
rsmi_status_t ret;
rsmi_pcie_bandwidth_t bw;
uint32_t freq_bitmask;
uint64_t sent, received, max_pkt_sz;
uint64_t sent, received, max_pkt_sz, u64int;

TestBase::Run();

for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
PrintDeviceHeader(dv_ind);

ret = rsmi_dev_pci_replay_counter_get(dv_ind, &u64int);

if (ret == RSMI_STATUS_NOT_SUPPORTED) {
std::cout <<
"\t**rsmi_dev_pci_replay_counter_get() is not supported"
" on this machine" << std::endl;
} else {
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\tPCIe Replay Counter: " << u64int << std::endl;
}
}

ret = rsmi_dev_pci_throughput_get(dv_ind, &sent, &received, &max_pkt_sz);
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "TEST FAILURE: Current PCIe throughput is not detected. "
Expand All @@ -106,7 +119,7 @@ void TestPciReadWrite::Run(void) {
CHK_ERR_ASRT(ret)

IF_VERB(STANDARD) {
std::cout << "PCIe Throughput (1 sec.): " << std::endl;
std::cout << "\tPCIe Throughput (1 sec.): " << std::endl;
std::cout << "\t\tSent: " << sent << " bytes" << std::endl;
std::cout << "\t\tReceived: " << received << " bytes" << std::endl;
std::cout << "\t\tMax Packet Size: " << max_pkt_sz << " bytes" <<
Expand All @@ -125,7 +138,8 @@ void TestPciReadWrite::Run(void) {
CHK_ERR_ASRT(ret)

IF_VERB(STANDARD) {
std::cout << "Initial PCIe is " << bw.transfer_rate.current << std::endl;
std::cout << "\tInitial PCIe is " << bw.transfer_rate.current <<
std::endl;
}

// First set the bitmask to all supported bandwidths
Expand All @@ -141,7 +155,7 @@ void TestPciReadWrite::Run(void) {
freq_bm_str.size()-1));

IF_VERB(STANDARD) {
std::cout << "Setting bandwidth mask to " << "0b" << freq_bm_str <<
std::cout << "\tSetting bandwidth mask to " << "0b" << freq_bm_str <<
" ..." << std::endl;
}
ret = rsmi_dev_pci_bandwidth_set(dv_ind, freq_bitmask);
Expand All @@ -151,9 +165,9 @@ void TestPciReadWrite::Run(void) {
CHK_ERR_ASRT(ret)

IF_VERB(STANDARD) {
std::cout << "Bandwidth is now index " << bw.transfer_rate.current <<
std::cout << "\tBandwidth is now index " << bw.transfer_rate.current <<
std::endl;
std::cout << "Resetting mask to all bandwidths." << std::endl;
std::cout << "\tResetting mask to all bandwidths." << std::endl;
}
ret = rsmi_dev_pci_bandwidth_set(dv_ind, 0xFFFFFFFF);
CHK_ERR_ASRT(ret)
Expand Down
2 changes: 1 addition & 1 deletion tests/rocm_smi_test/functional/temp_read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ void TestTempRead::Run(void) {
err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64);

if (err != RSMI_STATUS_SUCCESS) {
if (err == RSMI_STATUS_FILE_ERROR) {
if (err == RSMI_STATUS_NOT_SUPPORTED) {
IF_VERB(STANDARD) {
std::cout << "\t**" << label << ": " <<
"Not supported on this machine" << std::endl;
Expand Down
2 changes: 1 addition & 1 deletion tests/rocm_smi_test/functional/volt_freq_curv_read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ void TestVoltCurvRead::Run(void) {

err = rsmi_dev_od_volt_info_get(i, &odv);
if (err == RSMI_STATUS_FILE_ERROR ||
err == RSMI_STATUS_NOT_YET_IMPLEMENTED) {
err == RSMI_STATUS_NOT_SUPPORTED) {
IF_VERB(STANDARD) {
std::cout <<
"\t**rsmi_dev_od_volt_info_get: Not supported on this machine"
Expand Down

0 comments on commit caf2748

Please sign in to comment.