fix gtest and driver

ROCm · cognaiger9 · May 13, 2024 · May 16, 2024 · May 16, 2024 · May 17, 2024
commit 6e055828fe70227114069d8c56a3d6673807d21c
@@ -45,26 +45,19 @@ template <typename Tgpu, typename Tcheck>
 int mloL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDesc,
                                    const Tgpu* input,
                                    const Tgpu* target,
-                                   Tcheck* workspacehost,
                                    Tcheck* outputhost,
                                    miopenLossReductionMode_t reduction)
 {
     auto size      = miopen::deref(iDesc).GetElementSize();
     size_t divisor = (reduction == MIOPEN_LOSS_REDUCTION_MEAN) ? size : 1;
 
-    // Phase 1: Calc loss for each element
+    double output = 0.0; // sum of |input - target|, accumulated in double
     for(size_t i = 0; i < size; i++)
     {
-        workspacehost[i] = abs(input[i] - target[i]) / divisor;
+        float diff = std::abs(static_cast<float>(input[i]) - static_cast<float>(target[i]));
+        output += diff;
     }
-
-    // Phase 2: Reduce
-    float output = 0.0;
-    for(size_t i = 0; i < size; i++)
-    {
-        output += workspacehost[i];
-    }
-    outputhost[0] = output;
+    outputhost[0] = output / divisor; // divisor is the element count for MEAN, otherwise 1
 
     return 0;
 }
@@ -128,7 +121,6 @@ class L1LossDriver : public Driver
     std::vector<Tgpu> workspace;
 
     std::vector<Tref> outhost;
-    std::vector<Tref> workspacehost;
 
     size_t ws_sizeInBytes;
     miopenLossReductionMode_t reduction;
@@ -199,9 +191,9 @@ int L1LossDriver<Tgpu, Tref>::AddCmdLineArgs()
                          "int");
     inflags.AddInputFlag("reduction",
                          'R',
-                         "0",
+                         "2",
                          "Reduction mode ('none'(0) | 'sum'(1) |'mean'(2)) "
-                         "(Default=0)",
+                         "(Default=2)",
                          "int");
     inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
     inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
@@ -239,8 +231,7 @@ int L1LossDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
     out       = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
     workspace = std::vector<Tgpu>(ws_sz, static_cast<Tgpu>(0));
 
-    outhost       = std::vector<Tref>(out_sz, static_cast<Tref>(0));
-    workspacehost = std::vector<Tref>(ws_sz, static_cast<Tref>(0));
+    outhost = std::vector<Tref>(out_sz, static_cast<Tref>(0));
 
     for(int i = 0; i < in_sz; i++)
     {
@@ -318,7 +309,7 @@ int L1LossDriver<Tgpu, Tref>::RunForwardCPU()
     if(reduction == MIOPEN_LOSS_REDUCTION_MEAN || reduction == MIOPEN_LOSS_REDUCTION_SUM)
     {
         mloL1LossReducedForwardRunHost<Tgpu, Tref>(
-            inputDesc, in.data(), tar.data(), workspacehost.data(), outhost.data(), reduction);
+            inputDesc, in.data(), tar.data(), outhost.data(), reduction);
     }
 
     return miopenStatusSuccess;

@@ -82,10 +82,10 @@ inline std::vector<L1LossTestCase> GenFullTestCases()
         {{1, 1, 1, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
         {{1, 2, 3, 4, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
         {{1, 1, 1, 257, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
-        {{2, 10, 128, 128, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
+        {{2, 10, 128, 64, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false},
         {{5, 13, 17, 11, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false},
-        {{256, 4, 8723, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
-        {{256, 4, 8723, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true},
+        {{256, 4, 128, 1, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false},
+        {{256, 4, 128, 1, 1}, MIOPEN_LOSS_REDUCTION_MEAN, true},
         {{1, 1, 1, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true},
         {{34, 4, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true},
         {{4, 7, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true},
@@ -174,24 +174,14 @@ struct L1LossFwdTest : public ::testing::TestWithParam<L1LossTestCase>
 
     double GetTolerance()
     {
-        // Computation error of fp16 is ~2^13 (=8192) bigger than
-        // the one of fp32 because mantissa is shorter by 13 bits.
-        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
-
-        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
-        if(std::is_same<T, bfloat16>::value)
-            tolerance *= 8.0;
-
+        double tolerance = std::numeric_limits<T>::epsilon() * 10; // Verify() compares against 10x this
         return tolerance;
     }
 
     void Verify()
     {
         double threshold = GetTolerance();
-
-        auto error = miopen::rms_range(ref_output, output);
-
-        std::cout << "cpu output: " << ref_output[0] << "gpu output" << output[0] << std::endl;
+        auto error       = miopen::rms_range(ref_output, output);
 
         EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error
                                             << ",  Tolerance: " << threshold * 10;