From c4da2a9b7791483c3b95f48da5ef4b6309fdfb08 Mon Sep 17 00:00:00 2001
From: Qinheping Hu <qinhh@amazon.com>
Date: Tue, 21 Nov 2023 22:38:45 -0600
Subject: [PATCH] Add quick filter into goto-synthesizer

The quick filter check if a candidate is consistent with the previous seen examples by evaluating the candidate on the examples.
The idea is that quick filter is more efficient than CBMC for checking if a candidate is incorrect.
So the synthesizer can use the quick filter to rule out incorrect candidates in an early stage before sending them to CBMC.

In detail, we say a candidate is consistent with a positive (negative) example if the evaluation of the candidate on it is true (false).
If a candidate is inconsistent with any example, it cannot be a valid loop invariant predicate.

Examples are collected in the verification stage. In a CBMC trace, the valuations of variables upon the entry of the loop (loop_entry)
must satisfy the loop invariants, so that they are positive examples. Oppositely, valuations of variables in the havoced iteration must not satisfy
the loop invariants, so that they are negative examples.

The quick filter sinificantly reduce the number of candidate sent to CBMC. As an example, in the benchmark `loop_contracts_synthesis_04`, 61 out of 67 bad candidates
are ruled out be the quick filter.
---
 .../loop_contracts_synthesis_03/main.c        |   3 +-
 .../loop_contracts_synthesis_04/test.desc     |   3 +-
 src/goto-synthesizer/Makefile                 |   3 +-
 src/goto-synthesizer/cegis_evaluator.cpp      | 343 ++++++++++++++++++
 src/goto-synthesizer/cegis_evaluator.h        |  54 +++
 src/goto-synthesizer/cegis_verifier.cpp       |   3 +-
 ...enumerative_loop_contracts_synthesizer.cpp |  29 +-
 .../enumerative_loop_contracts_synthesizer.h  |   4 +-
 8 files changed, 433 insertions(+), 9 deletions(-)
 create mode 100644 src/goto-synthesizer/cegis_evaluator.cpp
 create mode 100644 src/goto-synthesizer/cegis_evaluator.h

diff --git a/regression/goto-synthesizer/loop_contracts_synthesis_03/main.c b/regression/goto-synthesizer/loop_contracts_synthesis_03/main.c
index be272d35f2cf..fe82fd1a3e2c 100644
--- a/regression/goto-synthesizer/loop_contracts_synthesis_03/main.c
+++ b/regression/goto-synthesizer/loop_contracts_synthesis_03/main.c
@@ -2,8 +2,7 @@
 
 void main()
 {
-  unsigned long len;
-  __CPROVER_assume(len <= SIZE);
+  unsigned long long len;
   __CPROVER_assume(len >= 8);
   char *array = malloc(len);
   const char *end = array + len;
diff --git a/regression/goto-synthesizer/loop_contracts_synthesis_04/test.desc b/regression/goto-synthesizer/loop_contracts_synthesis_04/test.desc
index 847f74c0ae11..e7ee10ae51a4 100644
--- a/regression/goto-synthesizer/loop_contracts_synthesis_04/test.desc
+++ b/regression/goto-synthesizer/loop_contracts_synthesis_04/test.desc
@@ -1,8 +1,9 @@
 CORE
 main.c
---pointer-check
+--pointer-check _ --verbosity 9
 ^EXIT=0$
 ^SIGNAL=0$
+Quick filter\: 6.* out of 67 candidates were filtered out\.
 ^\[main.pointer\_dereference.\d+\] .* SUCCESS$
 ^VERIFICATION SUCCESSFUL$
 --
diff --git a/src/goto-synthesizer/Makefile b/src/goto-synthesizer/Makefile
index 4098f531296e..53d634021e3d 100644
--- a/src/goto-synthesizer/Makefile
+++ b/src/goto-synthesizer/Makefile
@@ -1,4 +1,5 @@
-SRC = cegis_verifier.cpp \
+SRC = cegis_evaluator.cpp \
+      cegis_verifier.cpp \
       dump_loop_contracts.cpp \
       enumerative_loop_contracts_synthesizer.cpp \
       expr_enumerator.cpp \
diff --git a/src/goto-synthesizer/cegis_evaluator.cpp b/src/goto-synthesizer/cegis_evaluator.cpp
new file mode 100644
index 000000000000..bdb3d960dd81
--- /dev/null
+++ b/src/goto-synthesizer/cegis_evaluator.cpp
@@ -0,0 +1,343 @@
+/*******************************************************************\
+
+Module: Evaluate if an expression is consistent with examples
+
+Author: Qinheping Hu
+
+\*******************************************************************/
+
+/// \file
+/// Evaluate if an expression is consistent with examples
+
+#include "cegis_evaluator.h"
+
+#include <util/arith_tools.h>
+#include <util/format_expr.h>
+
+bool cegis_evaluatort::evaluate()
+{
+  auto is_inconsistent = false;
+  // Check if checked_expr is consistent with all examples.
+  for(const auto &cex : cexs)
+  {
+    // checked_expr is inconsistent with a positive example if its evaluation
+    // false.
+    is_inconsistent =
+      is_inconsistent || !evaluate_rec_bool(checked_expr, cex, 1);
+    // checked_expr is inconsistent with a negative example if its evaluation
+    // false.
+    is_inconsistent =
+      is_inconsistent || evaluate_rec_bool(checked_expr, cex, 0);
+  }
+  return !is_inconsistent;
+}
+
+bool cegis_evaluatort::evaluate_rec_bool(
+  const exprt &expr,
+  const cext &cex,
+  const bool is_positive)
+{
+  const auto id = expr.id();
+  // eval(AND op1 op2) :=
+  // eval(op1) && eval(op2)
+  if(id == ID_and)
+  {
+    bool result = true;
+    for(auto &op : expr.operands())
+    {
+      result = result && evaluate_rec_bool(op, cex, is_positive);
+    }
+    return result;
+  }
+
+  // eval(OR op1 op2) :=
+  // eval(op1) || eval(op2)
+  if(id == ID_or)
+  {
+    bool result = false;
+    for(auto &op : expr.operands())
+    {
+      result = result || evaluate_rec_bool(op, cex, is_positive);
+    }
+    return result;
+  }
+
+  // eval(IMPLIES op1 op2) :=
+  // eval(op1) => eval(op2)
+  if(id == ID_implies)
+  {
+    return !evaluate_rec_bool(expr.operands()[0], cex, is_positive) ||
+           evaluate_rec_bool(expr.operands()[1], cex, is_positive);
+  }
+
+  // eval(NOT op) :=
+  // !eval(op)
+  if(id == ID_not)
+  {
+    return !evaluate_rec_bool(expr.operands()[0], cex, is_positive);
+  }
+
+  // eval(EQUAL op1 op2) :=
+  // eval(op1) == eval(op2)
+  if(id == ID_equal)
+  {
+    return evaluate_rec_int(expr.operands()[0], cex, is_positive) ==
+           evaluate_rec_int(expr.operands()[1], cex, is_positive);
+  }
+
+  // eval(NOTEQUAL op1 op2) :=
+  // eval(op1) != eval(op2)
+  if(id == ID_notequal)
+  {
+    return evaluate_rec_int(expr.operands()[0], cex, is_positive) !=
+           evaluate_rec_int(expr.operands()[1], cex, is_positive);
+  }
+
+  // eval(LE op1 op2) :=
+  // eval(op1) <= eval(op2)
+  if(id == ID_le)
+  {
+    return evaluate_rec_int(expr.operands()[0], cex, is_positive) <=
+           evaluate_rec_int(expr.operands()[1], cex, is_positive);
+  }
+
+  // eval(LT op1 op2) :=
+  // eval(op1) < eval(op2)
+  if(id == ID_lt)
+  {
+    return evaluate_rec_int(expr.operands()[0], cex, is_positive) <
+           evaluate_rec_int(expr.operands()[1], cex, is_positive);
+  }
+
+  // eval(GE op1 op2) :=
+  // eval(op1) >= eval(op2)
+  if(id == ID_ge)
+  {
+    return evaluate_rec_int(expr.operands()[0], cex, is_positive) >=
+           evaluate_rec_int(expr.operands()[1], cex, is_positive);
+  }
+
+  // eval(GT op1 op2) :=
+  // eval(op1) > eval(op2)
+  if(id == ID_gt)
+  {
+    return evaluate_rec_int(expr.operands()[0], cex, is_positive) >
+           evaluate_rec_int(expr.operands()[1], cex, is_positive);
+  }
+
+  // eval(CONST op) :=
+  // op
+  if(id == ID_constant)
+  {
+    if(expr == true_exprt())
+    {
+      return true;
+    }
+    else if(expr == false_exprt())
+    {
+      return false;
+    }
+    UNREACHABLE_BECAUSE(
+      "Boolean constant must be either true or false: " + expr.pretty());
+  }
+
+  UNREACHABLE_BECAUSE(
+    "Found a unsupported boolean operator during quick filtering: " +
+    expr.id_string());
+}
+
+mp_integer cegis_evaluatort::evaluate_rec_int(
+  const exprt &expr,
+  const cext &cex,
+  const bool is_positive)
+{
+  const auto id = expr.id();
+  mp_integer result;
+
+  // For symbol expression, we look up their values from the example.
+  // There are three cases:
+  // 1. is_positive == true
+  //    We evaluate the expression on the positive examples, which is the
+  //    valuation of loop_entry(v). Note that the loop invariants must hold
+  //    for loop_entry valuations as the base case. So we look up the values
+  //    in the loop_entry set
+  // 2. is_positive == false, expr in the havoced set
+  //    We evaluate the expression on the negative examples---the havoced set.
+  // 3. is_positive == false, expr not in havoced set
+  //    The valuations of expr in the havoced iteration is the same as
+  //    in the loop entry. So we look up the values from loop_entry set.
+  if(id == ID_symbol)
+  {
+    if(
+      cex.havoced_values.find(expr) != cex.havoced_values.end() && !is_positive)
+    {
+      // Use havoc values if they exists and we are evaluating on
+      // the negative example---is_positive is false.
+      result = cex.havoced_values.at(expr);
+    }
+    else if(cex.loop_entry_values.find(expr) != cex.loop_entry_values.end())
+    {
+      result = cex.loop_entry_values.at(expr);
+    }
+    else
+    {
+      UNREACHABLE_BECAUSE(
+        "Variable not found in the havoced set or the un-havoced set: " +
+        expr.pretty());
+    }
+
+    // Typecast `result` to the type of `expr`.
+    auto tmp_expr = from_integer(result, expr.type());
+    to_integer(tmp_expr, result);
+    return result;
+  }
+
+  // For loop_entry expression, we look up their values from loop_entry set.
+  if(id == ID_loop_entry)
+  {
+    if(
+      cex.loop_entry_values.find(expr.operands()[0]) !=
+      cex.loop_entry_values.end())
+    {
+      result = cex.loop_entry_values.at(expr.operands()[0]);
+    }
+    else
+    {
+      UNREACHABLE_BECAUSE(
+        "Variable not found in the havoced set or the un-havoced set: " +
+        expr.pretty());
+    }
+
+    // Typecast `result` to the type of `expr`.
+    auto tmp_expr = from_integer(result, expr.type());
+    to_integer(tmp_expr, result);
+    return result;
+  }
+
+  // Evaluate the underlying expression and then typecast the result.
+  if(id == ID_typecast)
+  {
+    // Typecast `result` to the type of `expr`.
+    result = evaluate_rec_int(expr.operands()[0], cex, is_positive);
+    auto tmp_expr = from_integer(result, expr.type());
+    to_integer(tmp_expr, result);
+    return result;
+  }
+
+  // For object_size expression, look up the size of the underlying pointer in
+  // the example `cex`.
+  if(id == ID_object_size)
+  {
+    if(cex.object_sizes.find(expr.operands()[0]) != cex.object_sizes.end())
+    {
+      result = cex.object_sizes.at(expr.operands()[0]);
+    }
+    else
+    {
+      UNREACHABLE_BECAUSE(
+        "Variable not found in the object size set: " + expr.pretty());
+    }
+
+    // Typecast `result` to the type of `expr`.
+    auto tmp_expr = from_integer(result, expr.type());
+    to_integer(tmp_expr, result);
+    return result;
+  }
+
+  // For pointer_offset expression, look up the offset of the underlying
+  // pointer in the example `cex`.
+  // A pointer offset expression can be of form
+  // pointer_offset(p)
+  // or
+  // pointer_offset(loop_entry(p))
+  // for some pointer p.
+  // For the first case, we look up the offset in havoced
+  // offset set. Note that if the pointer is not in the havoced set or
+  // is_positive is set, we look up in loop_entry_offset set instead.
+  // For the second case, we look up the offset in the loop_entry_offset set.
+  if(id == ID_pointer_offset)
+  {
+    if(expr.operands()[0].id() != ID_loop_entry)
+    {
+      // If the expression is pointer_offset(p), look up the offset in the
+      // havoced offset set.
+      if(
+        cex.havoced_pointer_offsets.find(expr.operands()[0]) !=
+          cex.havoced_pointer_offsets.end() &&
+        !is_positive)
+      {
+        // Use havoc values only if we are evaluating the expression against
+        // the negative example---is_positive is false.
+        result = cex.havoced_pointer_offsets.at(expr.operands()[0]);
+      }
+      else if(
+        cex.loop_entry_offsets.find(expr.operands()[0]) !=
+        cex.loop_entry_offsets.end())
+      {
+        // The pointer is not havoced. So look up the offset in the loop-entry
+        // set instead.
+        result = cex.loop_entry_offsets.at(expr.operands()[0]);
+      }
+      else
+      {
+        UNREACHABLE_BECAUSE(
+          "Variable not found in the offset set: " + expr.pretty());
+      }
+    }
+    else
+    {
+      // If the expression is pointer_offset(loop_entry(p)), look up the
+      // offset in the offset-of-loop-entry-set.
+      if(
+        cex.loop_entry_offsets.find(expr.operands()[0].operands()[0]) !=
+        cex.loop_entry_offsets.end())
+      {
+        result = cex.loop_entry_offsets.at(expr.operands()[0].operands()[0]);
+      }
+      else
+      {
+        UNREACHABLE_BECAUSE(
+          "Variable not found in the offset set: " + expr.pretty());
+      }
+    }
+
+    // Typecast `result` to the type of `expr`.
+    auto tmp_expr = from_integer(result, expr.type());
+    to_integer(tmp_expr, result);
+    return result;
+  }
+
+  // For a constant expression, return its evaluation.
+  if(id == ID_constant)
+  {
+    to_integer(to_constant_expr(expr), result);
+    return result;
+  }
+
+  // For a plus expression, return the sum of the evaluations of two operands.
+  if(id == ID_plus)
+  {
+    result = evaluate_rec_int(expr.operands()[0], cex, is_positive) +
+             evaluate_rec_int(expr.operands()[1], cex, is_positive);
+
+    // Typecast `result` to the type of `expr`.
+    auto tmp_expr = from_integer(result, expr.type());
+    to_integer(tmp_expr, result);
+    return result;
+  }
+
+  // For a minus expression, return difference of evaluations of two operands.
+  if(id == ID_minus)
+  {
+    result = evaluate_rec_int(expr.operands()[0], cex, is_positive) -
+             evaluate_rec_int(expr.operands()[1], cex, is_positive);
+
+    // Typecast `result` to the type of `expr`.
+    auto tmp_expr = from_integer(result, expr.type());
+    to_integer(tmp_expr, result);
+    return result;
+  }
+
+  UNREACHABLE_BECAUSE(
+    "Found a unsupported operator during quick filtering: " + expr.id_string());
+}
diff --git a/src/goto-synthesizer/cegis_evaluator.h b/src/goto-synthesizer/cegis_evaluator.h
new file mode 100644
index 000000000000..b7897d56d877
--- /dev/null
+++ b/src/goto-synthesizer/cegis_evaluator.h
@@ -0,0 +1,54 @@
+/*******************************************************************\
+
+Module: Evaluate if an expression is consistent with examples
+
+Author: Qinheping Hu
+
+\*******************************************************************/
+
+/// \file
+/// Evaluate if an expression is consistent with examples
+
+#ifndef CPROVER_GOTO_SYNTHESIZER_CEGIS_EVALUATOR_H
+#define CPROVER_GOTO_SYNTHESIZER_CEGIS_EVALUATOR_H
+
+#include "cegis_verifier.h"
+
+/// Evaluator for checking if an expression is consistent with a given set of
+/// test cases (positive examples and negative examples).
+class cegis_evaluatort
+{
+public:
+  cegis_evaluatort(
+    const exprt &expr,
+    const std::vector<cext> &cexs)
+    : checked_expr(expr), cexs(cexs)
+  {
+  }
+
+  // Evaluate if the expression `checked_expr` is consistent with `cexs`.
+  // Return true if `checked_expr` is consistent with all examples.
+  bool evaluate();
+
+protected:
+  // Recursively evaluate boolean expressions on `cex`.
+  // If `is_positive` is set, evaluate on the positive example in `cex`.
+  // The positive example is the test collected from the first iteration of
+  // loop---the loop_entry valuation.
+  bool
+  evaluate_rec_bool(const exprt &expr, const cext &cex, const bool is_positive);
+
+  // Recursively evaluate integer expressions on `cex`.
+  // If `is_positive` is set, evaluate on the positive example in `cex`.
+  // The positive example is the test collected from the first iteration of
+  // loop---the loop_entry valuation.
+  mp_integer
+  evaluate_rec_int(const exprt &expr, const cext &cex, const bool is_positive);
+
+  /// @brief The expression being evaluated.
+  const exprt &checked_expr;
+  /// @brief The set of examples evaluated against.
+  const std::vector<cext> &cexs;
+};
+
+#endif // CPROVER_GOTO_SYNTHESIZER_CEGIS_EVALUATOR_H
diff --git a/src/goto-synthesizer/cegis_verifier.cpp b/src/goto-synthesizer/cegis_verifier.cpp
index e3d95dbf0311..b45e0b420a54 100644
--- a/src/goto-synthesizer/cegis_verifier.cpp
+++ b/src/goto-synthesizer/cegis_verifier.cpp
@@ -464,7 +464,8 @@ cext cegis_verifiert::build_cex(
           {
             if(
               underlying_array.id() == ID_address_of ||
-              underlying_array.id() == ID_index)
+              underlying_array.id() == ID_index ||
+              underlying_array.id() == ID_typecast)
             {
               underlying_array = underlying_array.operands()[0];
               continue;
diff --git a/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.cpp b/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.cpp
index fd55145916ce..d4cf8e8ec6b0 100644
--- a/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.cpp
+++ b/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.cpp
@@ -26,9 +26,10 @@ Author: Qinheping Hu
 #include <goto-instrument/havoc_utils.h>
 #include <goto-instrument/loop_utils.h>
 
-#include "cegis_verifier.h"
+#include "cegis_evaluator.h"
 #include "expr_enumerator.h"
 
+
 // substitute all tmp_post variables with their origins in `expr`
 void replace_tmp_post(
   exprt &dest,
@@ -289,7 +290,8 @@ exprt enumerative_loop_contracts_synthesizert::synthesize_same_object_predicate(
 exprt enumerative_loop_contracts_synthesizert::synthesize_strengthening_clause(
   const std::vector<exprt> terminal_symbols,
   const loop_idt &cause_loop_id,
-  const irep_idt &violation_id)
+  const irep_idt &violation_id,
+  const std::vector<cext> &cexs)
 {
   // Synthesis of strengthening clauses is a enumerate-and-check process.
   // We first construct the enumerator for the following grammar.
@@ -350,6 +352,10 @@ exprt enumerative_loop_contracts_synthesizert::synthesize_strengthening_clause(
   // starting from 0
   size_t size_bound = 0;
 
+  // Count how many candidates are filtered out by the quick filter.
+  size_t count_all = 0;
+  size_t count_filtered = 0;
+
   // Start to enumerate and check.
   while(true)
   {
@@ -369,6 +375,16 @@ exprt enumerative_loop_contracts_synthesizert::synthesize_strengthening_clause(
       const auto &combined_invariant = combine_in_and_post_invariant_clauses(
         new_in_clauses, new_pos_clauses, neg_guards);
 
+      // Quick filter:
+      // Rule out a candidate if its evaluation is inconsistent with examples.
+      cegis_evaluatort evaluator(strengthening_candidate, cexs);
+      count_all++;
+      if(!evaluator.evaluate())
+      {
+        count_filtered++;
+        continue;
+      }
+
       // The verifier we use to check current invariant candidates.
       cegis_verifiert verifier(
         combined_invariant, assigns_map, goto_model, options, log);
@@ -386,6 +402,8 @@ exprt enumerative_loop_contracts_synthesizert::synthesize_strengthening_clause(
          return_cex->violation_type !=
            cext::violation_typet::cex_not_preserved))
       {
+        log.progress() << "Quick filter: " << count_filtered << " out of "
+                       << count_all << " candidates were filtered out.\n";
         return strengthening_candidate;
       }
     }
@@ -416,8 +434,12 @@ invariant_mapt enumerative_loop_contracts_synthesizert::synthesize_all()
   log.debug() << "Start the first synthesis iteration." << messaget::eom;
   auto return_cex = verifier.verify();
 
+  // Counterexamples we have seen.
+  std::vector<cext> cexs;
+
   while(return_cex.has_value())
   {
+    cexs.push_back(return_cex.value());
     exprt new_invariant_clause = true_exprt();
     // Synthesize the new_clause
     // We use difference strategies for different type of violations.
@@ -449,7 +471,8 @@ invariant_mapt enumerative_loop_contracts_synthesizert::synthesize_all()
       new_invariant_clause = synthesize_strengthening_clause(
         terminal_symbols,
         return_cex->cause_loop_ids.front(),
-        verifier.target_violation_id);
+        verifier.target_violation_id,
+        cexs);
       break;
 
     case cext::violation_typet::cex_not_hold_upon_entry:
diff --git a/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.h b/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.h
index f39d1f6700b7..fd065e7e69f5 100644
--- a/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.h
+++ b/src/goto-synthesizer/enumerative_loop_contracts_synthesizer.h
@@ -16,6 +16,7 @@ Author: Qinheping Hu
 
 #include <util/options.h>
 
+#include "cegis_verifier.h"
 #include "loop_contracts_synthesizer_base.h"
 
 class messaget;
@@ -80,7 +81,8 @@ class enumerative_loop_contracts_synthesizert
   exprt synthesize_strengthening_clause(
     const std::vector<exprt> terminal_symbols,
     const loop_idt &cause_loop_id,
-    const irep_idt &violation_id);
+    const irep_idt &violation_id,
+    const std::vector<cext> &cexs);
 
   /// Synthesize assigns target and update assigns_map.
   void synthesize_assigns(