# application-multinomial.conf

deepdive {

  # Default database connection. Every value except the driver class is taken
  # from an environment substitution (PGHOST, PGPORT, DBNAME, PGUSER,
  # PGPASSWORD).
  db.default: {
    driver: "org.postgresql.Driver"
    # JDBC URL built by concatenating quoted pieces with substitutions.
    # The trailing `#"` is a comment, presumably there only to re-balance the
    # quotes for syntax highlighters.
    url: "jdbc:postgresql://"${PGHOST}":"${PGPORT}"/"${DBNAME} #"
    user: ${PGUSER}
    password: ${PGPASSWORD}
    dbname: ${DBNAME}
    host: ${PGHOST}
    port: ${PGPORT}
  }

  # The only random variable declared in this file: the `label` column of the
  # `variable` table, a categorical variable with 4 values.
  schema.variables {
    # labels
    variable.label: Categorical(4) 
    # Value legend (matches the values written by ext_sup_variable_label):
    # 0: none    - no candidate is correct
    # 1: T       - the candidate whose source is 'T' is correct
    # 2: C       - the candidate whose source is 'C' is correct
    # 3: CT      - the candidate whose source is 'CT' is correct
  }

  extraction.extractors: {

    # Builds the `variable` table from cand_word.
    # The cand_word table must already be loaded before this extractor runs.
    ext_prepare_variable {
      style: "sql_extractor"
      # before: util/fill_sequence.sh cand_word cand_word_id
      # The SQL below:
      #   1. assigns every cand_word row the textual key
      #      "<docid>@<varid>_<candid>.<wordid>" in cand_word_id;
      #   2. drops and recreates `variable`;
      #   3. inserts one row per distinct (docid, varid) with
      #      variable_id = "<docid>@<varid>".
      # `label` and `id` are not populated here.
      # The DISTRIBUTED BY clause requires a database that accepts it
      # (e.g. Greenplum).
      sql: """
        UPDATE cand_word
        SET cand_word_id = docid || '@' || varid || '_' || candid || '.' || wordid;

        DROP TABLE IF EXISTS variable cascade;

        CREATE TABLE variable(
          docid TEXT,
          varid INT,
          label INT,
          variable_id TEXT,
          id BIGINT
          ) DISTRIBUTED BY (docid);

        INSERT INTO variable(
                docid, 
                varid,
                variable_id)
        SELECT  DISTINCT docid, varid, 
                (docid || '@' || varid) AS variable_id
        FROM    cand_word;

      """
      # after: util/fill_sequence.sh variable variable_id
      dependencies: []
    }

    # Builds the `candidate` table (one row per distinct variable/candid/source
    # combination in cand_word) and back-fills cand_word.candidate_id.
    ext_prepare_candidate {
      style: "sql_extractor"
      # The SQL below:
      #   1. drops and recreates `candidate`;
      #   2. joins cand_word with `variable` on (docid, varid) and inserts the
      #      distinct candidates, with candidate_id = "<variable_id>_<candid>";
      #   3. copies candidate_id back into cand_word, matching on
      #      (docid, varid, candid).
      # `label` and `id` are not populated here.
      sql: """
        DROP TABLE IF EXISTS candidate CASCADE;

        CREATE TABLE candidate(
          variable_id   TEXT,
          docid         TEXT,
          varid         INT,
          candid        INT,
          source        TEXT,
          label         BOOLEAN,
          candidate_id  TEXT,
          id            BIGINT
        )
        DISTRIBUTED BY (docid);

        INSERT INTO candidate(variable_id, docid, varid, candid, source, candidate_id) 
        SELECT DISTINCT 
                variable.variable_id, 
                variable.docid, 
                variable.varid, 
                candid, 
                source,
                (variable.variable_id || '_' || candid) AS candidate_id
        FROM    cand_word, variable 
        WHERE   variable.docid = cand_word.docid
        AND     variable.varid = cand_word.varid;

        UPDATE  cand_word 
        SET     candidate_id = candidate.candidate_id
        FROM    candidate
        WHERE   cand_word.docid = candidate.docid
          AND   cand_word.varid = candidate.varid
          AND   cand_word.candid = candidate.candid
        ;

      """
      # after: util/fill_sequence.sh candidate candidate_id
      # Needs the `variable` table created by ext_prepare_variable.
      dependencies: ["ext_prepare_variable"]
    }

    # Preparation step that runs before documents are held out.
    # Creates `document` (one row per distinct cand_word.docid, with a
    # BIGSERIAL document_id) and an untouched copy of it, `document_backup`.
    ext_prepare_document {
      style: "sql_extractor"
      # ext_holdout_document later compares `document` against
      # `document_backup` to recover the docids of held-out documents.
      sql: """
          DROP TABLE IF EXISTS document CASCADE;

          CREATE TABLE document(
              document_id BIGSERIAL, 
              docid TEXT)
          DISTRIBUTED BY (document_id);

          INSERT INTO document(docid) 
          SELECT DISTINCT docid 
          FROM cand_word 
          ORDER BY docid;

          DROP TABLE IF EXISTS document_backup CASCADE;

          SELECT * 
          INTO document_backup 
          FROM document;

          -- CREATE VIEW eval_docs AS
          -- SELECT docid 
          -- FROM document_backup 
          -- WHERE document_id IN (
            -- SELECT document_id FROM document WHERE docid IS NULL
          -- );
      """
      dependencies: ["ext_prepare_variable"]
    }
    # K-fold holdout: selects a fraction of documents as the evaluation set.
    # The `before` script (kfold.py) receives KFOLD_NUM, KFOLD_ITER and the
    # names `document`, `document_id`, `docid`. Judging by the SQL below, it is
    # expected to set document.docid to NULL for held-out rows — confirm
    # against udf/kfold.py.
    ext_holdout_document {
      style: "sql_extractor" # "cmd + sql"
      dependencies: ["ext_prepare_document"]
      before: ${APP_HOME}"/udf/kfold.py "${KFOLD_NUM}" "${KFOLD_ITER}" document document_id docid"  
      # IMPORTANT: holdout result is reproducible across different runs.
      # eval_docs = docids (taken from document_backup) of every row whose
      # docid is NULL in `document`.
      sql: """
          DROP TABLE IF EXISTS eval_docs CASCADE;

          CREATE TABLE eval_docs AS 
          SELECT docid 
          FROM document_backup 
          WHERE document_id IN (
            SELECT document_id FROM document WHERE docid IS NULL
          ) DISTRIBUTED BY (docid);
      """
    }
    # Alternative holdout step, implemented entirely by the shell script
    # before_ext_fold_from_orderaware.sh.
    # NOTE(review): the original comment ended mid-sentence ("documents defined
    # by ."); what defines the held-out documents is not visible in this file.
    ext_holdout_from_orderaware {
      style: "cmd_extractor"
      cmd: ${APP_HOME}"/udf/before_ext_fold_from_orderaware.sh"
    }

    # Extract naive OCR-specific features.
    # Streams (docid, cand_word_id, word, source) rows from cand_word as TSV to
    # ext_naivefeature.py (run under pypy) and stores the output in `feature`.
    ext_naivefeature {
      style: "tsv_extractor"
      # cand_word.cand_word_id is populated by ext_prepare_variable, so that
      # extractor must finish first whenever both are in the same pipeline.
      dependencies: ["ext_prepare_variable"]
      input: """
        SELECT  docid, cand_word_id, word, source
          FROM  cand_word
        """
      output_relation: "feature"
      before: ${APP_HOME}"/udf/before_naivefeature.sh"
      udf: "pypy "${APP_HOME}"/udf/ext_naivefeature.py"
      parallelism: ${MAX_PARALLELISM}
      # input_batch_size: 5000
      # output_batch_size: 5000
    }

    # Per-document 1-gram extraction from the domain corpus.
    # Recorded run statistics:
    # ext_prepare_domain_1gram SUCCESS [144211 ms]
    # INSERT 0 8408220 (group by docs)
    # SELECT 674194 (aggregated)
    ext_prepare_domain_1gram {
      style: "plpy_extractor"
      dependencies: []
      # Each non-empty article is split on the '~^~' delimiter into a word
      # array; gram_len and count_filter are passed to the UDF as constant
      # columns.
      input: """
        SELECT docid, 
          string_to_array(article, '~^~') as words,
          1 as gram_len,
          0 as count_filter
        FROM domain_corpus
        WHERE article != '';
      """
      output_relation: "doc_domain_1gram"
      before: ${APP_HOME}"/udf/before_prepare_domain_ngram.sh 1"
      udf: ${APP_HOME}"/udf/ext_prepare_domain_ngram.py"
      # 1: gram_len, 0: count_filter
      # after: ${APP_HOME}"/udf/after_prepare_domain_ngram.sh 1 0"
    }

    # Per-document 2-gram extraction from the domain corpus.
    # Recorded run statistics:
    # ext_prepare_domain_2gram SUCCESS [334037 ms]
    # INSERT 0 22982529
    # SELECT 6197359
    ext_prepare_domain_2gram {
      style: "plpy_extractor"
      dependencies: []
      # Each non-empty article is split on the '~^~' delimiter into a word
      # array; gram_len and count_filter are passed to the UDF as constant
      # columns.
      input: """
        SELECT docid, 
          string_to_array(article, '~^~') as words,
          2 as gram_len,
          0 as count_filter
        FROM domain_corpus
        WHERE article != '';
      """
      output_relation: "doc_domain_2gram"
      before: ${APP_HOME}"/udf/before_prepare_domain_ngram.sh 2"
      udf: ${APP_HOME}"/udf/ext_prepare_domain_ngram.py"
      # 2: gram_len, 0: count_filter
      # after: ${APP_HOME}"/udf/after_prepare_domain_ngram.sh 2 0"
    }

    # Per-document 3-gram extraction from the domain corpus.
    # Recorded run statistics:
    # ext_prepare_domain_3gram SUCCESS [174428 ms]
    # INSERT 0 31118773 (group by docs)
    # SELECT 16550020 (aggregated)
    ext_prepare_domain_3gram {
      style: "plpy_extractor"
      dependencies: []
      # Each non-empty article is split on the '~^~' delimiter into a word
      # array; gram_len and count_filter are passed to the UDF as constant
      # columns.
      input: """
        SELECT docid, 
          string_to_array(article, '~^~') as words,
          3 as gram_len,
          0 as count_filter
        FROM domain_corpus
        WHERE article != '';
      """
      output_relation: "doc_domain_3gram"
      before: ${APP_HOME}"/udf/before_prepare_domain_ngram.sh 3"
      udf: ${APP_HOME}"/udf/ext_prepare_domain_ngram.py"
      # 3: gram_len, 0: count_filter
      # after: ${APP_HOME}"/udf/after_prepare_domain_ngram.sh 3 0"
    }

    # Get domain N-gram counts excluding evaluation docs.
    # For N = 1, 2, 3: sums the per-document counts in doc_domain_<N>gram over
    # every document that is NOT listed in eval_docs and stores the totals in
    # domain_<N>gram.
    ext_domain_ngram {
      style: "sql_extractor"
      # The queries read eval_docs, which ext_holdout_document creates, so the
      # holdout steps are listed here exactly as in the other extractors that
      # read eval_docs.
      dependencies: ["ext_prepare_domain_1gram","ext_prepare_domain_2gram","ext_prepare_domain_3gram", "ext_holdout_document", "ext_holdout_from_orderaware"]
      sql: """
          DROP TABLE IF EXISTS domain_1gram CASCADE;
          DROP TABLE IF EXISTS domain_2gram CASCADE;
          DROP TABLE IF EXISTS domain_3gram CASCADE;

          CREATE TABLE domain_1gram AS 
          SELECT   ngram, sum(count) as count
          FROM     doc_domain_1gram
          WHERE    NOT EXISTS (
                   SELECT * 
                   FROM eval_docs 
                   WHERE eval_docs.docid = doc_domain_1gram.docid
          ) GROUP BY ngram
          DISTRIBUTED BY (ngram);

          CREATE TABLE domain_2gram AS 
          SELECT   ngram, sum(count) as count
          FROM     doc_domain_2gram
          WHERE    NOT EXISTS (
                   SELECT * 
                   FROM eval_docs 
                   WHERE eval_docs.docid = doc_domain_2gram.docid
          ) GROUP BY ngram
          DISTRIBUTED BY (ngram);

          CREATE TABLE domain_3gram AS 
          SELECT   ngram, sum(count) as count
          FROM     doc_domain_3gram
          WHERE    NOT EXISTS (
                   SELECT * 
                   FROM eval_docs 
                   WHERE eval_docs.docid = doc_domain_3gram.docid
          ) GROUP BY ngram
          DISTRIBUTED BY (ngram);
      """
      # cmd: ${APP_HOME}"/udf/after_prepare_domain_ngram.sh 1 0 ; "${APP_HOME}"/udf/after_prepare_domain_ngram.sh 2 0 ; "${APP_HOME}"/udf/after_prepare_domain_ngram.sh 3 0 ; "
    }

    #################### NGRAM SUPERVISION ####################

    # Prepares the N-grams used for supervision: for every document in
    # html_seq, the words (ordered by wordid) are aggregated into one array and
    # handed to ext_prepare_supv_ngram.py; output goes to supv_ngram.
    ext_prepare_supv_ngram {
      style: "plpy_extractor"
      dependencies: ["ext_holdout_document", "ext_prepare_document", "ext_holdout_from_orderaware"]
      # The triple-quoted string is closed and reopened around
      # ${SUPV_GRAM_LEN} so that the N-gram length is substituted into the SQL.
      input: """
        SELECT docid, 
          array_agg(word order by wordid) as words,
          """${SUPV_GRAM_LEN}""" as gram_len
        FROM html_seq GROUP BY (docid);
      """
      ###### TODO: would reading from domain_corpus instead of html_seq be faster?
      output_relation: "supv_ngram"
      before: ${APP_HOME}"/udf/before_prepare_supv_ngram.sh"
      udf: ${APP_HOME}"/udf/ext_prepare_supv_ngram.py"
    }

    # Extract all candidate 2-grams.
    # The gram length is fixed to 2 here; the result is used ONLY for the
    # Google-Ngram features (see ext_cand_2gram_feature).
    ext_cand_2gram {
      style: "plpy_extractor"
      dependencies: ["ext_prepare_document", "ext_prepare_variable", "ext_prepare_candidate"]
      # One input row per document. All arrays are aggregated with the same
      # ordering (varid, candid, wordid), so equal positions in different
      # arrays refer to the same cand_word row.
      input: """
      SELECT 
        docid,
        array_agg(cand_word_id order by varid, candid, wordid) as arr_cand_word_id,
        array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
        array_agg(varid order by varid, candid, wordid) as arr_varid,
        array_agg(candid order by varid, candid, wordid) as arr_candid,
        array_agg(word order by varid, candid, wordid) as arr_feature,
        2 as gram_len
      FROM cand_word
      GROUP BY docid
      """
      output_relation: "cand_2gram"
      # Previous pypy-based implementation, kept for reference:
      # udf: "pypy "${APP_HOME}"/udf/ext_cand_ngram.py 2"
      udf: ${APP_HOME}"/udf/ext_cand_ngram_plpy.py"
      # parallelism: ${MAX_PARALLELISM}
      before: ${APP_HOME}"/udf/before_cand_ngram.sh 2"
    }

    # Extract all candidate N-grams, with N taken from SUPV_GRAM_LEN.
    # Used for BOTH supervision and features.
    # Same query as ext_cand_2gram except that gram_len, the output relation
    # name and the argument of the `before` script are derived from
    # ${SUPV_GRAM_LEN}.
    ext_cand_ngram {
      style: "plpy_extractor"
      dependencies: ["ext_prepare_document", "ext_prepare_variable", "ext_prepare_candidate"]
      # One input row per document; all arrays share the ordering
      # (varid, candid, wordid).
      input: """
      SELECT 
        docid,
        array_agg(cand_word_id order by varid, candid, wordid) as arr_cand_word_id,
        array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
        array_agg(varid order by varid, candid, wordid) as arr_varid,
        array_agg(candid order by varid, candid, wordid) as arr_candid,
        array_agg(word order by varid, candid, wordid) as arr_feature,
        """${SUPV_GRAM_LEN}""" as gram_len
      FROM cand_word
      GROUP BY docid
      """
      # Resolves to cand_<N>gram, e.g. cand_3gram when SUPV_GRAM_LEN is 3.
      output_relation: "cand_"${SUPV_GRAM_LEN}"gram"
      # Previous pypy-based implementation, kept for reference:
      # udf: "pypy "${APP_HOME}"/udf/ext_cand_ngram.py 2"
      udf: ${APP_HOME}"/udf/ext_cand_ngram_plpy.py"
      # parallelism: ${MAX_PARALLELISM}
      before: ${APP_HOME}"/udf/before_cand_ngram.sh "${SUPV_GRAM_LEN}
    }


    # N-gram supervision: runs ext_sup_ngram.sh with SUPV_GRAM_LEN as its only
    # argument. It is ordered after the holdout steps and after the two
    # extractors that prepare the candidate and supervision N-grams.
    ext_sup_ngram {
      style: "cmd_extractor"
      dependencies: ["ext_holdout_document", "ext_prepare_document", "ext_holdout_from_orderaware", "ext_cand_ngram", "ext_prepare_supv_ngram"]
      cmd: "bash "${APP_HOME}"/udf/ext_sup_ngram.sh "${SUPV_GRAM_LEN}
    }

    # array_agg(id order by varid, candid, wordid) as arr_id,
    # array_agg(candid order by varid, candid, wordid) as arr_candid,
    # array_agg(wordid order by varid, candid, wordid) as arr_wordid,
    # TAKES VERY LONG
    ########### TODO at least change this to TSV... ############
    # Order-aware supervision. For every document in cand_word, the candidate
    # ids, variable ids and words are aggregated into arrays (all ordered by
    # varid, candid, wordid) and passed to ext_sup_orderaware.py under pypy;
    # the output is stored in orderaware_supv_label.
    # No `style` is set, so the framework's default extractor style applies —
    # presumably the JSON extractor; TODO confirm.
    ext_sup_orderaware {
      dependencies: ["ext_holdout_document", "ext_prepare_document", "ext_holdout_from_orderaware"]
      input: """select 
        docid,
        array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
        array_agg(varid order by varid, candid, wordid) as arr_varid,
        array_agg(word order by varid, candid, wordid) as arr_word
        from cand_word
        group by docid
        """
      # Optional restriction to evaluation documents (disabled):
      # where docid in (select * from eval_docs)
      output_relation: "orderaware_supv_label"
      # Supervision dir: the script is invoked without a directory argument.
      udf: "pypy "${APP_HOME}"/udf/ext_sup_orderaware.py "
      # # Using Evaluation dir
      # udf: "pypy "${APP_HOME}"/udf/ext_sup_orderaware.py "${EVAL_DIR}
      parallelism: ${MAX_PARALLELISM}
      before: ${APP_HOME}"/udf/before_sup_orderaware.sh"
      # after: ${APP_HOME}"/udf/after_sup_orderaware.sh"
      # Each input row is a whole document, hence the input batch size of 1.
      input_batch_size: 1
      output_batch_size: 10000
    }

    # "Best pick" evaluation labels over ALL sources: same query shape as
    # ext_sup_orderaware, but restricted to the documents in eval_docs.
    # Output goes to orderaware_eval_label_bestpick.
    ext_eval_orderaware_bestpick {
      dependencies: ["ext_holdout_document", "ext_prepare_document", "ext_holdout_from_orderaware"]
      input: """select 
        docid,
        array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
        array_agg(varid order by varid, candid, wordid) as arr_varid,
        array_agg(word order by varid, candid, wordid) as arr_word
        from cand_word
        where docid in (select * from eval_docs)
        group by docid
        """
      # where docid in (select * from eval_docs)
      output_relation: "orderaware_eval_label_bestpick"
      # NOTE(review): the first argument is a hard-coded, machine-specific
      # absolute path; the second is EVAL_DIR.
      udf: "pypy "${APP_HOME}"/udf/ext_sup_orderaware.py /lfs/local/0/zifei/bestpick-result/ "${EVAL_DIR}
      parallelism: ${MAX_PARALLELISM}
      # The `before` script receives the output relation name as its argument.
      before: ${APP_HOME}"/udf/before_sup_orderaware_bestpick.sh orderaware_eval_label_bestpick"
      # after: ${APP_HOME}"/udf/after_sup_orderaware.sh"
      input_batch_size: 1
      output_batch_size: 10000
    }
    # "Best pick" evaluation labels restricted to candidates whose source is
    # 'T', 'CT' or 'TC' (i.e. every source value containing T), for documents
    # in eval_docs. Output goes to orderaware_eval_label_bestpick_tess.
    ext_eval_orderaware_bestpick_tess {
      dependencies: ["ext_holdout_document", "ext_prepare_document", "ext_holdout_from_orderaware"]
      input: """select 
        docid,
        array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
        array_agg(varid order by varid, candid, wordid) as arr_varid,
        array_agg(word order by varid, candid, wordid) as arr_word
        from cand_word
        where docid in (select * from eval_docs)
        and (source = 'T' or source = 'CT' or source = 'TC')
        group by docid
        """
      # where docid in (select * from eval_docs)
      output_relation: "orderaware_eval_label_bestpick_tess"
      # NOTE(review): the first argument is a hard-coded, machine-specific
      # absolute path; the second is EVAL_DIR.
      udf: "pypy "${APP_HOME}"/udf/ext_sup_orderaware.py /lfs/local/0/zifei/bestpick-result-tess/ "${EVAL_DIR}
      parallelism: ${MAX_PARALLELISM}
      # The `before` script receives the output relation name as its argument.
      before: ${APP_HOME}"/udf/before_sup_orderaware_bestpick.sh orderaware_eval_label_bestpick_tess"
      # after: ${APP_HOME}"/udf/after_sup_orderaware.sh"
      input_batch_size: 1
      output_batch_size: 10000
    }
    # "Best pick" evaluation labels restricted to candidates whose source is
    # 'C', 'CT' or 'TC' (i.e. every source value containing C), for documents
    # in eval_docs. Output goes to orderaware_eval_label_bestpick_cuni.
    ext_eval_orderaware_bestpick_cuni {
      dependencies: ["ext_holdout_document", "ext_prepare_document", "ext_holdout_from_orderaware"]
      input: """select 
        docid,
        array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
        array_agg(varid order by varid, candid, wordid) as arr_varid,
        array_agg(word order by varid, candid, wordid) as arr_word
        from cand_word
        where docid in (select * from eval_docs)
        and (source = 'C' or source = 'CT' or source = 'TC')
        group by docid
        """
      # where docid in (select * from eval_docs)
      output_relation: "orderaware_eval_label_bestpick_cuni"
      # NOTE(review): the first argument is a hard-coded, machine-specific
      # absolute path; the second is EVAL_DIR.
      udf: "pypy "${APP_HOME}"/udf/ext_sup_orderaware.py /lfs/local/0/zifei/bestpick-result-cuni/ "${EVAL_DIR}
      parallelism: ${MAX_PARALLELISM}
      # The `before` script receives the output relation name as its argument.
      before: ${APP_HOME}"/udf/before_sup_orderaware_bestpick.sh orderaware_eval_label_bestpick_cuni"
      # after: ${APP_HOME}"/udf/after_sup_orderaware.sh"
      input_batch_size: 1
      output_batch_size: 10000
    }


    # Incremental variant of ext_sup_orderaware: only processes documents that
    # have no row yet in orderaware_supv_label and writes into that same
    # relation. The `before` script is commented out here; presumably that is
    # what keeps the existing rows in place — TODO confirm.
    ext_sup_orderaware_incremental {
      dependencies: ["ext_holdout_document", "ext_prepare_document", "ext_holdout_from_orderaware"]
      input: """select 
        docid,
        array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
        array_agg(varid order by varid, candid, wordid) as arr_varid,
        array_agg(word order by varid, candid, wordid) as arr_word
        from cand_word
        where not exists
        (select DISTINCT docid from orderaware_supv_label
          where orderaware_supv_label.docid = cand_word.docid
          )
        group by docid
        """
        # TODO: verify that the NOT EXISTS filter above selects the intended documents.
      # where docid in (select * from eval_docs)
      output_relation: "orderaware_supv_label"
      udf: "pypy "${APP_HOME}"/udf/ext_sup_orderaware.py"
      parallelism: ${MAX_PARALLELISM}
      # before: ${APP_HOME}"/udf/before_sup_orderaware.sh"
      # after: ${APP_HOME}"/udf/after_sup_orderaware.sh"
      input_batch_size: 1
      output_batch_size: 10000
    }
    # Extracts nothing; only applies the existing order-aware labels to
    # candidate.label.
    ext_sup_use_orderaware {
      dependencies: ["ext_sup_orderaware"]
      style: "sql_extractor"
      # Steps:
      #   1. reset every candidate label to NULL;
      #   2. set label = true for candidates labelled true in
      #      orderaware_supv_label;
      #   3. set label = false for the remaining candidates of any document
      #      that appears in orderaware_supv_label.
      # Candidates of documents absent from orderaware_supv_label stay NULL.
      sql: """
        update candidate set label = NULL;

        update candidate
        set label = true
        where candidate_id in 
        (select candidate_id from orderaware_supv_label where label = true);

        update candidate 
        set label = false 
        where label is null 
          and docid in (select distinct docid from orderaware_supv_label);
      """
    }


    # TODO
    # Derives the categorical variable.label from the boolean candidate labels.
    # Label values: 0 = none, 1 = T, 2 = C, 3 = CT (see schema.variables).
    #
    # Statement order matters:
    #   1. reset every variable label to NULL;
    #   2. label 3 when the combined-source candidate is true. Both spellings
    #      'CT' and 'TC' are accepted, matching the source filters used by the
    #      ext_eval_orderaware_bestpick_* extractors;
    #   3. label 2 when a 'C' candidate is true (overwrites unconditionally);
    #   4. label 1 when a 'T' candidate is true and no label was set yet;
    #   5. label 0 when the label is still NULL and every candidate of the
    #      variable is explicitly false;
    #   6. reset to NULL any variable labelled 2 that also has a true 'T'
    #      candidate, i.e. where both single-source candidates are true.
    ext_sup_variable_label {
      dependencies: ["ext_sup_use_orderaware", "ext_sup_ngram"]
      style: "sql_extractor"
      sql: """

        UPDATE variable v SET label = null;

        UPDATE  variable v
        SET     label = 3
        FROM    candidate c
        WHERE   c.docid = v.docid
          AND   c.variable_id = v.variable_id
          AND   c.source IN ('CT', 'TC')
          AND   c.label = true;

        UPDATE  variable v
        SET     label = 2
        FROM    candidate c
        WHERE   c.docid = v.docid
          AND   c.variable_id = v.variable_id
          AND   c.source = 'C'
          AND   c.label = true;

        UPDATE  variable v
        SET     label = 1
        FROM    candidate c
        WHERE   c.docid = v.docid
          AND   c.variable_id = v.variable_id
          AND   c.source = 'T'
          AND   c.label = true
          AND   v.label is null;

        UPDATE  variable v
        SET     label = 0
        WHERE   label is null 
        AND NOT EXISTS (
          SELECT * FROM candidate 
          WHERE candidate.variable_id = v.variable_id
          AND   candidate.docid = v.docid
          AND   (candidate.label != false OR candidate.label is null)
        );


        UPDATE  variable v
        SET     label = null
        FROM    candidate c
        WHERE   c.docid = v.docid
          AND   c.variable_id = v.variable_id
          AND   c.source = 'T'
          AND   c.label = true
          AND   v.label = 2;
      """  # further disambiguation is not necessary: it is done in the supervision step
    }


    # # Deprecated since DD's manual holdout is enabled
    # ext_sup_holdout_labels {
    #   style: "sql_extractor"
    #   dependencies: ["ext_sup_1gram", "ext_sup_1gram_2gram", "ext_sup_use_orderaware"]
    #   sql: """UPDATE candidate 
    #       SET label = null 
    #       WHERE docid IN (select docid from eval_docs);"""
    #   # after: ${APP_HOME}"/udf/after_sup_holdout_labels.sh"

    # }

    # Uses the candidate 2-grams to build the helper tables for the
    # Google-Ngram features:
    #   cand_2gram_positive           - cand_2gram rows whose ngram matches a
    #                                   google_2gram_reduced gram with
    #                                   count > 1000 (the count is carried along);
    #   cand_2gram_somepos_candidates - (docid, candidate_id) pairs having at
    #                                   least one word in cand_2gram_positive;
    #   cand_2gram_someneg_candidates - (docid, candidate_id) pairs having at
    #                                   least one word NOT in cand_2gram_positive.
    ext_cand_2gram_feature {
      style: "sql_extractor"
      # cmd: ${APP_HOME}"/udf/after_cand_2gram.sh"
      sql: """
      DROP TABLE IF EXISTS cand_2gram_positive CASCADE;
      DROP TABLE IF EXISTS cand_2gram_somepos_candidates CASCADE;
      DROP TABLE IF EXISTS cand_2gram_someneg_candidates CASCADE;

      CREATE TABLE cand_2gram_positive AS 
        select cand_2gram.*, count 
        from cand_2gram, google_2gram_reduced
      where count > 1000 and ngram = gram
      DISTRIBUTED BY (docid);

      CREATE TABLE cand_2gram_somepos_candidates AS
      select  cand_word.docid, 
              cand_word.candidate_id
      from  cand_2gram_positive t,
            cand_word
      where cand_word.cand_word_id = t.cand_word_id
        AND cand_word.docid = t.docid
      group by cand_word.docid, cand_word.candidate_id
      DISTRIBUTED BY (docid);

      CREATE TABLE cand_2gram_someneg_candidates AS 
      SELECT cand_word.docid, cand_word.candidate_id
      FROM   cand_word
      WHERE NOT EXISTS 
      ( select * 
        from  cand_2gram_positive t
        where t.cand_word_id = cand_word.cand_word_id
        and   t.docid        = cand_word.docid
      )
      group by cand_word.docid, cand_word.candidate_id
      DISTRIBUTED BY (docid);

      """
      dependencies: ["ext_cand_2gram"]
    }

    # Character N-grams, computed per word (no N-grams across word boundaries).
    # N appears in three places: the gram_len column of the query (its 4th
    # column), the argument of the `before` script, and the output relation
    # name. Here N = 1.
    ext_char_1gram {
      style: "plpy_extractor"
      dependencies: ["ext_prepare_document", "ext_prepare_variable", "ext_prepare_candidate"] 
      input: """
        SELECT docid, candidate_id, word, 1 as gram_len
        FROM cand_word
      """
      output_relation: "f_char_1gram"
      udf: ${APP_HOME}"/udf/ext_char_ngram.py"
      before: ${APP_HOME}"/udf/before_ngram.sh char_1"
    }

    # Character 2-grams per cand_word row: same UDF as ext_char_1gram with
    # gram_len = 2; output goes to f_char_2gram.
    ext_char_2gram {
      style: "plpy_extractor"
      dependencies: ["ext_prepare_document", "ext_prepare_variable", "ext_prepare_candidate"] 
      input: """
        SELECT docid, candidate_id, word, 2 as gram_len
        FROM cand_word
      """
      output_relation: "f_char_2gram"
      udf: ${APP_HOME}"/udf/ext_char_ngram.py"
      before: ${APP_HOME}"/udf/before_ngram.sh char_2"
    }

    # ext_pos_3gram {
    #   # TODO write a general script to lable sequences with ngram!!
    #   input: """select 
    #     docid,
    #     array_agg(cand_word_id order by varid, candid, wordid) as arr_cand_word_id,
    #     array_agg(candidate_id order by varid, candid, wordid) as arr_candidate_id,
    #     array_agg(varid order by varid, candid, wordid) as arr_varid,
    #     array_agg(candid order by varid, candid, wordid) as arr_candid,
    #     array_agg(pos order by varid, candid, wordid) as arr_feature
    #     from cand_word
    #     group by docid
    #     """
    #   output_relation: "pos_3gram"
    #   udf: "pypy "${APP_HOME}"/udf/ext_cand_ngram.py 3"
    #   parallelism: ${MAX_PARALLELISM}
    #   before: ${APP_HOME}"/udf/before_pos_3gram.sh"
    #   after: ${APP_HOME}"/udf/after_pos_3gram.sh"
    #   input_batch_size: 1
    # }
  }

  inference.factors: {

    ########## NEW FORMAT FOR INFERENCE RULES #########
    # Weight column
    # ID for each variables
    # label (var) column
    # naming convention: select id from 

    # Multinomial factor over variable.label with one weight per
    # "<source>:<feature name>" string. A factor is produced for every
    # (feature, cand_word) pair that belongs to the variable.
    f_naivefeature {
      # cand_word is matched to its variable by rebuilding the variable_id
      # ("<docid>@<varid>"); the additional docid equalities repeat the same
      # document constraint on each joined table.
      input_query: """
        select
          cand_word.source || ':' || feature.fname as "feature.fname", 
          variable.id as "variable.id",
          variable.label as "variable.label"
        from    feature, cand_word, variable
        where   feature.cand_word_id = cand_word.cand_word_id
          AND   cand_word.docid || '@' || cand_word.varid = variable.variable_id
          AND   cand_word.docid = variable.docid
          AND   cand_word.docid = feature.docid

        """
      function: "Multinomial(variable.label)"
      weight: "?(feature.fname)"
    }

    # Multinomial factor over variable.label with one weight per
    # "<source>:<pos>" string, built from the `pos` column of each cand_word
    # row of the variable.
    f_nlp_pos {
      input_query: """
        SELECT
          cand_word.source || ':' || pos as pos, 
          variable.id as "variable.id",
          variable.label as "variable.label"
        from  cand_word, variable
        where cand_word.docid || '@' || cand_word.varid = variable.variable_id
        AND   cand_word.docid = variable.docid
        """
      function: "Multinomial(variable.label)"
      weight: "?(pos)"
    }

# TODO
    # f_nlp_pos_2gram {
    # }

    # Multinomial factor over variable.label with one weight per
    # "<source>:<ner>" string, built from the `ner` column of each cand_word
    # row of the variable.
    f_nlp_ner {
      input_query: """
        SELECT
          cand_word.source || ':' || ner as ner, 
          variable.id as "variable.id",
          variable.label as "variable.label"
        from  cand_word, variable
        where cand_word.docid || '@' || cand_word.varid = variable.variable_id
        AND   cand_word.docid = variable.docid
        """
      function: "Multinomial(variable.label)"
      weight: "?(ner)"
    }

    # Per-source bias: a Multinomial factor over variable.label for every
    # candidate of the variable, with one weight per candidate.source value.
    # This is the only rule currently enabled in pipeline.pipelines.main.
    f_ocr_bias {
      input_query: """
        select 
          candidate.source as source,
          variable.id as "variable.id",
          variable.label as "variable.label"
        from candidate, variable
        WHERE candidate.docid =  variable.docid
          AND candidate.variable_id =  variable.variable_id
        """
      function: "Multinomial(variable.label)"
      weight: "?(source)"
    }

    # f_nlp_pos {
    #   input_query: """
    #     select 
    #       candidate.source as "candidate.source",
    #       candidate.id as "candidate.id",
    #       candidate.label as "candidate.label"
    #     from candidate
    #     """
    #   function: "IsTrue(candidate.label)"
    #   weight: "?(candidate.source)"
    # }

    # Pairwise factor over two different candidates of the same variable:
    # And(c1.label, c2.label) with a single learned weight.
    # NOTE(review): candidate.label is not declared under schema.variables in
    # this file, and this rule is commented out in pipeline.pipelines.main.
    f_constraint {
      # Cannot reverse natural join order: id MUST BE cand_label.id!!
      # select c1.id as "candidate.c1.id", 
      #   c2.id as "candidate.c2.id", 

      # The result is already distinct; note that each unordered pair appears
      # twice, once as (c1, c2) and once as (c2, c1).
      input_query: """
        select 
          c1.id as "candidate.c1.id", 
          c2.id as "candidate.c2.id", 
          c1.label as "candidate.c1.label", 
          c2.label as "candidate.c2.label"
        from candidate as c1, candidate as c2
        where c1.docid = c2.docid
          AND c1.variable_id = c2.variable_id
          and c1.candidate_id != c2.candidate_id
        """
      # Earlier formulation with a fixed weight, kept for reference:
      # function: "Imply(candidate.c1.label, !candidate.c2.label)"
      # # weight: "?"
      # weight: 30
      function: "And(candidate.c1.label, candidate.c2.label)"
      weight: "?"
      # weight: -20
    }

    # f_agree_bonus {
    #   # Different source, same output: both correct. 
    #     More agree, more correct
    # }

    # Positive evidence: one IsTrue(candidate.label) factor for every word of
    # the candidate that appears in google_1gram with count > 1000. All such
    # factors share a single learned weight.
    # NOTE(review): candidate.label is not declared under schema.variables in
    # this file.
    f_1gram_pos {
      input_query: """
        select 
          candidate.id as "candidate.id",
          candidate.label as "candidate.label"
        FROM candidate, cand_word, google_1gram
        WHERE google_1gram.gram = cand_word.word
        and google_1gram.count > 1000
        and cand_word.candidate_id = candidate.candidate_id
        and cand_word.docid = candidate.docid
        """
      function: "IsTrue(candidate.label)"
      weight: "?"
    }

    # Negative evidence: one IsTrue(!candidate.label) factor for every word of
    # the candidate that is absent from google_1gram or has count <= 1000
    # there. All such factors share a single learned weight.
    # NOTE(review): candidate.label is not declared under schema.variables in
    # this file.
    f_1gram_neg {
      input_query: """
        select 
          candidate.id as "candidate.id",
          candidate.label as "candidate.label"
        FROM candidate, cand_word
        where not exists
        (select * from google_1gram 
          where count > 1000 
          and cand_word.word = gram
          )
        and cand_word.candidate_id = candidate.candidate_id
        and cand_word.docid = candidate.docid
        """
      function: "IsTrue(!candidate.label)"
      weight: "?"
    }

    ############### 2gram features #################
    # One IsTrue(candidate.label) factor per row of cand_2gram_positive that
    # belongs to the candidate; the weight is tied by the base-10 logarithm of
    # the matched count, cast to an integer.
    # NOTE(review): candidate.label is not declared under schema.variables in
    # this file.
    f_2gram_each {
      input_query: """
        SELECT  candidate.id as "candidate.id",
                candidate.label as "candidate.label",
                log(count)::int as logcount
        FROM    cand_2gram_positive t,
                candidate
        WHERE   t.candidate_id = candidate.candidate_id
          AND   t.docid = candidate.docid
      """
      function: "IsTrue(candidate.label)"
      weight: "?(logcount)"
    }
    # Candidates that do NOT appear in cand_2gram_someneg_candidates, i.e.
    # every word of the candidate is covered by cand_2gram_positive:
    # IsTrue(candidate.label) with a single learned weight.
    f_2gram_allpos {
      input_query: """
        select
          candidate.id as "candidate.id",
          candidate.label as "candidate.label"
        from candidate
        where not exists
        (select * from cand_2gram_someneg_candidates t
          where candidate.candidate_id = t.candidate_id
          and candidate.docid = t.docid)
        """
      function: "IsTrue(candidate.label)"
      weight: "?"
    }
    # Candidates listed in cand_2gram_someneg_candidates (at least one word
    # without a frequent 2-gram): IsTrue(!candidate.label) with a single
    # learned weight.
    f_2gram_someneg {
      input_query: """
        select
          candidate.id as "candidate.id",
          candidate.label as "candidate.label"
        from candidate, cand_2gram_someneg_candidates t
        where candidate.candidate_id = t.candidate_id
        and candidate.docid = t.docid
        """
      function: "IsTrue(!candidate.label)"
      weight: "?"
    }
    # Candidates that do NOT appear in cand_2gram_somepos_candidates, i.e. no
    # word of the candidate is covered by cand_2gram_positive:
    # IsTrue(!candidate.label) with a single learned weight.
    f_2gram_allneg {
      input_query: """
        select
          candidate.id as "candidate.id",
          candidate.label as "candidate.label"
        from candidate
        where not exists
        (select * from cand_2gram_somepos_candidates t
          where candidate.candidate_id = t.candidate_id
            and candidate.docid = t.docid)
        """
      function: "IsTrue(!candidate.label)" # the learned weight is expected to be positive
      weight: "?"
    }
    # Candidates listed in cand_2gram_somepos_candidates (at least one word
    # with a frequent 2-gram): IsTrue(candidate.label) with a single learned
    # weight.
    f_2gram_somepos {
      input_query: """
        select
          candidate.id as "candidate.id",
          candidate.label as "candidate.label"
        from candidate, cand_2gram_somepos_candidates t
        where candidate.candidate_id = t.candidate_id
        and candidate.docid = t.docid
        """
      function: "IsTrue(candidate.label)"
      weight: "?"
    }

    # Character 1-gram feature: one IsTrue(candidate.label) factor per row of
    # f_char_1gram that belongs to the candidate, with one weight per
    # "<ngram>-<source>" string.
    # NOTE(review): `ngram` and `source` are unqualified; `source` is
    # presumably candidate.source and `ngram` a column of f_char_1gram —
    # confirm against the output schema of ext_char_1gram.
    f_char_1gram {
      input_query: """
        SELECT  c.id AS "candidate.id",
                c.label AS "candidate.label",
                ngram || '-' || source AS feature
        FROM    candidate c, f_char_1gram f
        WHERE   c.docid = f.docid
        AND     c.candidate_id = f.candidate_id
        """
      function: "IsTrue(candidate.label)"
      weight: "?(feature)"
    }

  }

  # Manual holdout: every variable that belongs to an evaluation document
  # (a docid listed in eval_docs) is withheld from learning.
  calibration {
    # holdout_fraction: ${CALI_FRACTION}
    # The ids must come from the `variable` table: variable.label is the only
    # variable declared under schema.variables, and the Multinomial factors
    # are all defined over variable.id.
    holdout_query: """
      INSERT INTO dd_graph_variables_holdout(variable_id) 
      SELECT id 
      FROM   variable 
      WHERE  docid IN (select docid from eval_docs);"""
  }

  # Sampler binary (path is relative, so it depends on the working directory)
  # and the mode argument passed to it.
  sampler.sampler_cmd: "util/sampler-dw-linux gibbs"

  # Alternative sampler build on a shared filesystem:
  # sampler.sampler_cmd: "/dfs/rulk/0/czhang/dm2/dimmwitted/dw gibbs"
  # sampler.sampler_args: "-l 300 -s 1 -i 300" # Only for May 27 bestball

  # sampler.sampler_args: "-l 100 -s 10 -i 100 -t 1"

  # Previously tried settings are kept commented out around the active line.
  # NOTE(review): flag meanings are not defined in this file; presumably
  # -l / -i are learning / inference iteration counts, -s samples per
  # iteration, and -a (--alpha) the learning rate — confirm with the sampler's
  # help output.
  # sampler.sampler_args: "-l 500 -s 1 -i 1000 --alpha 0.1 -d 0.98" # Which better?
  sampler.sampler_args: "-l 300 -s 1 -i 500 -a 0.01"
  
  # sampler.sampler_args: "-l 300 -s 1 -i 500 -a 0.1" # Which better?
  # A small learning rate was chosen to try to learn correct weights.

  # Pipeline that runs nothing.
  pipeline.pipelines.empty: []

  # Pipeline that only computes the three "best pick" evaluation label
  # relations (all sources, T-containing sources, C-containing sources).
  # The document preparation/holdout steps are commented out, so eval_docs
  # must already exist when this pipeline runs.
  pipeline.pipelines.bestpick: [
    # "ext_prepare_document", "ext_holdout_document",
    "ext_eval_orderaware_bestpick",
    "ext_eval_orderaware_bestpick_tess",
    "ext_eval_orderaware_bestpick_cuni",
    # "f_constraint",
    ]

  #####################################################
  ################## MAIN PIPELINE ####################
  #####################################################
  # Entries are toggled by commenting them in or out. As checked in, the only
  # active entry is the inference rule "f_ocr_bias"; every extractor is
  # disabled, so all tables it reads must already exist in the database.
  pipeline.pipelines.main: [

    ###############################
    ########## EXTRACTORS #########
    ###############################

    ############## PREPROCESSING #################
    ### "ext_prepare_variable",
    ### "ext_prepare_candidate",
    ### "ext_prepare_document","ext_holdout_document",

    ############## SUPERVISION   #################

    # Ngram supervision
    ### "ext_prepare_supv_ngram", # prepare ngram supervision
    ### "ext_cand_ngram",         # prepare N-gram candidate (long)
    # "ext_sup_ngram",          # Calculate supervision

    #---------- Order-aware supervision --------
    ### "ext_sup_orderaware",
    # "ext_sup_use_orderaware",

    ########### After cand supervision, update variable labels #####
    # "ext_sup_variable_label",

    ################## FEATURE ####################

    ### "ext_naivefeature",

    ###### Domain-corpus Ngram Feature ######
    # # PREPROCESS ALL DOCUMENTS (Run only once!)
    # # "ext_prepare_domain_1gram",
    # # "ext_prepare_domain_2gram",
    # # "ext_prepare_domain_3gram",

    # "ext_domain_ngram",  # Exclude this testset

    #########################################

    # ngram features
    # "ext_cand_2gram",
    # "ext_cand_2gram_feature",

    # "ext_char_1gram",

    # ###############################
    # ####### INFERENCE RULES #######
    # ###############################
    # "f_naivefeature",
    # "f_constraint",
    # "f_nlp_pos",
    # "f_nlp_ner",
    "f_ocr_bias",
    # "f_1gram_pos",
    # "f_1gram_neg",
    # "f_2gram_somepos",
    # "f_2gram_allneg",
    # "f_2gram_someneg", # same AS somepos....
    # "f_2gram_allpos",
    # "f_2gram_each", # new one

    # "f_char_1gram",
    ]
  
  # Name of the pipeline (a key under pipeline.pipelines) to execute.
  # pipeline.run: "orderaware" 
  pipeline.run: "main"


  # inference.skip_learning: true

  # pipeline.relearn_from: /lfs/madmax/0/zifei/deepdive/out/2014-05-28T154541
}

# Regex to select all rules:  "    ext|f_.*\{"

# Time:
# 02:31:27 [profiler] INFO  ext_prepare_candidate SUCCESS [7261 ms]
# 02:31:27 [profiler] INFO  ext_prepare_document SUCCESS [8506 ms]
# 02:31:27 [profiler] INFO  ext_naivefeature SUCCESS [63002 ms]
# 02:31:27 [profiler] INFO  ext_holdout_document SUCCESS [54690 ms]
# 02:31:27 [profiler] INFO  ext_cand_ngram SUCCESS [129908 ms]
# 02:31:27 [profiler] INFO  ext_sup_orderaware SUCCESS [1768944 ms]
# 02:31:28 [profiler] INFO  ext_cand_2gram SUCCESS [1868929 ms]
# 02:31:28 [profiler] INFO  ext_sup_use_orderaware SUCCESS [48940 ms]
# 02:31:28 [profiler] INFO  ext_cand_2gram_feature SUCCESS [5936 ms]
# 02:31:28 [profiler] INFO  ext_prepare_supv_ngram SUCCESS [1830603 ms]