Skip to content

Commit 83716e1

Browse files
committed
Updates to Scala stuff
1 parent 04259a5 commit 83716e1

7 files changed

+66
-40
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ data/*
88
.idea
99

1010
project/
11-
.DS_Store
11+
.DS_Store
12+
target/

lamdamart.py

-27
This file was deleted.

src/main/scala/CandidateGenerator.scala

+3-1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ class CandidateGenerator(dataset: Dataset, backgroundData: BackgroundData) {
4848
mutable.LinkedHashMap.empty
4949

5050
def generateCandidates(synthetic: Int = 0) = {
51+
candidateHash.clear()
52+
synthethicCandidateHash.clear()
53+
5154
println("Now generating candidates: ")
5255
val candidateFile = new File(
5356
candidate_file.replace(".txt", s"_${synthetic}.txt"))
@@ -115,7 +118,6 @@ class CandidateGenerator(dataset: Dataset, backgroundData: BackgroundData) {
115118

116119
}
117120

118-
println(MRR.result())
119121
bufferedWriter.close()
120122
}
121123

src/main/scala/Dataset.scala

+6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
import scala.io.Source
2+
object Dataset {
3+
val TEST = "test.txt"
4+
val BACKGROUND = "background.txt"
5+
val VALIDATION = "validation.txt"
6+
val TRAINING = "training.txt"
7+
}
28
class Dataset(val filename: String) {
39
def queries = Source.fromFile(filename).getLines.drop(0)
410
}

src/main/scala/Feature.scala

+8
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import java.io.BufferedWriter
2+
13
case class FeatureVec(freq: Int,
24
prefix: String,
35
lenPrefix: Int,
@@ -19,6 +21,8 @@ object Feature {
1921
private lazy val background = new BackgroundData()
2022
private var currentGroup = ""
2123
private var currentGroupId = 0
24+
private var currentGroupSize = 0
25+
var groupWriter: BufferedWriter = null
2226

2327
def computeFeatureVec(candidate: Candidate): FeatureVec = {
2428
val freq =
@@ -51,8 +55,12 @@ object Feature {
5155

5256
def writeFeature(feature: FeatureVec): String = {
5357
if (currentGroup != feature.prefix) {
58+
if (currentGroupSize != 0) {
59+
groupWriter.write(s"$currentGroupSize\n")
60+
}
5461
currentGroup = feature.prefix
5562
currentGroupId += 1
63+
currentGroupSize = 0
5664
}
5765
val builder = new StringBuilder()
5866
builder.append(feature.relevant + " ")

src/main/scala/FeatureGenerator.scala

+9
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,25 @@ import scala.io.Source
55
class FeatureGenerator(filename: String) {
66

77
private val writeFilename = filename.replace(".txt", "_features.txt")
8+
private val writeGroupname = filename.replace(".txt", "_groups.txt")
89

910
def generateAndWriteFeatures(): Unit = {
1011
val featureFile = new File(writeFilename)
12+
val groupFile = new File(writeGroupname)
1113

1214
if (!featureFile.exists()) featureFile.createNewFile()
15+
if (!groupFile.exists()) groupFile.createNewFile()
1316

1417
var i = 0
1518
val size = candidates.size
1619

1720
val bufferedWriter: BufferedWriter = new BufferedWriter(
1821
new FileWriter(writeFilename))
1922

23+
val groupWriter: BufferedWriter = new BufferedWriter(new FileWriter(writeGroupname))
24+
25+
Feature.groupWriter = groupWriter
26+
2027
for (rawC <- candidates) {
2128
i += 1
2229
if (i % 100000 == 0) {
@@ -28,6 +35,8 @@ class FeatureGenerator(filename: String) {
2835
bufferedWriter.write(Feature.candidate2FeatureVec(rawC))
2936
}
3037

38+
groupWriter.flush()
39+
groupWriter.close()
3140
bufferedWriter.flush()
3241
bufferedWriter.close()
3342

src/main/scala/Main.scala

+38-11
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,50 @@ import cats.implicits._
33
object Main {
44

55
def main(args: Array[String]): Unit = {
6-
// val candidateGenerator =
7-
// new CandidateGenerator(new Dataset("validation.txt"),
8-
// new BackgroundData())
9-
// candidateGenerator.generateCandidates(Synthetic.NO_SYNTHETIC)
10-
//
11-
// val mpc = new MPC("validation_candidates_0.txt")
12-
// mpc.computeMPC()
6+
// Generate all candidates. Note: this takes quite a while.
7+
genAllCandidates(Dataset.TEST)
8+
// genAllCandidates(Dataset.VALIDATION)
9+
// genAllCandidates(Dataset.TRAINING)
10+
11+
12+
// Compute MPC for the test candidates WITHOUT synthetic candidates. We use this as the baseline.
13+
val mpc = new MPC("test_candidates_0.txt")
14+
mpc.computeMPC()
15+
16+
// This probably takes a bit of time.
17+
genAllFeatures(Dataset.TEST)
18+
// genAllFeatures(Dataset.TRAINING)
19+
// genAllFeatures(Dataset.VALIDATION)
20+
21+
1322
//
14-
// val featureGenerator = new FeatureGenerator("validation_candidates_0.txt")
15-
// featureGenerator.generateAndWriteFeatures()
1623
// val lambda = LambdaMart.trainModel("train_candidates_0_features.txt",
1724
// "validation_candidates_0_features.txt",
1825
// "model.txt")
26+
//
27+
// LambdaMart.evaluateModel("./model/xgb.model",
28+
// "test_candidates_0_features.txt")
29+
30+
}
31+
32+
def genAllCandidates(file: String): Unit = {
33+
val candidateGenerator =
34+
new CandidateGenerator(new Dataset(file),
35+
new BackgroundData())
36+
candidateGenerator.generateCandidates(Synthetic.NO_SYNTHETIC)
37+
candidateGenerator.generateCandidates(Synthetic.SYNTHETIC_10K)
38+
candidateGenerator.generateCandidates(Synthetic.SYNTHETIC_100K)
39+
}
40+
41+
def genAllFeatures(file: String): Unit = {
42+
val featureGenerator = new FeatureGenerator(s"${file.replace(".txt", "")}_candidates_0.txt")
43+
featureGenerator.generateAndWriteFeatures()
1944

20-
LambdaMart.evaluateModel("./model/xgb.model",
21-
"test_candidates_0_features.txt")
45+
val featureGenerator2 = new FeatureGenerator(s"${file.replace(".txt", "")}_candidates_1.txt")
46+
featureGenerator2.generateAndWriteFeatures()
2247

48+
val featureGenerator3 = new FeatureGenerator(s"${file.replace(".txt", "")}_candidates_2.txt")
49+
featureGenerator3.generateAndWriteFeatures()
2350
}
2451

2552
}

0 commit comments

Comments
 (0)