tensorflow · texasmichelle · Jun 12, 2020 · May 27, 2020 · May 27, 2020 · May 27, 2020
diff --git a/Benchmarks/Models/WordSeg.swift b/Benchmarks/Models/WordSeg.swift
@@ -106,14 +106,14 @@ struct WordSegBenchmark: Benchmark {
               from: [sentence],
               alphabet: dataset.alphabet,
               maxLength: maximumSequenceLength,
-              minFreq: 10
+              minFrequency: 10
             )
 
             let modelParameters = SNLM.Parameters(
-              ndim: 512,
-              dropoutProb: 0.5,
-              chrVocab: dataset.alphabet,
-              strVocab: lexicon,
+              hiddenSize: 512,
+              dropoutProbability: 0.5,
+              alphabet: dataset.alphabet,
+              lexicon: lexicon,
               order: 5
             )
 

diff --git a/Datasets/CMakeLists.txt b/Datasets/CMakeLists.txt
@@ -21,7 +21,7 @@ add_library(Datasets
   TensorPair.swift
   TextUnsupervised/TextUnsupervised.swift
   WordSeg/WordSegDataset.swift
-  WordSeg/WordSegRecord.swift
+  WordSeg/Phrase.swift
   ImageSegmentationDataset.swift
   OxfordIIITPets/OxfordIIITPets.swift)
 target_link_libraries(Datasets PUBLIC

diff --git a/Datasets/WordSeg/WordSegRecord.swift → Datasets/WordSeg/Phrase.swift b/Datasets/WordSeg/WordSegRecord.swift → Datasets/WordSeg/Phrase.swift
@@ -14,10 +14,17 @@
 
 import ModelSupport
 
-public struct WordSegRecord {
+/// A sequence of text for use in word segmentation.
+public struct Phrase {
+
+  /// A raw, unprocessed sequence of text.
   public let plainText: String
+
+  /// A sequence of text in numeric form, derived from `plainText`.
   public let numericalizedText: CharacterSequence
 
+  /// Creates an instance containing both raw (`plainText`) and processed
+  /// (`numericalizedText`) forms of a sequence of text.
   public init(plainText: String, numericalizedText: CharacterSequence) {
     self.plainText = plainText
     self.numericalizedText = numericalizedText

diff --git a/Datasets/WordSeg/WordSegDataset.swift b/Datasets/WordSeg/WordSegDataset.swift
@@ -15,110 +15,129 @@
 import Foundation
 import ModelSupport
 
+/// A dataset targeted at the problem of word segmentation.
+///
+/// The reference archive was published in the paper "Learning to Discover,
+/// Ground, and Use Words with Segmental Neural Language Models" by Kazuya
+/// Kawakami, Chris Dyer, and Phil Blunsom:
+/// https://www.aclweb.org/anthology/P19-1645.pdf.
 public struct WordSegDataset {
-  public let training: [WordSegRecord]
-  public private(set) var testing: [WordSegRecord]?
-  public private(set) var validation: [WordSegRecord]?
+
+  /// The training data.
+  public let trainingPhrases: [Phrase]
+
+  /// The test data.
+  public private(set) var testingPhrases: [Phrase]
+
+  /// The validation data.
+  public private(set) var validationPhrases: [Phrase]
+
+  /// A mapping between characters used in the dataset and densely-packed integers.
   public let alphabet: Alphabet
 
-  private struct DownloadDetails {
-    var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")!
-    var archiveFileName = "seg"
-    var archiveExtension = "zip"
-    var testingFilePath = "br/br-text/te.txt"
-    var trainingFilePath = "br/br-text/tr.txt"
-    var validationFilePath = "br/br-text/va.txt"
-  }
+  /// A pointer to source data.
+  private struct DownloadableArchive {
 
-  private static func load(data: Data) throws -> [String] {
-    guard let contents: String = String(data: data, encoding: .utf8) else {
-      throw CharacterErrors.nonUtf8Data
-    }
-    return load(contents: contents)
-  }
+    /// A [web resource](https://en.wikipedia.org/wiki/Web_resource) that can be unpacked
+    /// into data files described by other properties of `self`. 
+    let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")!
 
-  private static func load(contents: String) -> [String] {
-    var strings = [String]()
+    /// The path to the test data within the unpacked archive.
+    let testingFilePath = "br/br-text/te.txt"
 
-    for line in contents.components(separatedBy: .newlines) {
-      let trimmed = line.trimmingCharacters(in: .whitespaces)
-      if trimmed.isEmpty { continue }
-      strings.append(trimmed)
-    }
-    return strings
+    /// The path to the training data within the unpacked archive.
+    let trainingFilePath = "br/br-text/tr.txt"
+
+    /// The path to the validation data within the unpacked archive.
+    let validationFilePath = "br/br-text/va.txt"
+  }
+
+  /// Returns the non-empty, newline-separated phrases in `data`, decoded as UTF-8.
+  private static func load(data: Data) -> [Substring] {
+    let contents = String(decoding: data, as: Unicode.UTF8.self)
+    let splitContents = contents.split(separator: "\n", omittingEmptySubsequences: true)
+    return splitContents
   }
 
+  /// Returns an alphabet of every non-whitespace character in `phrases`.
+  ///
+  /// - Parameter eos: the end of sequence marker.
+  /// - Parameter eow: the end of word marker.
+  /// - Parameter pad: the padding marker.
   private static func makeAlphabet(
-    datasets training: [String],
-    _ otherSequences: [String]?...,
+    phrases: [Substring],
     eos: String = "</s>",
     eow: String = "</w>",
     pad: String = "</pad>"
   ) -> Alphabet {
-    var letters: Set<Character> = []
-
-    for dataset in otherSequences + [training] {
-      guard let dataset = dataset else { continue }
-      for sentence in dataset {
-        for character in sentence {
-          if !character.isWhitespace { letters.insert(character) }
-        }
-      }
-    }
+    let letters = Set(phrases.joined().lazy.filter { !$0.isWhitespace })
 
     // Sort the letters to make it easier to interpret ints vs letters.
-    var sorted = Array(letters)
-    sorted.sort()
+    let sorted = Array(letters).sorted()
 
     return Alphabet(sorted, eos: eos, eow: eow, pad: pad)
   }
 
-  private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws
-    -> [WordSegRecord]
-  {
-    return try dataset.map {
-      let trimmed = $0.components(separatedBy: .whitespaces).joined()
-      return try WordSegRecord(
-        plainText: $0,
-        numericalizedText: CharacterSequence(
-          alphabet: alphabet, appendingEoSTo: trimmed))
-    }
-  }
-  private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws
-    -> [WordSegRecord]?
+  /// Numericalizes `dataset` with the mapping in `alphabet`, to be used with the
+  /// WordSeg model.
+  ///
+  /// - Note: Omits any phrase that cannot be converted to `CharacterSequence`.
+  private static func numericalizeDataset(_ dataset: [Substring], alphabet: Alphabet)
+    -> [Phrase]
   {
-    if let ds = dataset {
-      let tmp: [WordSegRecord] = try convertDataset(ds, alphabet: alphabet)  // Use tmp to disambiguate function
-      return tmp
+    var phrases = [Phrase]()
+
+    for data in dataset {
+      let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined()
+      guard
+        let numericalizedText = try? CharacterSequence(
+          alphabet: alphabet, appendingEoSTo: trimmed)
+      else { continue }
+      let phrase = Phrase(
+        plainText: String(data),
+        numericalizedText: numericalizedText)
+      phrases.append(phrase)
     }
-    return nil
+
+    return phrases
   }
 
+  /// Creates an instance containing phrases from the reference archive.
+  ///
+  /// - Throws: an error in the Cocoa domain, if any of the default training,
+  ///   validation, or testing files cannot be read.
   public init() throws {
-    let downloadDetails = DownloadDetails()
+    let source = DownloadableArchive()
     let localStorageDirectory: URL = DatasetUtilities.defaultDirectory
       .appendingPathComponent("WordSeg", isDirectory: true)
 
-    WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, downloadDetails: downloadDetails)
+    Self.downloadIfNotPresent(
+      to: localStorageDirectory, source: source)
 
+    let archiveFileName = source.location.deletingPathExtension().lastPathComponent
     let archiveDirectory =
       localStorageDirectory
-      .appendingPathComponent(downloadDetails.archiveFileName)
+      .appendingPathComponent(archiveFileName)
     let trainingFilePath =
       archiveDirectory
-      .appendingPathComponent(downloadDetails.trainingFilePath).path
+      .appendingPathComponent(source.trainingFilePath).path
     let validationFilePath =
       archiveDirectory
-      .appendingPathComponent(downloadDetails.validationFilePath).path
+      .appendingPathComponent(source.validationFilePath).path
     let testingFilePath =
       archiveDirectory
-      .appendingPathComponent(downloadDetails.testingFilePath).path
+      .appendingPathComponent(source.testingFilePath).path
 
     try self.init(
       training: trainingFilePath, validation: validationFilePath,
       testing: testingFilePath)
   }
 
+  /// Creates an instance containing phrases from `trainingFile`, and
+  /// optionally `validationFile` and `testingFile`.
+  ///
+  /// - Throws: an error in the Cocoa domain, if `trainingFile`, or a provided
+  ///   `validationFile` or `testingFile`, cannot be read.
   public init(
     training trainingFile: String,
     validation validationFile: String? = nil,
@@ -127,53 +146,38 @@ public struct WordSegDataset {
     let trainingData = try Data(
       contentsOf: URL(fileURLWithPath: trainingFile),
       options: .alwaysMapped)
-    let training = try Self.load(data: trainingData)
 
-    var validation: [String]? = nil
-    var testing: [String]? = nil
+    let validationData = try Data(
+      contentsOf: URL(fileURLWithPath: validationFile ?? "/dev/null"),
+      options: .alwaysMapped)
 
-    if let validationFile = validationFile {
-      let data = try Data(
-        contentsOf: URL(fileURLWithPath: validationFile),
-        options: .alwaysMapped)
-      validation = try Self.load(data: data)
-    }
+    let testingData = try Data(
+      contentsOf: URL(fileURLWithPath: testingFile ?? "/dev/null"),
+      options: .alwaysMapped)
 
-    if let testingFile = testingFile {
-      let data: Data = try Data(
-        contentsOf: URL(fileURLWithPath: testingFile),
-        options: .alwaysMapped)
-      testing = try Self.load(data: data)
-    }
-    self.alphabet = Self.makeAlphabet(datasets: training, validation, testing)
-    self.training = try Self.convertDataset(training, alphabet: self.alphabet)
-    self.validation = try Self.convertDataset(validation, alphabet: self.alphabet)
-    self.testing = try Self.convertDataset(testing, alphabet: self.alphabet)
+    self.init(
+      training: trainingData, validation: validationData, testing: testingData)
   }
 
+  /// Creates an instance containing phrases from `trainingData`, and
+  /// optionally `validationData` and `testingData`.
   public init(
     training trainingData: Data, validation validationData: Data?, testing testingData: Data?
-  )
-    throws
-  {
-    let training = try Self.load(data: trainingData)
-    var validation: [String]? = nil
-    var testing: [String]? = nil
-    if let validationData = validationData {
-      validation = try Self.load(data: validationData)
-    }
-    if let testingData = testingData {
-      testing = try Self.load(data: testingData)
-    }
-
-    self.alphabet = Self.makeAlphabet(datasets: training, validation, testing)
-    self.training = try Self.convertDataset(training, alphabet: self.alphabet)
-    self.validation = try Self.convertDataset(validation, alphabet: self.alphabet)
-    self.testing = try Self.convertDataset(testing, alphabet: self.alphabet)
+  ) {
+    let training = Self.load(data: trainingData)
+    let validation = Self.load(data: validationData ?? Data())
+    let testing = Self.load(data: testingData ?? Data())
+
+    self.alphabet = Self.makeAlphabet(phrases: training + validation + testing)
+    self.trainingPhrases = Self.numericalizeDataset(training, alphabet: self.alphabet)
+    self.validationPhrases = Self.numericalizeDataset(validation, alphabet: self.alphabet)
+    self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet)
   }
 
+  /// Downloads and unpacks `source` to `directory` if it does not
+  /// exist locally.
   private static func downloadIfNotPresent(
-    to directory: URL, downloadDetails: DownloadDetails
+    to directory: URL, source: DownloadableArchive
   ) {
     let downloadPath = directory.path
     let directoryExists = FileManager.default.fileExists(atPath: downloadPath)
@@ -182,11 +186,15 @@ public struct WordSegDataset {
 
     guard !directoryExists || directoryEmpty else { return }
 
+    let remoteRoot = source.location.deletingLastPathComponent()
+    let filename = source.location.deletingPathExtension().lastPathComponent
+    let fileExtension = source.location.pathExtension
+
     // Downloads and extracts dataset files.
     let _ = DatasetUtilities.downloadResource(
-      filename: downloadDetails.archiveFileName,
-      fileExtension: downloadDetails.archiveExtension,
-      remoteRoot: downloadDetails.archiveLocation,
+      filename: filename,
+      fileExtension: fileExtension,
+      remoteRoot: remoteRoot,
       localStorageDirectory: directory, extract: true)
   }
 }