Skip to content
This repository was archived by the owner on Apr 23, 2025. It is now read-only.

[WordSeg] Add inline documentation #566

Merged
merged 35 commits into from
Jun 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
3f7a6f0
Add documentation
texasmichelle May 27, 2020
2d42404
Lint
texasmichelle May 27, 2020
bdf71a9
Add dataset files
texasmichelle May 27, 2020
d44b5bf
Add lattice
texasmichelle May 28, 2020
cf86c8c
Add semiring
texasmichelle May 28, 2020
8c3812e
Add SNLM
texasmichelle May 28, 2020
d36a20c
Add bullets for throws
texasmichelle May 28, 2020
8a8b513
Merge remote-tracking branch 'upstream/master' into wordseg_docs
texasmichelle Jun 1, 2020
951a98b
Rename WordSegRecord to Phrase
texasmichelle Jun 2, 2020
4de6e7d
Update CMakeLists
texasmichelle Jun 2, 2020
060e88b
Clarify more summaries.
texasmichelle Jun 2, 2020
04c58bb
Lint
texasmichelle Jun 2, 2020
079c47f
Add blank lines
texasmichelle Jun 2, 2020
711b3c8
Clarify more summaries.
texasmichelle Jun 2, 2020
31fc09b
Lint
texasmichelle Jun 2, 2020
270f2fd
Merge remote-tracking branch 'upstream/master' into wordseg_docs
texasmichelle Jun 2, 2020
e80dfc2
Clarify lattice summary.
texasmichelle Jun 2, 2020
09e82b5
Summary refinement
texasmichelle Jun 2, 2020
ec21c4e
Merge remote-tracking branch 'upstream/master' into wordseg_docs
texasmichelle Jun 4, 2020
d94a45e
Clarify end marker behavior and assumptions
texasmichelle Jun 5, 2020
4ca9ba9
Merge remote-tracking branch 'upstream/master' into wordseg_docs
texasmichelle Jun 9, 2020
6881b58
Rename ReferenceArchive to DownloadableArchive
texasmichelle Jun 9, 2020
c9ffce6
Update Datasets/WordSeg/WordSegDataset.swift
texasmichelle Jun 9, 2020
80e575a
Update Datasets/WordSeg/WordSegDataset.swift
texasmichelle Jun 9, 2020
fb5e5d7
Remove implied text from comments with phrase.
texasmichelle Jun 9, 2020
92b58e9
Remove Foundation string processing
texasmichelle Jun 9, 2020
21fc998
Update Datasets/WordSeg/WordSegDataset.swift
texasmichelle Jun 9, 2020
cc3f30e
Remove variadic arguments in makeAlphabet
texasmichelle Jun 10, 2020
6ddabe7
Merge remote-tracking branch 'upstream/master' into wordseg_docs
texasmichelle Jun 10, 2020
361e380
Rename convertDataset to numericalizeDataset
texasmichelle Jun 11, 2020
d6546ea
Remove raw loop in makeAlphabet
texasmichelle Jun 11, 2020
e942477
Update Datasets/WordSeg/WordSegDataset.swift
texasmichelle Jun 11, 2020
6e0ae15
s/densly/densely/
texasmichelle Jun 11, 2020
ac6436f
Remove hard-coded path
texasmichelle Jun 11, 2020
070d745
Replace `WordSegDataset` with `Self`
texasmichelle Jun 12, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions Benchmarks/Models/WordSeg.swift
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,14 @@ struct WordSegBenchmark: Benchmark {
from: [sentence],
alphabet: dataset.alphabet,
maxLength: maximumSequenceLength,
minFreq: 10
minFrequency: 10
)

let modelParameters = SNLM.Parameters(
ndim: 512,
dropoutProb: 0.5,
chrVocab: dataset.alphabet,
strVocab: lexicon,
hiddenSize: 512,
dropoutProbability: 0.5,
alphabet: dataset.alphabet,
lexicon: lexicon,
order: 5
)

Expand Down
2 changes: 1 addition & 1 deletion Datasets/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ add_library(Datasets
TensorPair.swift
TextUnsupervised/TextUnsupervised.swift
WordSeg/WordSegDataset.swift
WordSeg/WordSegRecord.swift
WordSeg/Phrase.swift
ImageSegmentationDataset.swift
OxfordIIITPets/OxfordIIITPets.swift)
target_link_libraries(Datasets PUBLIC
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,17 @@

import ModelSupport

public struct WordSegRecord {
/// A sequence of text for use in word segmentation.
public struct Phrase {

/// A raw, unprocessed sequence of text.
public let plainText: String

/// A sequence of text in numeric form, derived from `plainText`.
public let numericalizedText: CharacterSequence
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I remember wanting to mention this before, but if “Character” doesn't mean what in Swift is called Character, we should look for other names, e.g. “glyph.”

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a start, I created #600 for using Character instead of String.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't exactly address this, but heads in the direction of cleaning up that design overall.


/// Creates an instance containing both raw (`plainText`) and processed
/// (`numericalizedText`) forms of a sequence of text.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've always thought “numericalized” read terribly awkwardizedly. I get the impression this is a term of art, but we should discuss whether it's the best choice.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created #598

public init(plainText: String, numericalizedText: CharacterSequence) {
self.plainText = plainText
self.numericalizedText = numericalizedText
Expand Down
212 changes: 110 additions & 102 deletions Datasets/WordSeg/WordSegDataset.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,110 +15,129 @@
import Foundation
import ModelSupport

/// A dataset targeted at the problem of word segmentation.
///
/// The reference archive was published in the paper "Learning to Discover,
/// Ground, and Use Words with Segmental Neural Language Models" by Kazuya
/// Kawakami, Chris Dyer, and Phil Blunsom:
/// https://www.aclweb.org/anthology/P19-1645.pdf.
public struct WordSegDataset {
public let training: [WordSegRecord]
public private(set) var testing: [WordSegRecord]?
public private(set) var validation: [WordSegRecord]?

/// The training data.
public let trainingPhrases: [Phrase]

/// The test data.
public private(set) var testingPhrases: [Phrase]

/// The validation data.
public private(set) var validationPhrases: [Phrase]

/// A mapping between characters used in the dataset and densely-packed integers.
public let alphabet: Alphabet

private struct DownloadDetails {
var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")!
var archiveFileName = "seg"
var archiveExtension = "zip"
var testingFilePath = "br/br-text/te.txt"
var trainingFilePath = "br/br-text/tr.txt"
var validationFilePath = "br/br-text/va.txt"
}
/// A pointer to source data.
private struct DownloadableArchive {

private static func load(data: Data) throws -> [String] {
guard let contents: String = String(data: data, encoding: .utf8) else {
throw CharacterErrors.nonUtf8Data
}
return load(contents: contents)
}
/// A [web resource](https://en.wikipedia.org/wiki/Web_resource) that can be unpacked
/// into data files described by other properties of `self`.
let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")!

private static func load(contents: String) -> [String] {
var strings = [String]()
/// The path to the test data within the unpacked archive.
let testingFilePath = "br/br-text/te.txt"

for line in contents.components(separatedBy: .newlines) {
let trimmed = line.trimmingCharacters(in: .whitespaces)
if trimmed.isEmpty { continue }
strings.append(trimmed)
}
return strings
/// The path to the training data within the unpacked archive.
let trainingFilePath = "br/br-text/tr.txt"

/// The path to the validation data within the unpacked archive.
let validationFilePath = "br/br-text/va.txt"
}

/// Returns the non-empty lines of `data`, decoded as UTF-8.
///
/// Lines are delimited by any newline character, so files that use `\n`,
/// `\r\n`, or `\r` line endings are all split into individual phrases. Swift
/// treats `"\r\n"` as a single `Character` that does not compare equal to
/// `"\n"`, so splitting on `"\n"` alone would leave a CRLF file as one
/// unbroken phrase.
///
/// - Parameter data: the raw bytes of a dataset file. Invalid UTF-8 sequences
///   are repaired with the Unicode replacement character while decoding.
/// - Returns: the phrases in `data`, in their original order, with empty lines
///   omitted.
private static func load(data: Data) -> [Substring] {
  let contents = String(decoding: data, as: Unicode.UTF8.self)
  return contents.split(
    omittingEmptySubsequences: true,
    whereSeparator: { $0.isNewline })
}

/// Returns the union of all characters in `phrases`.
///
/// - Parameter eos: the end of sequence marker.
/// - Parameter eow: the end of word marker.
/// - Parameter pad: the padding marker.
private static func makeAlphabet(
datasets training: [String],
_ otherSequences: [String]?...,
phrases: [Substring],
eos: String = "</s>",
eow: String = "</w>",
pad: String = "</pad>"
) -> Alphabet {
var letters: Set<Character> = []

for dataset in otherSequences + [training] {
guard let dataset = dataset else { continue }
for sentence in dataset {
for character in sentence {
if !character.isWhitespace { letters.insert(character) }
}
}
}
let letters = Set(phrases.joined().lazy.filter { !$0.isWhitespace })

// Sort the letters to make it easier to interpret ints vs letters.
var sorted = Array(letters)
sorted.sort()
let sorted = Array(letters).sorted()

return Alphabet(sorted, eos: eos, eow: eow, pad: pad)
}

private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws
-> [WordSegRecord]
{
return try dataset.map {
let trimmed = $0.components(separatedBy: .whitespaces).joined()
return try WordSegRecord(
plainText: $0,
numericalizedText: CharacterSequence(
alphabet: alphabet, appendingEoSTo: trimmed))
}
}
private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws
-> [WordSegRecord]?
/// Numericalizes `dataset` with the mapping in `alphabet`, to be used with the
/// WordSeg model.
///
/// - Note: Omits any phrase that cannot be converted to `CharacterSequence`.
private static func numericalizeDataset(_ dataset: [Substring], alphabet: Alphabet)
-> [Phrase]
{
if let ds = dataset {
let tmp: [WordSegRecord] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function
return tmp
var phrases = [Phrase]()

for data in dataset {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dabrahams Same question here about how to compose this in a way that removes the raw loop?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How could I use compactMap here if I need to include the original text? If I want to base my inclusion on whether CharacterSequence() results in nil, how can I include the original text in composing Phrase?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like a symptom of awkward design. I believe rethinking CharacterSequence will make these acrobatics unnecessary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

    // untested of course
    return dataset.compactMap { data in
      let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined()
      let numericalizedText = try? CharacterSequence(alphabet: alphabet, appendingEoSTo: trimmed)
      return numericalizedText.map { Phrase(plainText: String(data), numericalizedText: $0) }
    }

@texasmichelle

let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined()
guard
let numericalizedText = try? CharacterSequence(
alphabet: alphabet, appendingEoSTo: trimmed)
else { continue }
let phrase = Phrase(
plainText: String(data),
numericalizedText: numericalizedText)
phrases.append(phrase)
}
return nil

return phrases
}

/// Creates an instance containing phrases from the reference archive.
///
/// - Throws: an error in the Cocoa domain, if the default training,
/// validation, or testing file cannot be read.
public init() throws {
let downloadDetails = DownloadDetails()
let source = DownloadableArchive()
let localStorageDirectory: URL = DatasetUtilities.defaultDirectory
.appendingPathComponent("WordSeg", isDirectory: true)

WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, downloadDetails: downloadDetails)
Self.downloadIfNotPresent(
to: localStorageDirectory, source: source)

let archiveFileName = source.location.deletingPathExtension().lastPathComponent
let archiveDirectory =
localStorageDirectory
.appendingPathComponent(downloadDetails.archiveFileName)
.appendingPathComponent(archiveFileName)
let trainingFilePath =
archiveDirectory
.appendingPathComponent(downloadDetails.trainingFilePath).path
.appendingPathComponent(source.trainingFilePath).path
let validationFilePath =
archiveDirectory
.appendingPathComponent(downloadDetails.validationFilePath).path
.appendingPathComponent(source.validationFilePath).path
let testingFilePath =
archiveDirectory
.appendingPathComponent(downloadDetails.testingFilePath).path
.appendingPathComponent(source.testingFilePath).path

try self.init(
training: trainingFilePath, validation: validationFilePath,
testing: testingFilePath)
}

/// Creates an instance containing phrases from `trainingFile`, and
/// optionally `validationFile` and `testingFile`.
///
/// - Throws: an error in the Cocoa domain, if `trainingFile`, or a
/// supplied `validationFile` or `testingFile`, cannot be read.
public init(
training trainingFile: String,
validation validationFile: String? = nil,
Expand All @@ -127,53 +146,38 @@ public struct WordSegDataset {
let trainingData = try Data(
contentsOf: URL(fileURLWithPath: trainingFile),
options: .alwaysMapped)
let training = try Self.load(data: trainingData)

var validation: [String]? = nil
var testing: [String]? = nil
let validationData = try Data(
contentsOf: URL(fileURLWithPath: validationFile ?? "/dev/null"),
options: .alwaysMapped)

if let validationFile = validationFile {
let data = try Data(
contentsOf: URL(fileURLWithPath: validationFile),
options: .alwaysMapped)
validation = try Self.load(data: data)
}
let testingData = try Data(
contentsOf: URL(fileURLWithPath: testingFile ?? "/dev/null"),
options: .alwaysMapped)

if let testingFile = testingFile {
let data: Data = try Data(
contentsOf: URL(fileURLWithPath: testingFile),
options: .alwaysMapped)
testing = try Self.load(data: data)
}
self.alphabet = Self.makeAlphabet(datasets: training, validation, testing)
self.training = try Self.convertDataset(training, alphabet: self.alphabet)
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet)
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet)
self.init(
training: trainingData, validation: validationData, testing: testingData)
}

/// Creates an instance containing phrases from `trainingData`, and
/// optionally `validationData` and `testingData`.
public init(
training trainingData: Data, validation validationData: Data?, testing testingData: Data?
)
throws
{
let training = try Self.load(data: trainingData)
var validation: [String]? = nil
var testing: [String]? = nil
if let validationData = validationData {
validation = try Self.load(data: validationData)
}
if let testingData = testingData {
testing = try Self.load(data: testingData)
}

self.alphabet = Self.makeAlphabet(datasets: training, validation, testing)
self.training = try Self.convertDataset(training, alphabet: self.alphabet)
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet)
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet)
) {
let training = Self.load(data: trainingData)
let validation = Self.load(data: validationData ?? Data())
let testing = Self.load(data: testingData ?? Data())

self.alphabet = Self.makeAlphabet(phrases: training + validation + testing)
self.trainingPhrases = Self.numericalizeDataset(training, alphabet: self.alphabet)
self.validationPhrases = Self.numericalizeDataset(validation, alphabet: self.alphabet)
self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet)
}

/// Downloads and unpacks `source` to `directory` if it does not
/// exist locally.
private static func downloadIfNotPresent(
to directory: URL, downloadDetails: DownloadDetails
to directory: URL, source: DownloadableArchive
) {
let downloadPath = directory.path
let directoryExists = FileManager.default.fileExists(atPath: downloadPath)
Expand All @@ -182,11 +186,15 @@ public struct WordSegDataset {

guard !directoryExists || directoryEmpty else { return }

let remoteRoot = source.location.deletingLastPathComponent()
let filename = source.location.deletingPathExtension().lastPathComponent
let fileExtension = source.location.pathExtension

// Downloads and extracts dataset files.
let _ = DatasetUtilities.downloadResource(
filename: downloadDetails.archiveFileName,
fileExtension: downloadDetails.archiveExtension,
remoteRoot: downloadDetails.archiveLocation,
filename: filename,
fileExtension: fileExtension,
remoteRoot: remoteRoot,
localStorageDirectory: directory, extract: true)
}
}
Loading