-
Notifications
You must be signed in to change notification settings - Fork 149
[WordSeg] Add inline documentation #566
Changes from all commits
3f7a6f0
2d42404
bdf71a9
d44b5bf
cf86c8c
8c3812e
d36a20c
8a8b513
951a98b
4de6e7d
060e88b
04c58bb
079c47f
711b3c8
31fc09b
270f2fd
e80dfc2
09e82b5
ec21c4e
d94a45e
4ca9ba9
6881b58
c9ffce6
80e575a
fb5e5d7
92b58e9
21fc998
cc3f30e
6ddabe7
361e380
d6546ea
e942477
6e0ae15
ac6436f
070d745
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,10 +14,17 @@ | |
|
||
import ModelSupport | ||
|
||
public struct WordSegRecord { | ||
/// A sequence of text for use in word segmentation. | ||
public struct Phrase { | ||
|
||
/// A raw, unprocessed sequence of text. | ||
public let plainText: String | ||
|
||
/// A sequence of text in numeric form, derived from `plainText`. | ||
public let numericalizedText: CharacterSequence | ||
|
||
/// Creates an instance containing both raw (`plainText`) and processed | ||
/// (`numericalizedText`) forms of a sequence of text. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've always thought “numericalized” read terribly awkwardizedly. I get the impression this is a term of art, but we should discuss whether it's the best choice. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created #598 |
||
public init(plainText: String, numericalizedText: CharacterSequence) { | ||
self.plainText = plainText | ||
self.numericalizedText = numericalizedText | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,110 +15,129 @@ | |
import Foundation | ||
import ModelSupport | ||
|
||
/// A dataset targeted at the problem of word segmentation. | ||
/// | ||
/// The reference archive was published in the paper "Learning to Discover, | ||
/// Ground, and Use Words with Segmental Neural Language Models" by Kazuya | ||
/// Kawakami, Chris Dyer, and Phil Blunsom: | ||
/// https://www.aclweb.org/anthology/P19-1645.pdf. | ||
public struct WordSegDataset { | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public let training: [WordSegRecord] | ||
public private(set) var testing: [WordSegRecord]? | ||
public private(set) var validation: [WordSegRecord]? | ||
|
||
/// The training data. | ||
public let trainingPhrases: [Phrase] | ||
|
||
/// The test data. | ||
public private(set) var testingPhrases: [Phrase] | ||
|
||
/// The validation data. | ||
public private(set) var validationPhrases: [Phrase] | ||
|
||
/// A mapping between characters used in the dataset and densely-packed integers. | ||
public let alphabet: Alphabet | ||
|
||
private struct DownloadDetails { | ||
var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")! | ||
var archiveFileName = "seg" | ||
var archiveExtension = "zip" | ||
var testingFilePath = "br/br-text/te.txt" | ||
var trainingFilePath = "br/br-text/tr.txt" | ||
var validationFilePath = "br/br-text/va.txt" | ||
} | ||
/// A pointer to source data. | ||
texasmichelle marked this conversation as resolved.
Show resolved
Hide resolved
|
||
private struct DownloadableArchive { | ||
|
||
private static func load(data: Data) throws -> [String] { | ||
guard let contents: String = String(data: data, encoding: .utf8) else { | ||
throw CharacterErrors.nonUtf8Data | ||
} | ||
return load(contents: contents) | ||
} | ||
/// A [web resource](https://en.wikipedia.org/wiki/Web_resource) that can be unpacked | ||
/// into data files described by other properties of `self`. | ||
let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")! | ||
|
||
private static func load(contents: String) -> [String] { | ||
var strings = [String]() | ||
/// The path to the test data within the unpacked archive. | ||
let testingFilePath = "br/br-text/te.txt" | ||
|
||
for line in contents.components(separatedBy: .newlines) { | ||
let trimmed = line.trimmingCharacters(in: .whitespaces) | ||
if trimmed.isEmpty { continue } | ||
strings.append(trimmed) | ||
} | ||
return strings | ||
/// The path to the training data within the unpacked archive. | ||
let trainingFilePath = "br/br-text/tr.txt" | ||
|
||
/// The path to the validation data within the unpacked archive. | ||
let validationFilePath = "br/br-text/va.txt" | ||
} | ||
|
||
/// Returns the non-empty, newline-separated phrases in `data`, decoded as UTF-8. | ||
private static func load(data: Data) -> [Substring] { | ||
let contents = String(decoding: data, as: Unicode.UTF8.self) | ||
let splitContents = contents.split(separator: "\n", omittingEmptySubsequences: true) | ||
return splitContents | ||
} | ||
|
||
/// Returns an alphabet containing every non-whitespace character in `phrases`. | ||
/// | ||
/// - Parameter eos: the end of sequence marker. | ||
/// - Parameter eow: the end of word marker. | ||
/// - Parameter pad: the padding marker. | ||
private static func makeAlphabet( | ||
datasets training: [String], | ||
_ otherSequences: [String]?..., | ||
phrases: [Substring], | ||
eos: String = "</s>", | ||
eow: String = "</w>", | ||
pad: String = "</pad>" | ||
) -> Alphabet { | ||
var letters: Set<Character> = [] | ||
|
||
for dataset in otherSequences + [training] { | ||
guard let dataset = dataset else { continue } | ||
for sentence in dataset { | ||
for character in sentence { | ||
if !character.isWhitespace { letters.insert(character) } | ||
} | ||
} | ||
} | ||
let letters = Set(phrases.joined().lazy.filter { !$0.isWhitespace }) | ||
|
||
// Sort the letters to make it easier to interpret ints vs letters. | ||
var sorted = Array(letters) | ||
sorted.sort() | ||
let sorted = Array(letters).sorted() | ||
|
||
return Alphabet(sorted, eos: eos, eow: eow, pad: pad) | ||
} | ||
|
||
private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws | ||
-> [WordSegRecord] | ||
{ | ||
return try dataset.map { | ||
let trimmed = $0.components(separatedBy: .whitespaces).joined() | ||
return try WordSegRecord( | ||
plainText: $0, | ||
numericalizedText: CharacterSequence( | ||
alphabet: alphabet, appendingEoSTo: trimmed)) | ||
} | ||
} | ||
private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws | ||
-> [WordSegRecord]? | ||
/// Numericalizes `dataset` with the mapping in `alphabet`, to be used with the | ||
/// WordSeg model. | ||
/// | ||
/// - Note: Omits any phrase that cannot be converted to `CharacterSequence`. | ||
private static func numericalizeDataset(_ dataset: [Substring], alphabet: Alphabet) | ||
-> [Phrase] | ||
{ | ||
if let ds = dataset { | ||
let tmp: [WordSegRecord] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function | ||
return tmp | ||
var phrases = [Phrase]() | ||
|
||
for data in dataset { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dabrahams Same question here about how to compose this in a way that removes the raw loop? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How could I use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This feels like a symptom of awkward design. I believe rethinking There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. // untested of course
return dataset.compactMap { data in
let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined()
let numericalizedText = try? CharacterSequence(alphabet: alphabet, appendingEoSTo: trimmed)
return numericalizedText.map { Phrase(plainText: String(data), numericalizedText: $0) }
} |
||
let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined() | ||
guard | ||
let numericalizedText = try? CharacterSequence( | ||
alphabet: alphabet, appendingEoSTo: trimmed) | ||
else { continue } | ||
let phrase = Phrase( | ||
plainText: String(data), | ||
numericalizedText: numericalizedText) | ||
phrases.append(phrase) | ||
} | ||
return nil | ||
|
||
return phrases | ||
} | ||
|
||
/// Creates an instance containing phrases from the reference archive. | ||
/// | ||
/// - Throws: an error in the Cocoa domain, if the default training file | ||
/// cannot be read. | ||
public init() throws { | ||
let downloadDetails = DownloadDetails() | ||
let source = DownloadableArchive() | ||
let localStorageDirectory: URL = DatasetUtilities.defaultDirectory | ||
.appendingPathComponent("WordSeg", isDirectory: true) | ||
|
||
WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, downloadDetails: downloadDetails) | ||
Self.downloadIfNotPresent( | ||
to: localStorageDirectory, source: source) | ||
|
||
let archiveFileName = source.location.deletingPathExtension().lastPathComponent | ||
let archiveDirectory = | ||
localStorageDirectory | ||
.appendingPathComponent(downloadDetails.archiveFileName) | ||
.appendingPathComponent(archiveFileName) | ||
let trainingFilePath = | ||
archiveDirectory | ||
.appendingPathComponent(downloadDetails.trainingFilePath).path | ||
.appendingPathComponent(source.trainingFilePath).path | ||
let validationFilePath = | ||
archiveDirectory | ||
.appendingPathComponent(downloadDetails.validationFilePath).path | ||
.appendingPathComponent(source.validationFilePath).path | ||
let testingFilePath = | ||
archiveDirectory | ||
.appendingPathComponent(downloadDetails.testingFilePath).path | ||
.appendingPathComponent(source.testingFilePath).path | ||
|
||
try self.init( | ||
training: trainingFilePath, validation: validationFilePath, | ||
testing: testingFilePath) | ||
} | ||
|
||
/// Creates an instance containing phrases from `trainingFile`, and | ||
/// optionally `validationFile` and `testingFile`. | ||
/// | ||
/// - Throws: an error in the Cocoa domain, if `trainingFile` cannot be | ||
/// read. | ||
public init( | ||
training trainingFile: String, | ||
validation validationFile: String? = nil, | ||
|
@@ -127,53 +146,38 @@ public struct WordSegDataset { | |
let trainingData = try Data( | ||
contentsOf: URL(fileURLWithPath: trainingFile), | ||
options: .alwaysMapped) | ||
let training = try Self.load(data: trainingData) | ||
|
||
var validation: [String]? = nil | ||
var testing: [String]? = nil | ||
let validationData = try Data( | ||
contentsOf: URL(fileURLWithPath: validationFile ?? "/dev/null"), | ||
options: .alwaysMapped) | ||
|
||
if let validationFile = validationFile { | ||
let data = try Data( | ||
contentsOf: URL(fileURLWithPath: validationFile), | ||
options: .alwaysMapped) | ||
validation = try Self.load(data: data) | ||
} | ||
let testingData = try Data( | ||
contentsOf: URL(fileURLWithPath: testingFile ?? "/dev/null"), | ||
options: .alwaysMapped) | ||
|
||
if let testingFile = testingFile { | ||
let data: Data = try Data( | ||
contentsOf: URL(fileURLWithPath: testingFile), | ||
options: .alwaysMapped) | ||
testing = try Self.load(data: data) | ||
} | ||
self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) | ||
self.training = try Self.convertDataset(training, alphabet: self.alphabet) | ||
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) | ||
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) | ||
self.init( | ||
training: trainingData, validation: validationData, testing: testingData) | ||
} | ||
|
||
/// Creates an instance containing phrases from `trainingData`, and | ||
/// optionally `validationData` and `testingData`. | ||
public init( | ||
training trainingData: Data, validation validationData: Data?, testing testingData: Data? | ||
) | ||
throws | ||
{ | ||
let training = try Self.load(data: trainingData) | ||
var validation: [String]? = nil | ||
var testing: [String]? = nil | ||
if let validationData = validationData { | ||
validation = try Self.load(data: validationData) | ||
} | ||
if let testingData = testingData { | ||
testing = try Self.load(data: testingData) | ||
} | ||
|
||
self.alphabet = Self.makeAlphabet(datasets: training, validation, testing) | ||
self.training = try Self.convertDataset(training, alphabet: self.alphabet) | ||
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet) | ||
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet) | ||
) { | ||
let training = Self.load(data: trainingData) | ||
let validation = Self.load(data: validationData ?? Data()) | ||
let testing = Self.load(data: testingData ?? Data()) | ||
|
||
self.alphabet = Self.makeAlphabet(phrases: training + validation + testing) | ||
self.trainingPhrases = Self.numericalizeDataset(training, alphabet: self.alphabet) | ||
self.validationPhrases = Self.numericalizeDataset(validation, alphabet: self.alphabet) | ||
self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet) | ||
} | ||
|
||
/// Downloads and unpacks `source` to `directory` if it does not | ||
/// exist locally. | ||
private static func downloadIfNotPresent( | ||
to directory: URL, downloadDetails: DownloadDetails | ||
to directory: URL, source: DownloadableArchive | ||
) { | ||
let downloadPath = directory.path | ||
let directoryExists = FileManager.default.fileExists(atPath: downloadPath) | ||
|
@@ -182,11 +186,15 @@ public struct WordSegDataset { | |
|
||
guard !directoryExists || directoryEmpty else { return } | ||
|
||
let remoteRoot = source.location.deletingLastPathComponent() | ||
let filename = source.location.deletingPathExtension().lastPathComponent | ||
let fileExtension = source.location.pathExtension | ||
|
||
// Downloads and extracts dataset files. | ||
let _ = DatasetUtilities.downloadResource( | ||
filename: downloadDetails.archiveFileName, | ||
fileExtension: downloadDetails.archiveExtension, | ||
remoteRoot: downloadDetails.archiveLocation, | ||
filename: filename, | ||
fileExtension: fileExtension, | ||
remoteRoot: remoteRoot, | ||
localStorageDirectory: directory, extract: true) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I remember wanting to mention this before, but if “Character” doesn't mean what in Swift is called `Character`, we should look for other names, e.g. “glyph.”
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As a start, I created #600 for using `Character` instead of `String`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't exactly address this, but heads in the direction of cleaning up that design overall.