Batcher.swift

// Copyright 2019 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import TensorFlow

/// A collection that splits a dataset into batches.
public class Batcher<SourceDataSet: RandomAccessCollection>
where SourceDataSet.Element: Collatable {
  public typealias Element = SourceDataSet.Element
  /// The dataset to get the batches from.
  public var dataset: SourceDataSet
  /// The size of each batch.
  public var batchSize: Int
  /// Optionally set a limit to the number of threads used.
  public var threadsLimit: Int? = nil
  /// If `true`, drop the last batch if it has fewer elements than `batchSize`.
  public var droppingLast: Bool = false
  /// The order in which elements of `dataset` are drawn to form batches.
  var indices: [Int]

  public init(
    on dataset: SourceDataSet,
    batchSize: Int,
    threadsLimit: Int? = nil,
    droppingLast: Bool = false
  ) {
    self.dataset = dataset
    self.batchSize = batchSize
    self.threadsLimit = threadsLimit
    self.droppingLast = droppingLast
    // `indices` must have a value before `sampleIndices(on:)` can be called on `self`.
    indices = []
    indices = sampleIndices(on: dataset)
  }

  /// Returns the indices `0..<dataset.count`, in order.
  ///
  /// - Note: Subclass and override this function to customize how indices are
  ///   sampled.
  func sampleIndices(on dataset: SourceDataSet) -> [Int] {
    return Array(0..<dataset.count)
  }

  /// Returns `indices` shuffled.
  ///
  /// - Note: Subclass and override this function to customize how indices are
  ///   shuffled.
  func shuffleIndices(on dataset: inout SourceDataSet, indices: [Int]) -> [Int] {
    return indices.shuffled()
  }

  /// Returns `samples`, unmodified.
  ///
  /// - Note: Subclass and override this function to customize how samples are
  ///   padded.
  func padSamples(samples: [SourceDataSet.Element]) -> [SourceDataSet.Element] {
    return samples
  }
}
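
// The following sketch is not part of the original file: it illustrates the
// customization hooks by overriding `sampleIndices(on:)` to visit the dataset in a
// random order. The name `ShuffledBatcher` is hypothetical, and the override only
// compiles from within the same module because the hooks above are internal.
public class ShuffledBatcher<SourceDataSet: RandomAccessCollection>: Batcher<SourceDataSet>
where SourceDataSet.Element: Collatable {
  /// Draws the sample indices in a random order instead of sequentially.
  override func sampleIndices(on dataset: SourceDataSet) -> [Int] {
    return Array(0..<dataset.count).shuffled()
  }
}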

extension Batcher: Collection {
  public typealias Index = Int
  public var startIndex: Int { return 0 }
  public var endIndex: Int {
    // Number of full batches, plus one partial batch unless it is empty or dropped.
    let n = dataset.count
    return n / batchSize + (n % batchSize == 0 || droppingLast ? 0 : 1)
  }
  public func index(after i: Int) -> Int { i + 1 }

  /// Accesses the `i`-th batch.
  public subscript(i: Int) -> Element {
    get {
      let start = i * batchSize
      let end = Swift.min(start + batchSize, dataset.count)
      return withDevice(.cpu) { () -> Element in
        // Per-thread chunk size for `concurrentMap`: 1 when unlimited, otherwise
        // large enough to cap the number of threads at `threadsLimit`.
        let n = threadsLimit == nil ? 1 : (end - start) / threadsLimit!
        let samples = Array(start..<end).concurrentMap(minBatchSize: n) {
          dataset[dataset.index(dataset.startIndex, offsetBy: indices[$0])]
        }
        return Element(collating: padSamples(samples: samples))
      }
    }
  }
}
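
// Illustrative usage, not part of the original file. It assumes `Tensor` is given a
// `Collatable` conformance elsewhere in this package (collating by stacking samples
// along a new leading axis); if not, substitute any conforming element type.
let samples = (0..<100).map { _ in Tensor<Float>(randomNormal: [28, 28]) }
let batcher = Batcher(on: samples, batchSize: 32, droppingLast: true)
for batch in batcher {
  // With 100 samples, a batch size of 32, and `droppingLast == true`, this loop
  // yields 3 batches; under the stacking assumption each has shape [32, 28, 28].
  print(batch.shape)
}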