@@ -78,35 +78,125 @@ private:
78
78
79
79
template <typename T>
80
80
kj::Promise<kj::Array<T>> read (ReadOption option = ReadOption::NONE) {
81
+ // There are a few complexities in this operation that make it difficult to completely
82
+ // optimize. The most important is that even if a stream reports an expected length
83
+ // using tryGetLength, we really don't know how much data the stream will produce until
84
+ // we try to read it. The only signal we have that the stream is done producing data
85
+ // is a zero-length result from tryRead. Unfortunately, we have to allocate a buffer
86
+ // in advance of calling tryRead so we have to guess a bit at the size of the buffer
87
+ // to allocate.
88
+ //
89
+ // In the previous implementation of this method, we would just blindly allocate a
90
+ // 4096 byte buffer on every allocation, limiting each read iteration to a maximum
91
+ // of 4096 bytes. This works fine for streams producing a small amount of data but
92
+ // risks requiring a greater number of loop iterations and small allocations for streams
93
+ // that produce larger amounts of data. Also in the previous implementation, every
94
+ // loop iteration would allocate a new buffer regardless of how much of the previous
95
+ // allocation was actually used -- so a stream that produces only 4000 bytes total
96
+ // but only provides 10 bytes per iteration would end up with 400 reads and 400 4096
97
+ // byte allocations. Doh! Fortunately our stream implementations tend to be a bit
98
+ // smarter than that but it's still a worst case possibility that it's likely better
99
+ // to avoid.
100
+ //
101
+ // So this implementation does things a bit differently.
102
+ // First, we check to see if the stream can give an estimate on how much data it
103
+ // expects to produce. If that length is within a given threshold, then best case
104
+ // is we can perform the entire read with at most two allocations and two calls to
105
+ // tryRead. The first allocation will be for the entire expected size of the stream,
106
+ // which the first tryRead will attempt to fulfill completely. In the best case the
107
+ // stream provides all of the data. The next allocation would be smaller and would
108
+ // end up resulting in a zero-length read signaling that we are done. Hooray!
109
+ //
110
+ // Not everything can be best case scenario tho, unfortunately. If our first tryRead
111
+ // does not fully consume the stream or fully fill the destination buffer, we're
112
+ // going to need to try again. It is possible that the new allocation in the next
113
+ // iteration will be wasted if the stream doesn't have any more data so it's important
114
+ // for us to try to be conservative with the allocation. If the running total of data
115
+ // we've seen so far is equal to or greater than the expected total length of the stream,
116
+ // then the most likely case is that the next read will be zero-length -- but unfortunately
117
+ // we can't know for sure! So for this we will fall back to a more conservative allocation
118
+ // which is either MIN_BUFFER_CHUNK (1024 bytes) or the calculated amountToRead, whichever is the lower number.
119
+
81
120
kj::Vector<kj::Array<T>> parts;
82
121
uint64_t runningTotal = 0 ;
83
- static constexpr size_t DEFAULT_BUFFER_CHUNK = 4096 ;
84
- static constexpr size_t MAX_BUFFER_CHUNK = DEFAULT_BUFFER_CHUNK * 4 ;
122
+ static constexpr uint64_t MIN_BUFFER_CHUNK = 1024 ;
123
+ static constexpr uint64_t DEFAULT_BUFFER_CHUNK = 4096 ;
124
+ static constexpr uint64_t MAX_BUFFER_CHUNK = DEFAULT_BUFFER_CHUNK * 4 ;
85
125
86
126
// If we know in advance how much data we'll be reading, then we can attempt to
87
127
// optimize the loop here by setting the value specifically so we are only
88
- // allocating once. But, to be safe, let's enforce an upper bound on each allocation
89
- // even if we do know the total.
90
- size_t amountToRead = kj::min (MAX_BUFFER_CHUNK,
91
- input.tryGetLength (StreamEncoding::IDENTITY).orDefault (DEFAULT_BUFFER_CHUNK));
92
-
128
+ // allocating at most twice. But, to be safe, let's enforce an upper bound on each
129
+ // allocation even if we do know the total.
130
+ kj::Maybe<uint64_t > maybeLength = input.tryGetLength (StreamEncoding::IDENTITY);
131
+
132
+ // The amountToRead is the regular allocation size we'll use right up until we've
133
+ // read the number of expected bytes (if known). This number is calculated as the
134
+ // minimum of (limit, MAX_BUFFER_CHUNK, maybeLength or DEFAULT_BUFFER_CHUNK). In
135
+ // the best case scenario, this number is calculated such that we can read the
136
+ // entire stream in one go if the amount of data is small enough and the stream
137
+ // is well behaved.
138
+ // If the stream does report a length, once we've read that number of bytes, we'll
139
+ // fall back to the smaller, more conservative allocation size.
140
+ uint64_t amountToRead = kj::min (limit,
141
+ kj::min (MAX_BUFFER_CHUNK,
142
+ maybeLength.orDefault (DEFAULT_BUFFER_CHUNK)));
143
+ // amountToRead can be zero if the stream reported a zero-length. While the stream could
144
+ // be lying about its length, let's skip reading anything in this case.
93
145
if (amountToRead > 0 ) {
94
146
for (;;) {
95
- // TODO(perf): We can likely further optimize this loop by checking to see
96
- // how much of the buffer was filled and using the remaining buffer space if
97
- // it is not completely filled by the previous iteration. Doing so makes this
98
- // loop a bit more complicated tho, so for now let's keep things simple.
99
147
auto bytes = kj::heapArray<T>(amountToRead);
100
- size_t amount = co_await input.tryRead (bytes.begin (), 1 , bytes.size ());
148
+ // Note that we're passing amountToRead as the *minBytes* here so the tryRead should
149
+ // attempt to fill the entire buffer. If it doesn't, the implication is that we read
150
+ // everything.
151
+ uint64_t amount = co_await input.tryRead (bytes.begin (), amountToRead, amountToRead);
152
+ KJ_DASSERT (amount <= amountToRead);
101
153
102
- if (amount == 0 ) {
154
+ runningTotal += amount;
155
+ JSG_REQUIRE (runningTotal < limit, TypeError, " Memory limit exceeded before EOF." );
156
+
157
+ if (amount < amountToRead) {
158
+ // The stream has indicated that we're all done by returning a value less than the
159
+ // full buffer length.
160
+ // It is possible/likely that at least some amount of data was written to the buffer.
161
+ // In which case we want to add that subset to the parts list here before we exit
162
+ // the loop.
163
+ if (amount > 0 ) {
164
+ parts.add (bytes.slice (0 , amount).attach (kj::mv (bytes)));
165
+ }
103
166
break ;
104
167
}
105
168
106
- runningTotal += amount;
107
- JSG_REQUIRE (runningTotal < limit, TypeError, " Memory limit exceeded before EOF." );
108
- parts.add (bytes.slice (0 , amount).attach (kj::mv (bytes)));
109
- };
169
+ // Because we specify minSize equal to maxSize in the tryRead above, we should only
170
+ // get here if the buffer was completely filled by the read. If it wasn't completely
171
+ // filled, that is an indication that the stream is complete which is handled above.
172
+ KJ_DASSERT (amount == bytes.size ());
173
+ parts.add (kj::mv (bytes));
174
+
175
+ // If the stream provided an expected length and our running total is equal to
176
+ // or greater than that length then we assume we're done.
177
+ KJ_IF_SOME (length, maybeLength) {
178
+ if (runningTotal >= length) {
179
+ // We've read everything we expect to read but some streams need to be read
180
+ // completely in order to properly finish and other streams might lie (although
181
+ // they shouldn't). Sigh. So we're going to make the next allocation potentially
182
+ // smaller and keep reading until we get a zero length. In the best case, the next
183
+ // read is going to be zero length but we have to try which will require at least
184
+ // one additional (potentially wasted) allocation. (If we don't there are multiple
185
+ // test failures).
186
+ amountToRead = kj::min (MIN_BUFFER_CHUNK, amountToRead);
187
+ continue ;
188
+ }
189
+ }
190
+ }
191
+ }
192
+
193
+ KJ_IF_SOME (length, maybeLength) {
194
+ if (runningTotal > length) {
195
+ // Realistically runningTotal should never be more than length so we'll emit
196
+ // a warning if it is just so we know. It would be indicative of a bug somewhere
197
+ // in the implementation.
198
+ KJ_LOG (WARNING, " ReadableStream provided more data than advertised" , runningTotal, length);
199
+ }
110
200
}
111
201
112
202
if (option == ReadOption::NULL_TERMINATE) {
0 commit comments