Flow 2.0.0
Flow project: Full implementation reference.
peer_socket.cpp
1/* Flow
2 * Copyright 2023 Akamai Technologies, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the
5 * "License"); you may not use this file except in
6 * compliance with the License. You may obtain a copy
7 * of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in
12 * writing, software distributed under the License is
13 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
14 * CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing
16 * permissions and limitations under the License. */
17
18/// @file
25#include "flow/async/util.hpp"
26#include <boost/algorithm/string.hpp>
27#include <boost/tuple/tuple.hpp>
28#include <utility>
29
30namespace flow::net_flow
31{
32
33// Implementations.
34
35// Peer_socket implementations.
36
38 util::Task_engine* task_engine,
39 const Peer_socket_options& opts) :
40 Log_context(logger_ptr, Flow_log_component::S_NET_FLOW),
41 m_opts(opts),
42 m_active_connect(false), // Meaningless; set explicitly.
43 m_state(State::S_CLOSED), // Incorrect; set explicitly.
44 m_open_sub_state(Open_sub_state::S_DISCONNECTING), // Incorrect; set explicitly.
45 m_node(0), // Incorrect; set explicitly.
46 m_rcv_buf(logger_ptr, 0), // Receive buffer mode: block size irrelevant (see Socket_buffer doc header).
47 // Send buffer mode: pack data into block-sized chunks for dequeueing speed. See Socket_buffer doc header.
48 m_snd_buf(logger_ptr, max_block_size()),
49 m_serialized_metadata(logger_ptr),
50 m_local_port(S_PORT_ANY), // Incorrect; set explicitly.
51 m_int_state(Int_state::S_CLOSED), // Incorrect; set explicitly.
52 m_rcv_syn_rcvd_data_cumulative_size(0), // Meaningless unless queue has elements but might as well initialize.
53 m_rcv_reassembly_q_data_size(0),
54 m_rcv_pending_acks_size_at_recv_handler_start(0),
55 m_snd_pending_rcv_wnd(0), // Meaningless originally but might as well initialize.
56 m_rcv_last_sent_rcv_wnd(0),
57 m_rcv_in_rcv_wnd_recovery(false),
58 m_rcv_delayed_ack_timer(*task_engine),
59 m_snd_flying_bytes(0),
60 m_snd_last_order_num(0),
61 m_snd_rexmit_q_size(0),
62 m_snd_remote_rcv_wnd(0),
63 m_snd_smoothed_round_trip_time(0),
64 m_round_trip_time_variance(0),
65 m_snd_drop_timeout(0),
66 m_snd_pacing_data(task_engine),
67 m_security_token(0), // Incorrect; set explicitly.
68 m_init_rexmit_count(0)
69{
70 // Only print pointer value, because most members are garbage at this point.
71 FLOW_LOG_TRACE("Peer_socket [" << static_cast<void*>(this) << "] created.");
72
73 // Log initial option values. Arguable if this should be INFO or TRACE. @todo Reconsider?
74 FLOW_LOG_TRACE("\n\n" << options());
75}
76
78{
79 /* Note that m_snd_cong_ctl, m_snd_bandwidth_estimator (etc.) and others store no Ptr(this),
80 * so this dtor will indeed execute (no circular shared_ptr problem). */
81
82 FLOW_LOG_TRACE("Peer_socket [" << this << "] destroyed.");
83}
84
86{
87 Lock_guard lock(m_mutex); // State is liable to change at any time.
88 if (open_sub_state && (m_state == State::S_OPEN))
89 {
90 *open_sub_state = m_open_sub_state;
91 }
92 return m_state;
93}
94
96{
97 Lock_guard lock(m_mutex); // m_node can simultaneously change to 0 if state changes to S_CLOSED.
98 return m_node;
99}
100
102{
103 Lock_guard lock(m_mutex);
104 return m_disconnect_cause;
105}
106
107bool Peer_socket::sync_send(std::nullptr_t, Error_code* err_code)
108{
109 return sync_send(nullptr, Fine_duration::max(), err_code);
110}
111
113{
114 // Similar to sync_send_impl(), so keeping comments light. Reminder: Goal is to wait until *this is Writable.
115
117
118 Lock_guard lock(m_mutex);
119
120 const Function<size_t (size_t)> empty_snd_buf_feed_func;
121 assert(empty_snd_buf_feed_func.empty());
122
123 lock.release();
124
125 // Intentionally pass empty function obj to indicate "reactor pattern" mode.
126 node_sync_send(empty_snd_buf_feed_func, wait_until, err_code);
127 return !*err_code; // Socket is Writable if and only if !*err_code (i.e., no timeout or other error while waiting).
128}
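/* An illustrative caller-side sketch of the "reactor pattern" mode used above (hypothetical user code, not
 * part of this file; it assumes a connected Peer_socket::Ptr `sock`, some buffer `data_to_send`, and the
 * blocking sync_send()/non-blocking send() overloads described in the class doc header):
 *
 *   Error_code err;
 *   // Step 1: block (up to 5 seconds) until *sock is Writable -- but transfer nothing yet.
 *   if (sock->sync_send(nullptr, boost::chrono::seconds(5), &err))
 *   {
 *     // Step 2: *sock is Writable; perform the actual non-blocking send ourselves.
 *     sock->send(boost::asio::buffer(data_to_send), &err);
 *   }
 *   // else: err holds the reason (e.g., timeout) why *sock never became Writable.
 */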
129
130size_t Peer_socket::node_send(const Function<size_t (size_t max_data_size)>& snd_buf_feed_func,
131 Error_code* err_code)
132{
133 // Everything is locked. (See send() template.)
134
135 const Ptr sock = shared_from_this();
136 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
137 {
138 return 0;
139 }
140 // else m_node is valid.
141
142 return m_node->send(sock, snd_buf_feed_func, err_code);
143}
144
145size_t Peer_socket::node_sync_send(const Function<size_t (size_t max_data_size)>& snd_buf_feed_func_or_empty,
146 const Fine_time_pt& wait_until,
147 Error_code* err_code)
148{
149 using boost::adopt_lock;
150
151 // Everything is locked. (See sync_send() template.)
152 Lock_guard lock(m_mutex, adopt_lock); // Adopt already-locked mutex.
153
154 const Ptr sock = shared_from_this();
155 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
156 {
157 return 0;
158 }
159 // else m_node is valid.
160
161 /* Because all Node::sync_*() implementations would follow the same pattern (create Event_set,
162 * add Readable/Writable/Acceptable event, wait, try non-blocking op, if that fails try again with
163 * wait_until ever closer, etc.), for major code reuse we use the sync_op() function template and plug in
164 * the various Peer_socket/send-specific pieces as arguments.
165 *
166 * Performance cost: The only part about this that's not as fast as copy/pasting sync_op() N times, once
167 * for each type of socket/op, is the need to lambda the proper send() call into a function object.
168 * This amounts to storing and copying the arguments and the function pointer, which should not be
169 * too bad and is worth the code reuse IMO. */
170
171 lock.release(); // Again, release lock (mutex is still locked!).
172
173 /* Operating on Peer_sockets, returning size_t; Event_set socket set type is Peer_sockets.
174 * Object is sock; non-blocking operation is m_node->send(...) -- or N/A in "reactor pattern" mode.
175 * size_t(0) is the "would-block" return value for this operation. S_PEER_SOCKET_WRITABLE
176 * is the type of event to watch for here. */
177 return m_node
178 ->sync_op<Peer_socket, size_t>
179 (sock,
180 snd_buf_feed_func_or_empty.empty()
181 ? Function<size_t ()>() // Reactor pattern mode.
182 : Function<size_t ()>([this, sock, snd_buf_feed_func_or_empty, err_code]() -> size_t
183 { return m_node->send(sock, snd_buf_feed_func_or_empty, err_code); }),
185 wait_until, err_code);
186} // Peer_socket::node_sync_send()
187
188bool Peer_socket::sync_receive(std::nullptr_t, Error_code* err_code)
189{
190 return sync_receive(nullptr, Fine_duration::max(), err_code);
191}
192
194{
195 // Similar to sync_receive_impl(), so keeping comments light. Reminder: Goal is to wait until *this is Readable.
196
198
199 Lock_guard lock(m_mutex);
200
201 const Function<size_t ()> empty_rcv_buf_consume_func;
202 assert(empty_rcv_buf_consume_func.empty());
203
204 lock.release();
205
206 // Intentionally pass empty function obj to indicate "reactor pattern" mode.
207 node_sync_receive(empty_rcv_buf_consume_func, wait_until, err_code);
208 return !*err_code; // Socket is Readable if and only if !*err_code (i.e., no timeout or other error while waiting).
209}
210
211size_t Peer_socket::node_receive(const Function<size_t ()>& rcv_buf_consume_func,
212 Error_code* err_code)
213{
214 // Everything is locked. (See receive() template.)
215
216 const Ptr sock = shared_from_this();
217 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
218 {
219 return 0;
220 }
221 // else m_node is valid.
222
223 return m_node->receive(sock, rcv_buf_consume_func, err_code);
224}
225
226size_t Peer_socket::node_sync_receive(const Function<size_t ()>& rcv_buf_consume_func_or_empty,
227 const Fine_time_pt& wait_until,
228 Error_code* err_code)
229{
230 using boost::adopt_lock;
231
232 // Everything is locked. (See sync_send() template.)
233 Lock_guard lock(m_mutex, adopt_lock); // Adopt already-locked mutex.
234
235 const Ptr sock = shared_from_this();
236 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
237 {
238 return 0;
239 }
240 // else m_node is valid.
241
242 lock.release(); // Again, release lock (mutex is still locked!).
243
244 // See comment in Peer_socket::node_sync_send().
245
246 /* Operating on Peer_sockets, returning size_t; Event_set socket set type is Peer_sockets.
247 * Object is sock; non-blocking operation is m_node->receive(...) -- or N/A in "reactor pattern" mode.
248 * size_t(0) is the "would-block" return value for this operation. S_PEER_SOCKET_READABLE
249 * is the type of event to watch for here. */
250 return m_node
251 ->sync_op<Peer_socket, size_t>
252 (sock,
253 rcv_buf_consume_func_or_empty.empty()
254 ? Function<size_t ()>() // Reactor pattern mode.
255 : Function<size_t ()>([this, sock, rcv_buf_consume_func_or_empty, err_code]() -> size_t
256 { return m_node->receive(sock, rcv_buf_consume_func_or_empty, err_code); }),
258 wait_until, err_code);
259} // Peer_socket::node_sync_receive()
260
262{
264 ([this](Error_code* actual_err_code) { close_abruptly(actual_err_code); },
265 err_code, FLOW_UTIL_WHERE_AM_I_STR()))
266 {
267 return;
268 }
269 // else
270
271 // We are in user thread U != W.
272
273 Lock_guard lock(m_mutex); // Lock m_node/m_state; also it's a pre-condition for Node::close_abruptly().
274
275 const Ptr sock = shared_from_this();
276 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
277 {
278 // *err_code will be set to original close reason (m_disconnect_cause) in this case, as advertised.
279 return;
280 }
281 // else m_node is valid.
282
283 // Forward to Node, as is the general pattern for Peer_socket method implementations.
284 lock.release(); // Let go of the mutex (mutex is still LOCKED).
285 m_node->close_abruptly(sock, err_code);
286 // No m_mutex.unlock(): Node::close_abruptly() MUST take care of it.
287} // Peer_socket::close_abruptly()
288
290{
292 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
293
294 // We are in thread U != W.
295
296 Lock_guard lock(m_mutex); // Lock m_node at least.
297
298 const Ptr sock = shared_from_this();
299 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
300 {
301 return false;
302 }
303 // else m_node is valid.
304
305 // As is typical elsewhere, pass the rest of the logic to a Node method.
306 return m_node->sock_set_options(sock, opts, err_code);
307} // Peer_socket::set_options()
308
310{
311 return opt(m_opts);
312}
313
315{
316 // We are in user thread U != W.
317
318 /* There are two cases. If the socket is open (not S_CLOSED), then an m_node owns it and may
319 * change the stats we want to copy in its thread W at any time. In this case we must copy it in
320 * thread W (which we do using a future and post(io_context&), as in listen() and other places in
 321 * Node). If the socket is closed (S_CLOSED), then no m_node owns it, so there is no thread W
322 * applicable to this socket anymore, and we can just copy the data in thread U != W. */
323
324 Lock_guard lock(m_mutex); // Lock m_node; also it's a pre-condition for Node::sock_info().
325
326 const Const_ptr sock = shared_from_this();
327
328 // See which case it is.
329 Error_code dummy;
330 if (!Node::ensure_sock_open(sock, &dummy))
331 {
332 // Socket is closed. Done and done. Return the final stats cached at S_CLOSED time.
333 return m_info_on_close;
334 }
335 // else m_node is valid.
336
337 // Forward to Node, as is the general pattern for Peer_socket method implementations.
338 lock.release(); // Let go of the mutex (mutex is still LOCKED).
339 return m_node->sock_info(sock);
340 // No m_mutex.unlock(): Node::sock_info() MUST take care of it.
341} // Peer_socket::info()
342
344{
346}
347
348size_t Peer_socket::max_block_size_multiple(const size_t& opt_val_ref,
349 const unsigned int* inflate_pct_val_ptr) const
350{
351 // Similar to opt() but specialized for this purpose. Lock once to get both values.
353
355 const unsigned int inflate_pct = inflate_pct_val_ptr ? (*inflate_pct_val_ptr) : 0;
356
357 /* We want N's nearest multiple M of B such that M >= N. M = ceil(N/B) * B (no actual floating-point math involved).
358 *
359 * Oh, and N is opt_val_ref inflated by K%, or opt_val_ref * (100 + K)%. */
360 return util::ceil_div(opt_val_ref * (100 + inflate_pct) / 100, max_block_size)
 361          * max_block_size;
 362}
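/* A worked example of the arithmetic above, with illustrative values: opt_val_ref = 10000, inflate_pct = 20,
 * max_block_size = 1024. Then N = 10000 * 120 / 100 = 12000; ceil_div(12000, 1024) = 12; and the returned
 * multiple is M = 12 * 1024 = 12288, the smallest multiple of the block size that is >= the inflated value. */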
363
365{
366 return opt(m_opts.m_st_rexmit_on);
367}
368
370{
371 // Can't change; no locking needed. Safe info even if S_CLOSED.
372 return m_remote_endpoint;
373}
374
376{
377 // Can't change; no locking needed. Safe (if outdated) info even if S_CLOSED.
378 return m_local_port;
379}
380
381size_t Peer_socket::get_connect_metadata(const boost::asio::mutable_buffer& buffer,
382 Error_code* err_code) const
383{
384 using std::memcpy;
385
387 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
388
389 // We are in user thread U != W.
390
391 Lock_guard lock(m_mutex); // Lock m_serialized_metadata (it can be changed in sock_free_memory()).
392
 393 if (!ensure_open(err_code)) // Ensure it's open; otherwise m_serialized_metadata has been cleared.
394 {
395 return 0;
396 }
397 // else m_serialized_metadata is valid.
398
399 err_code->clear();
400 const size_t size = std::min(m_serialized_metadata.size(), buffer.size());
401 if (size != 0)
402 {
403 memcpy(buffer.data(), m_serialized_metadata.const_data(), size);
404 }
405
406 return size;
407} // Peer_socket::get_connect_metadata()
408
410{
411 return Node::ensure_sock_open(shared_from_this(), err_code);
412}
413
414std::string Peer_socket::bytes_blocks_str(size_t bytes) const
415{
417 using std::flush;
418
419 const auto block = max_block_size();
420 String_ostream os;
421 os.os() << bytes << '~' << (bytes / block);
422 if ((bytes % block) != 0)
423 {
424 os.os() << '+';
425 }
426 os.os() << flush;
427 return os.str();
428}
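/* Example of the resulting format, assuming max_block_size() == 1024: bytes == 2048 yields "2048~2";
 * bytes == 2500 yields "2500~2+", the trailing '+' indicating a partial block beyond the 2 full ones. */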
429
431 boost::shared_ptr<Data_packet> packet,
432 const Sent_when& sent_when) :
433 m_size(packet->m_data.size()),
434 m_sent_when({ sent_when }),
435 m_acks_after_me(0),
436 m_packet(rexmit_on ? packet : boost::shared_ptr<Data_packet>()) // Store packet only if we may have to rexmit later.
437{
438 // Nothing.
439}
440
442 m_size(size),
443 m_data(logger_ptr)
444{
445 if (src_data)
446 {
447 // Retransmission is on: save *src_data for later reassembly.
 448 assert(src_data->size() == size); // As promised in docs....
449
450 m_data = std::move(*src_data); // O(1) operation -- *src_data is probably cleared.
451 }
452}
453
454// Node implementations (dealing with individual Peer_sockets).
455
456// Static initializations.
457
458// Per RFC 5681 (Reno Fast Recovery; used in other congestion control specifications as well to detect drops).
 460const uint8_t Node::S_DEFAULT_CONN_METADATA = 0; // Keep in sync with get_connect_metadata() doc header.
461
462// Implementations.
463
465 Peer_socket::Ptr sock,
466 boost::shared_ptr<const Syn_ack_packet> syn_ack)
467{
468 // We are in thread W.
469
470 /* We'd sent SYN and just got SYN_ACK. Assuming their SYN is valid, our side of connection can
471 * move to ESTABLISHED state. We can also complete the other side's connection by sending
472 * SYN_ACK_ACK. */
473
474 FLOW_LOG_INFO("NetFlow worker thread continuing active-connect of [" << sock << "]. "
475 "Received [" << syn_ack->m_type_ostream_manip << "] with "
476 "ISN [" << syn_ack->m_init_seq_num << "]; "
477 "security token [" << syn_ack->m_packed.m_security_token << "].");
478
479 // Send SYN_ACK_ACK to finish the handshake.
480
481 async_low_lvl_syn_ack_ack_send(sock, syn_ack);
482 /* send will happen asynchronously, and the registered completion handler will execute in this
483 * thread when done (NO SOONER than this method finishes executing). */
484
485 // Handle the logical SYN part of their SYN_ACK.
486
487 // Save the start of the sequence number series based on their initial sequence number.
488 sock->m_rcv_init_seq_num = syn_ack->m_init_seq_num;
489 sock->m_rcv_next_seq_num = sock->m_rcv_init_seq_num + 1;
490
491 // Move ourselves to connected state.
492
493 // Public state.
495 // Internal state. SYN_SENT -> ESTABLISHED.
496 sock_set_int_state(sock, Peer_socket::Int_state::S_ESTABLISHED);
497
498 // Got the acknowledgment to SYN, so cancel retransmits and the timeout for that SYN.
499 cancel_timers(sock);
500
501 // Setup the Drop Timeout engine (m_snd_drop_timer).
502 setup_drop_timer(socket_id, sock);
503
504 // Record initial rcv_wnd; it should be the entire size of the other side's Receive buffer.
505 sock->m_snd_remote_rcv_wnd = syn_ack->m_packed.m_rcv_wnd;
506
507 /* Since sock is now connected and has an empty Send buffer, it is certainly now Writable.
508 * Therefore we should soon inform anyone waiting on any Event_sets for sock to become Writable.
509 *
510 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
511 * code. */
512
513 // Accumulate the event into the Node store (note: not any Event_set yet).
514 if (m_sock_events[Event_set::Event_type::S_PEER_SOCKET_WRITABLE].insert(sock).second)
515 {
516 // Possibly inform the user for any applicable Event_sets right now.
517 event_set_all_check_delta(true);
518 /* ^-- defer_delta_check == true: because the only way to get to this method is from
519 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
520 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
521 }
522} // Node::handle_syn_ack_to_syn_sent()
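/* For orientation, the active-open handshake that this handler completes, in sketch form (local = this side):
 *
 *   local (active-connect)                      remote (passive/listening)
 *     SYN  ---------------------------------->     [we are in S_SYN_SENT]
 *          <----------------------------  SYN_ACK  [handled above; we move to S_ESTABLISHED]
 *     SYN_ACK_ACK  --------------------------->    [sent via async_low_lvl_syn_ack_ack_send()]
 */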
523
525 boost::shared_ptr<const Syn_ack_packet> syn_ack)
526{
527 // We are in thread W.
528
529 /* We're ESTABLISHED but got a duplicate (valid) SYN_ACK again. For reasons explained in
530 * handle_incoming() at the call to the current method, we simply give them a SYN_ACK_ACK again
531 * and continue like nothing happened. */
532
533 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
534 "In [" << Peer_socket::Int_state::S_ESTABLISHED << "] state "
535 "received duplicate [" << syn_ack->m_type_ostream_manip << "] with "
536 "ISN [" << syn_ack->m_init_seq_num << "]; "
537 "security token [" << syn_ack->m_packed.m_security_token << "]. "
538 "Could be from packet loss.");
539
540 // Everything has already been validated.
541
542 async_low_lvl_syn_ack_ack_send(sock, syn_ack);
543} // Node::handle_syn_ack_to_established()
544
546 Peer_socket::Ptr sock,
547 boost::shared_ptr<Data_packet> packet,
548 bool syn_rcvd_qd_packet)
549{
550 /* This is a complex method that does many things. Therefore readability is hard to accomplish, as the logic
551 * makes sense when writing it, but the big picture is hard to see when reading it. The necessary heavy commenting
552 * further increases the size and therefore (along that dimension) decreases readability. For these reasons,
553 * many logically distinct parts were placed into helper methods -- not to increase code reuse but to help
554 * the aforementioned consideration. */
555
556 // We are in thread W.
557
558 /* Connection is open, and we got data from other side. Note: For maintainability, this method features
559 * (and should continue to feature) mainly high-level flow control and method calls, as opposed to tons of lower-level
560 * detail (this should be factored out into methods being called).
561 *
562 * Summary of below (assuming no misbehavior by other side; also ignoring that every action is categorized
563 * in sock->m_rcv_stats for statistical purposes):
564 *
565 * - Determine `dupe` (is packet a duplicate of previously received packet?) by checking against
566 * sock->m_rcv_{next_seq_num|packets_with_gaps}. If so:
567 * - (Op AAA) Acknowledge packet (ACK to other side).
568 * - Return (do not close connection).
569 * - Determine `slide` (are packet's data the next expected [first -- by seq. # -- not-yet-received] data?)
570 * by checking against sock->m_rcv_{next_seq_num|packets_with_gaps}.
571 * - If retransmission is off:
572 * - (Op ###) Pass packet's data to Receive buffer sock->m_rcv_buf!
573 * - Except if that would overflow sock->m_rcv_buf, then return (do not close connection).
574 * - (Op %%%) Inform the event subsystem that Receive buffer is readable!
575 * - (Op AAA)
576 * - If (!slide):
577 * - Save packet info (except packet->m_data itself!) in sock->m_rcv_packets_with_gaps.
578 * - But if that overflows sock->m_rcv_packets_with_gaps, then also pretend
579 * gap before start of sock->m_rcv_packets_with_gaps has all been filled: set `slide = true;`.
580 * (This will cause below to pop sock->m_rcv_packets_with_gaps to not overflow.)
581 * - If `slide`:
582 * - (Op ***) Update sock->m_rcv_{next_seq_num|packets_with_gaps} (increment the former,
583 * possibly pop-front contiguous packets from the other).
584 * - Else, if retransmission is on:
585 * - If `slide`:
586 * - (Op ###)
587 * - (Op ***)
588 * - Plus, for each packet popped from sock->m_rcv_packets_with_gaps, in increasing seq. # order:
589 * Pass packet's data to Receive buffer sock->m_rcv_buf!
590 * - (Op %%%)
591 * - Else if (!slide):
592 * - Save packet info (including packet->m_data itself!) in sock->m_rcv_packets_with_gaps (reassembly queue).
593 * - But if that WOULD overflow sock->m_rcv_packets_with_gaps, then don't;
594 * and return (do not close connection).
595 * - (Op AAA) */
596
597 /* Set up some short-hand references to commonly used sock members. This should also help
598 * performance a little by skipping the shared_ptr dereference. (Should be safe since sock
599 * cannot get ref-counted down to zero in this method, unless there is an error, at which point
600 * we return anyway.) Just remember these are not simply local variables -- nor const references -- but refer
601 * to on-the-heap stuff! */
602 const bool rexmit_on = sock->rexmit_on();
603 const Sequence_number& seq_num = packet->m_seq_num;
604
605 auto& data = packet->m_data; // NOT const, since we may well be _moving_ this into Receive buffer, etc.
606 assert(!data.empty()); // This should have been verified immediately in handle_incoming().
607 // Save this before we possibly destroy `data`'s contents below when _moving_ into Receive buffer, etc.
608 const size_t data_size = data.size();
609
610 // Register one packet with N bytes of data (not necessarily acceptable data).
611 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
612 rcv_stats.total_data_packet(data_size);
613
614 // Before potential changes, log.
615
616 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
617 "Received [" << packet->m_type_ostream_manip << "] with "
618 "sequence number [" << seq_num << "]; data size [" << data_size << "].");
619 // Very verbose and CPU-intensive!
620 FLOW_LOG_DATA("Data [" << util::buffers_dump_string(data.const_buffer(), "", size_t(-1)) << "].");
621 // In below TRACE logging we will omit most of the above details, since they'll be already logged.
622
623 log_rcv_window(sock); // Especially log this state.
624
625 /* Compute `dupe` and `slide[_size]`, bits of info that are key to how the incoming packet fits into the rcv window.
626 * Also, regardless of anything else we need to register N bytes worth of data in DATA packets via
627 * one rcv_stats.<...>_data_packet(data_size); we can determine the <...> based on dupe, slide, or lack thereof. */
628
 629 /* True will mean it's a duplicate packet -- ACK but don't give to the user again.
630 * False will mean it's a new packet -- ACK and save to a buffer for eventual consumption (unless overflow). */
631 bool dupe;
632 // Will mean this packet is the first (by seq. #) unreceived packet we want. Only applies if !dupe.
633 bool slide;
634 /* ^-- @todo Eliminate this; use slide_size == 0 to mean !slide? Less state is a good thing.
635 * Also, slide_size can be assumed to be data_size, except in one case below -- *never* via
636 * sock_categorize_data_to_established(); both of these improvements will lead to cleaner code. */
637 size_t slide_size; // If (slide), this will be how much to increment m_rcv_next_seq_num.
638
639 const Error_code cat_result = sock_categorize_data_to_established(sock, packet, &dupe, &slide, &slide_size);
640 if (cat_result)
641 {
642 // Register one packet with N bytes of data (not acceptable due to error).
643 rcv_stats.error_data_packet(data_size);
644
645 /* Close connection in our structures (inform user if necessary as well). Pre-conditions
646 * assumed by call: sock in m_socks and sock->state() == S_OPEN (yes, since m_int_state ==
647 * S_ESTABLISHED); 3rd arg contains the reason for the close (yes). This will empty the Send
648 * and Receive buffers. That is OK, because this is the abrupt type of close (error). */
649 rst_and_close_connection_immediately(socket_id, sock, cat_result, true);
650 // ^-- defer_delta_check == true: for similar reason as in handle_syn_ack_ack_to_syn_rcvd().
651 return;
652 }
653 // else
654
655 // If we got here, no error so far; `dupe` and `slide` are both set properly.
656
657 if (dupe)
658 {
659 /* It's a duplicate received packet. We should still acknowledge every valid packet, even if
660 * duplicate, since at least it helps the other side measure congestion. Is it "lying," since
661 * we're throwing this dupe away? No, because we DID receive it earlier; and in fact that
662 * earlier packet's ACK packet may have itself gotten lost by the network. (Example: A sends P
 663 * to B; B receives and responds with ACK of P; that's lost; B receives dupe of P and responds
 664 * with ACK; A receives that ACK. Good.) Anyway if the other side doesn't like it, it can just
665 * ignore it.
666 *
667 * It is also important to ack a duplicate packet, if retransmission is enabled. For example,
668 * sender may send packet X, and we'll ack it; but the ACK may be lost. Then the sender will
669 * retransmit X thinking X was lost; if we don't ACK the retransmitted one, the sender will
670 * retransmit again, until it runs out of retransmissions and closes connection... all because
671 * of one lousy lost ACK. */
672
673 // Plenty of TRACE logging about duplicate packets above; and here is probably too verbose for an INFO; => no log.
674
675 // Register one packet with N bytes of data (not acceptable into Receive buffer but probably legal, just late).
676 rcv_stats.late_or_dupe_data_packet(data_size);
677
678 // Register one individual acknowledgment of N bytes of data (will go out but acks late DATA).
679 rcv_stats.late_or_dupe_to_send_ack_packet(data_size);
680
681 // ACK will happen asynchronously (not in this handler, and at best once UDP net-stack considers itself writable).
682 async_acknowledge_packet(sock, seq_num, packet->m_rexmit_id, data_size); // rcv_stats kept inside.
683 return;
684 }
685 // else if (!dupe), i.e. data to be saved in Receive buffer or reassembly queue (unless overflow).
686
687 // Register one packet with N bytes of data (legal and acceptable into Receive buffer).
688 rcv_stats.good_data_packet(data.size());
689
690 /* Behavior is different at this point depending on whether retransmission is enabled or
691 * disabled. Many of the building blocks are the same and have been factored out into helpers. */
692
693 if (!rexmit_on)
694 {
695 /* No retransmission, so things are fairly simple. Firstly any new received data go
696 * straight to Receive buffer (out of order or not). */
697
698 if (!sock_data_to_rcv_buf_unless_overflow(sock, packet))
699 {
700 /* Not so fast. There's no space in the Receive buffer, so there's no choice except to drop the
701 * packet despite all of the above. Note that this means the packet was not "received" (and
702 * we can't slide the window forward either).
703 *
704 * Should we RST/close? Absolutely not. The sender did nothing wrong (except maybe they suck
705 * at detecting congestion caused by our user not reading the Receive buffer fast enough and
706 * thus letting it fill up, or maybe they just suck at congestion control). Our user is not
707 * consuming the Receive buffer in time. We drop packet and let chips fall where they may
708 * (reliability measures will handle it).
709 *
710 * Should we still acknowledge it? No. Dropping a packet at this late stage is still
711 * dropping a packet and indicates congestion of the network, of sorts; if we ACK it, the
712 * other side will assume the packet is being delivered and won't slow down its packet
 713 * onslaught. So nothing else to do. */
714 return;
715 }
716
717 /* DO NOT use `data` from this point forward -- it was just emptied by sock_data_to_rcv_buf_unless_overflow()!
718 * data_size is fine. */
719
720 /* Since sock now has a non-empty Receive buffer, it is certainly now Readable. Handle implications
721 * on relevant waiting Event_sets. */
722 sock_rcv_buf_now_readable(sock, syn_rcvd_qd_packet);
723
724 // Successfully wrote to Receive buffer. Can certainly acknowledge it at this point.
725
726 // Register one individual acknowledgment of N bytes of data (will go out and acks new, acceptable DATA).
727 rcv_stats.good_to_send_ack_packet(data_size);
728
729 // ACK will happen asynchronously (not in this handler, and at best once UDP net-stack considers itself writable).
730 async_acknowledge_packet(sock, seq_num, 0, data_size); // rcv_stats kept inside.
731
732 if (!slide)
733 {
734 /* !slide means new packet didn't resolve the first unreceived gap; hence by definition
735 * sock->m_rcv_packets_with_gaps must be updated. Due to certain overflow mechanisms, this may also
736 * cause the removal of part of the first gap, ironically! So pass in &slide, etc.
737 *
738 * Pass in data_size, since data.size() would run on an emptied `data` as noted above and be useless. */
739 sock_track_new_data_after_gap_rexmit_off(sock, packet, data_size, &slide, &slide_size);
740
741 // `slide` may now be true or not.
742 }
743
744 // `slide` may now be true or not.
745
746 /* Finally, update the window, since we've received a new packet. Maintain large invariant described in doc headers
747 * for Peer_socket::m_rcv_packets_with_gaps and related members. */
748
749 if (slide)
750 {
751 sock_slide_rcv_next_seq_num(sock, slide_size, false);
752 }
753 } // if (!rexmit_on)
754 else // if (rexmit_on)
755 {
756 /* Retransmission is on, so we have to deal with the reassembly queue. Namely if this packet
757 * fills the gap between stuff already given to Receive buffer and the first packet in the
758 * reassembly queue, then we should feed-to-user not just the new packet but also all contiguous packets
759 * at the front of the queue into Receive buffer. If it does not fill it, then we have to add
760 * it to reassembly queue in the proper spot. */
761
762 if (slide)
763 {
764 // New packet filled at least part of the first gap. So we should feed it to Receive buffer.
765
766 if (!sock_data_to_rcv_buf_unless_overflow(sock, packet))
767 {
768 /* Not so fast. If there's no space in the Receive buffer, there's no choice except to drop the
769 * packet despite all of the above. All comments from same spot in the no-retransmission
770 * code above apply (not repeating here). */
771 return;
772 }
773 // else
774
775 /* DO NOT use `data` from this point forward -- it was just emptied by sock_data_to_rcv_buf_unless_overflow().
776 * data_size is fine. */
777
778 /* Now update the receive window structure. Maintain invariants described in doc headers
779 * for m_rcv_packets_with_gaps and related members. Additionally, since retransmission is
780 * on, if the new packet bridged gap to the first packet(s) in the reassembly queue, then
781 * add their data to Receive buffer also (the `true` argument triggers this). */
782
783 sock_slide_rcv_next_seq_num(sock, slide_size, true);
784
 785 /* Since sock now has a non-empty Receive buffer, it is certainly now Readable. Handle implications
786 * on relevant waiting Event_sets. */
787 sock_rcv_buf_now_readable(sock, syn_rcvd_qd_packet);
788 } // if (slide)
789 else if (!sock_data_to_reassembly_q_unless_overflow(sock, packet)) // && (!slide)
790 {
791 /* Out-of-order packet. Couldn't feed to Receive buffer, so fed to reassembly queue (in sock_data_to_reass...()).
792 * However, if we're here, then that indicated we overflowed reassembly queue and decided to drop the packet
793 * instead. Bail out; which essentially just means don't acknowledge it, as that would occur just below. */
794 return;
795 }
796
797 // Either fed to Receive buffer or reassembly queue. Can certainly acknowledge it at this point.
798
799 // Register one individual acknowledgment of N bytes of data (will go out and acks new, acceptable DATA).
800 rcv_stats.good_to_send_ack_packet(data_size);
801
802 // ACK will happen asynchronously (not in this handler, and at best once UDP net-stack considers itself writable).
803 async_acknowledge_packet(sock, seq_num, packet->m_rexmit_id, data_size); // More rcv_stats kept inside.
804 } // else if (rexmit_on)
805
806 // After changes, log.
807 log_rcv_window(sock);
808} // Node::handle_data_to_established()
809
811 boost::shared_ptr<const Data_packet> packet,
812 bool* dupe, bool* slide, size_t* slide_size)
813{
814 assert(dupe && slide && slide_size);
815
816 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
817 * flow in that caller first.
818 *
819 * Note: not dealing with rcv_stats, as it's less code (assuming 1 call to us anyway) to do it based on our result. */
820
821 // See comment in same spot in handle_data_to_established().
822 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
823 const Sequence_number& rcv_next_seq_num = sock->m_rcv_next_seq_num;
824 const Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
825
826 const auto& data = packet->m_data;
827 const Sequence_number& seq_num = packet->m_seq_num;
828
829 // Get the sequence number just past the last datum in this packet.
830 Sequence_number seq_num_end = seq_num;
831 advance_seq_num(&seq_num_end, data.size());
832
833 // If false, all received packets are followed by all unreceived ones. Otherwise there's at least 1 gap.
834 bool first_gap_exists;
835 // If true, then this is the sequence number of the first datum right after that first gap.
836 Sequence_number seq_num_after_first_gap;
837 rcv_get_first_gap_info(sock, &first_gap_exists, &seq_num_after_first_gap);
838
839 // Validate the 1st sequence number in DATA against the ISN.
840
841 if (seq_num <= sock->m_rcv_init_seq_num)
842 {
843 /* Sequence number precedes or equals the original SYN's sequence number. Either the other side
844 * is an a-hole, or somehow a socket_id was reused from a recent connection, which we do try to
845 * avoid like the plague. Therefore, send them an RST and abort connection. If they send more
846 * data packets to this port (which is quite possible; many could already be on the way),
847 * they'll get more RSTs still. */
848
849 // Interesting/rare enough to log a WARNING.
850 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
851 "Received [" << packet->m_type_ostream_manip << "] with "
852 "sequence number [" << seq_num << "]; data size [" << data.size() << "]; "
853 "sequence number precedes "
854 "ISN [" << sock->m_rcv_init_seq_num << "].");
855
856 return error::Code::S_SEQ_NUM_IMPLIES_CONNECTION_COLLISION; // Bad behavior from other side is fatal.
857 }
 858 // else if (seq_num > sock->m_rcv_init_seq_num)
859
860 if (seq_num < rcv_next_seq_num)
861 {
862 /* The packet claims to begin BEFORE the first gap (i.e., unreceived packet). This may be a
863 * valid duplicate packet. First, though, ensure it's not a "straddling" packet, i.e., that its
864 * last datum's sequence number is not past rcv_next_seq_num. If it is, that would imply one
865 * sequence number's datum is in two packets that are not duplicates of each other which is illegal. */
866
867 if (seq_num_end > rcv_next_seq_num)
868 {
869 // Interesting/rare enough to log a WARNING.
870 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
871 "Received [" << packet->m_type_ostream_manip << "] with "
872 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
873 "data size [" << data.size() << "]; "
874 "straddle first unreceived "
875 "sequence number [" << rcv_next_seq_num << "].");
876
877 // Yep, it straddles the boundary. Other side is behaving badly. RST/close as above.
 878      return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior from other side is fatal, as above.
 879 }
880 // else ([seq_num, end seq_num] is before the first unreceived packet sequence, a/k/a gap)
881
882 FLOW_LOG_TRACE("Duplicate packet before first unreceived sequence number [" << rcv_next_seq_num << "].");
883
884 *dupe = true;
885 *slide = false;
886 return Error_code();
887 } // if (seq_num < rcv_next_seq_num)
888 // else if (seq_num >= rcv_next_seq_num)
889
890 /* Packet claims to be in what TCP would call the receive window (somewhere at or after the
891 * first gap). Pin down in what part of that space it is, in order of increasing seq. #s. */
892
893 // First see if it's right at the start of the first gap.
894
895 if (seq_num == rcv_next_seq_num)
896 {
897 /* Great. It's at the start of the first gap, so we should be able to advance the window
898 * (increment rcv_next_seq_num). First check that it doesn't straddle the next received packet
899 * after the gap, if any. (Again, if it does that means one sequence number is inside 2
900 * packets that aren't dupes of each other, which is illegal.) */
901 if (first_gap_exists && (seq_num_end > seq_num_after_first_gap))
902 {
903 // Interesting/rare enough to log a WARNING.
904 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
905 "Received [" << packet->m_type_ostream_manip << "] with "
906 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
907 "data size [" << data.size() << "]; "
908 "supposed gap-filling data "
909 "straddle the boundary of packet [" << seq_num_after_first_gap << ", ...).");
910
 911      return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior from other side is fatal, as above.
 912 }
913 // else legal -- can slide window to the right and save to Receive buffer.
914
915 FLOW_LOG_TRACE("Packet filled first [" << data.size() << "] unreceived sequence numbers "
916 "starting with [" << rcv_next_seq_num << "].");
917
918 *dupe = false;
919 *slide = true;
920 *slide_size = size_t(seq_num_end - seq_num);
921 assert(*slide_size == data.size());
922 return Error_code();
923 }
924
925 // else if:
926 assert(seq_num > rcv_next_seq_num);
927
928 *slide = false; // This much is certain, as we're not filling the first gap from the front.
929
930 /* Packet doesn't fill that first gap. It's somewhere after the start of the first gap. Now
931 * there are 3 possibilities:
932 *
933 * -1- It's illegal: it straddles the boundary of one of the packets in m_rcv_packets_with_gaps,
934 * meaning some sequence number is inside 2 non-identical packets. RST/close as above.
935 *
936 * -2- It is a duplicate (same starting sequence number and length) of one of the packets
937 * past the first gap (i.e., of the packets in rcv_packets_with_gaps). Thus dupe =
938 * true (we should ACK but not save to Receive buffer).
939 *
940 * -3- It fits into one of the gaps; i.e. its sequence number range is either entirely
941 * before that of rcv_packets_with_gaps; entirely after it; or entirely before the
942 * first sequence number of an element of rcv_packets_with_gaps AND entirely after the
943 * last sequence number of the preceding element of rcv_packets_with_gaps. Thus we
944 * should ACK and save to Receive buffer.
945 *
946 * Determine which one it is.
947 *
948 * @todo Below technique is fun and all, but I now suspect the following might be simpler:
 949 * 1, is seq_num in rcv_packets_with_gaps already? If so but different length, error; if so
950 * but same length, *dupe is true. Otherwise: 2, insert a thing representing `packet` into rcv_packets_with_gaps
951 * as if for real; call inserted thing P. 3, check for straddling against right edge of prior(P), if any;
952 * if so, error. 4, check for straddling against left edge of next(P), if any; if so, error.
953 * 5, *dupe is false. The problem? It requires insertion, when this is supposed to not modify `packet` but only
954 * categorize it. Can of course remove it at the end, but that's cheesy. Can also modify our contract
955 * accordingly, but that reduces separation of concerns in caller's algorithm. Also, possibly the resulting
956 * algorithm might be easier to grok but not much shorter, if at all, anyway. Finally, could leave the
957 * straddling detection to later parts of the algorithm (again, changing our contract to be weaker though).
958 * In any case, not a top concern; and in terms of performance I doubt it would differ much from below. */
959
960 /* Find where we are compared to the various received packets past the first gap.
961 * This gets the first packet whose first sequence number is >= seq_num. There are 3 possibilities:
962 * that is equal to seq_num, past seq_num, or there is no such packet.
963 *
964 * Note that the lookup is O(log n) amortized, and then the subsequent checking is O(1).
965 * This is one of the reasons to use a sorted map by seq. #. */
966 const Peer_socket::Recvd_pkt_const_iter next_packet = rcv_packets_with_gaps.lower_bound(seq_num);
967
968 if (next_packet == rcv_packets_with_gaps.end())
969 {
970 /* There is no packet after ours, and there is no packet equal to ours. Thus we'll just
971 * insert our packet at the end. Check, however, that there is no straddling (-1- above).
972 * What packet's boundary can we straddle? At least the last one (assuming there's a gap). Its
973 * last number may be >= seq_num. (Its first is guaranteed to be < seq_num based on the
974 * above check.) If we don't straddle that boundary, we can't straddle any other packet's boundary,
 975 * since all other packets precede the last one, just check the last one (if it exists). */
976 if (first_gap_exists)
977 {
978 const Peer_socket::Recvd_pkt_const_iter last_packet = prior(rcv_packets_with_gaps.end());
979 Sequence_number seq_num_last_end;
980 get_seq_num_range(last_packet, 0, &seq_num_last_end);
981
982 if (seq_num_last_end > seq_num) // (Corner case check: == means it contiguously precedes `packet`; no straddle.)
983 {
984 // Yep, packet straddles boundary of last_packet.
985
986 // Interesting/rare enough to log a WARNING.
987 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
988 "Received [" << packet->m_type_ostream_manip << "] with "
989 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
990 "data size [" << data.size() << "]; "
991 "supposed middle gap-filling packet data "
992 "straddle the boundary of last packet [..., " << seq_num_last_end << ").");
993
994 // Register one packet with N bytes of data (not acceptable due to error).
995 rcv_stats.error_data_packet(data.size());
 996        return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior from other side is fatal, as above.
 997 }
998 // else OK, we're a new packet that happens to be the newest (by sequence number).
999
1000 FLOW_LOG_TRACE("New packet is newest packet after unreceived gap; "
1001 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1002 "first unreceived packet [" << rcv_next_seq_num << "].");
1003 }
1004 else // if (!first_gap_exists)
1005 {
1006 // OK, we're a new packet that happens to be the packet that forms the first gap by being after that gap.
1007
1008 FLOW_LOG_TRACE("New packet forms gap; sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1009 "first unreceived packet [" << rcv_next_seq_num << "].");
1010 }
1011
1012 *dupe = false;
1013 return Error_code();
1014 } // if (next_packet does not exist)
1015 // else if (next_packet exists at the same or later sequence number as seq_num)
1016
1017 // Get the [range) of sequence numbers in the packet that starts at or after seq_num.
1018 Sequence_number seq_num_next_start, seq_num_next_end;
1019 get_seq_num_range(next_packet, &seq_num_next_start, &seq_num_next_end);
1020
1021 if (seq_num_next_start == seq_num)
1022 {
1023 /* Our first datum has same sequence number as next_packet. Thus it's a duplicate.
1024 * Check, however, that their last sequence numbers are also identical. Otherwise, again,
1025 * one datum is in two different packets, which is illegal. */
1026 if (seq_num_next_end != seq_num_end)
1027 {
1028 // Yep, not a valid duplicate.
1029
1030 // Interesting/rare enough to log a WARNING.
1031 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1032 "Received [" << packet->m_type_ostream_manip << "] with "
1033 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1034 "data size [" << data.size() << "]; "
1035 "do not match supposed "
1036 "duplicate packet [" << seq_num << ", " << seq_num_next_end << ").");
1037
 1038      return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior from other side is fatal, as above.
 1039 }
1040 // else
1041
1042 /* @todo With rexmit_on we can also/instead compare `data` against actual data payload in next_packet -- not just
1043 * the sequence numbers. With !rexmit_on, there's no need to store the payloads, as they're always fed directly
1044 * to user upon receipt, even out of order. */
1045
1046 FLOW_LOG_TRACE("Duplicate packet after unreceived data; "
1047 "sequence numbers [" << seq_num << ", " << seq_num_end << ").");
1048
1049 *dupe = true;
1050 return Error_code();
1051 } // if (seq_num_next_start == seq_num)
1052 // else if:
1053 assert(seq_num_next_start > seq_num); // lower_bound() is not horrifically broken.
1054
1055 // We've eliminated all dupe possibilities above. It's either error or not, at this point.
1056 *dupe = false;
1057
1058 /* Since next_packet starts after `packet`, the best outcome is that packet is entirely
1059 * before next_packet and entirely after prev_packet, where prev_packet == prior(next_packet) (if
1060 * such a thing exists). So we must check that we don't straddle
1061 * either next_packet's starting boundary or prev_packet's ending boundary. All other
1062 * preceding boundaries are straddled if and only if the prev_packet end is, and all
 1063 * succeeding boundaries iff next_packet start is. */
1064
1065 if (seq_num_end > seq_num_next_start) // Corner case check: == means `packet` contiguously precedes next_packet.
1066 {
 1067 // Straddles one or more succeeding packets. RST/close as above.
1068
1069 // Interesting/rare enough to log a WARNING.
1070 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1071 "Received [" << packet->m_type_ostream_manip << "] with "
1072 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1073 "data size [" << data.size() << "]; "
1074 "supposed middle gap-filling packet data "
1075 "straddle the left boundary of packet "
1076 "[" << seq_num_next_start << ", " << seq_num_next_end << ").");
1077
1078 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
1079 }
 1080 // else succeeding packets OK. Check preceding packets.
1081
1082 if (next_packet == rcv_packets_with_gaps.begin())
1083 {
1084 FLOW_LOG_TRACE("New packet partially fills first gap without sliding window; "
1085 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1086 "first unreceived packet [" << rcv_next_seq_num << "].");
1087 return Error_code(); // There are none. We're good.
1088 }
1089
1090 const Peer_socket::Recvd_pkt_const_iter prev_packet = prior(next_packet);
1091 Sequence_number seq_num_prev_start, seq_num_prev_end;
1092 get_seq_num_range(prev_packet, &seq_num_prev_start, &seq_num_prev_end);
1093
1094 if (seq_num_prev_end > seq_num) // Corner case check: == means prev_packet contiguously precedes `packet`.
1095 {
1096 // Straddling one or more preceding packets. RST/close as above.
1097
1098 // Interesting/rare enough to log a WARNING.
1099 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1100 "Received [" << packet->m_type_ostream_manip << "] with "
1101 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1102 "data size [" << data.size() << "]; "
1103 "supposed middle gap-filling packet data "
1104 "straddle the right boundary of packet "
1105 "[" << seq_num_prev_start << ", " << seq_num_prev_end << ").");
1106
1107 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
1108 }
1109 // else preceding packets OK.
1110
1111 FLOW_LOG_TRACE("New packet fills some middle gap; "
1112 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1113 "first unreceived packet [" << rcv_next_seq_num << "].");
1114
1115 return Error_code();
1116} // Node::sock_categorize_data_to_established()
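/* An illustrative summary of the possible outcomes above (made-up sequence numbers): suppose
 * rcv_next_seq_num = 1000 and rcv_packets_with_gaps holds exactly one packet covering [2000, 2100).
 * Then an incoming DATA packet spanning:
 *   - [900, 1000)  => *dupe = true (entirely before the first gap; caller just re-ACKs it);
 *   - [1000, 1100) => *dupe = false, *slide = true, *slide_size = 100 (fills the front of the first gap);
 *   - [1500, 1600) => *dupe = false, *slide = false (lands inside the gap region; caller records it as gapped);
 *   - [2000, 2100) => *dupe = true (exact duplicate of the already-queued packet);
 *   - [950, 1050)  => error returned (straddles rcv_next_seq_num; caller will RST/close the connection). */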
1117
1119 boost::shared_ptr<Data_packet> packet)
1120{
1121 using util::Blob;
1122
1123 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1124 * flow in that caller first. */
1125
1126 // See comment in same spot in handle_data_to_established().
1127 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1128 Blob& data = packet->m_data; // NOT const due to Socket_buffer::feed*(). See below.
1129 // Save this before we possibly destroy data's contents below (for performance).
1130 const size_t data_size = data.size();
1131
1132 size_t buf_size;
1133 {
1134 // Receive Buffer can be consumed by user threads (not W) at the same time. Must lock.
1135 Peer_socket::Lock_guard lock(sock->m_mutex);
1136
1137 /* First we must check if block will fit into sock->m_rcv_buf. Why not just use feed_buf_move()'s
 1138 * max_data_size argument? Because that would allow the block to be partially enqueued, if there's
1139 * space for some but not all of the block. Since we can't partially ACK a packet, we have to
1140 * drop the whole thing in that case.
1141 *
1142 * Round up to a multiple of max-block-size to ensure we never fragment a max-block-size-sized
1143 * chunk of data when they're using unreliable mode! Also apply the slack % to account for
1144 * the fact that rcv_wnd sent to the other side may lag behind reality (the key is to NOT
1145 * apply the slack % when sending rcv_wnd, so that it is more conservative). */
1146 if ((sock->m_rcv_buf.data_size() + data_size)
1147 > sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size,
1148 &sock->m_opts.m_st_rcv_buf_max_size_slack_percent))
1149 {
1150 // Receive buffer overflow.
1151
1152 // Register one packet of N bytes of acceptable data that we unfortunately have to drop due to buffer overflow.
1153 rcv_stats.good_data_dropped_buf_overflow_packet(data_size);
1154
1155 // Not an error but interesting. Might be too verbose for INFO but what the hell.
1156 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
1157 "Received [" << packet->m_type_ostream_manip << "] with "
1158 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << "); "
1159 "data size [" << data_size << "]; "
1160 "dropping because Receive buffer full.");
1161 return false;
1162 }
1163 // else can successfully write to Receive buffer (enough space for entire block).
1164
1165 /* Let's make data available to user! This is a constant-time operation that MOVES
1166 * packet.data's contents into m_rcv_buf (via swap). That's why packet is Ptr and not
1167 * Const_ptr. Note that after that we no longer work with packet -- it's a goner; data.empty()
1168 * is true.
1169 *
1170 * No need to provide max buffer size -- we already checked that's not an issue above. */
1171
1172#ifndef NDEBUG
1173 const size_t written =
1174#endif
1175 sock->m_rcv_buf.feed_buf_move(&data, std::numeric_limits<size_t>::max());
1176 // `data` is now empty.
1177 assert(written == data_size);
1178
1179 buf_size = sock->m_rcv_buf.data_size();
1180 } // lock(sock->m_mutex)
1181
1182 // Register one packet of N bytes of acceptable data that we accepted -- did not drop.
1183 rcv_stats.good_data_accepted_packet(data_size);
1184 // Register one packet of N bytes of acceptable data that we delivered to user.
1185 rcv_stats.good_data_delivered_packet(data_size);
1186 // Register that the Receive buffer grew.
1187 rcv_stats.buffer_fed(buf_size);
1188
1189 // They've sent reasonable data -- so handle the implications on rcv_wnd recovery (if any).
1190 receive_wnd_recovery_data_received(sock);
1191
1192 return true;
1193} // Node::sock_data_to_rcv_buf_unless_overflow()
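/* A worked example of the overflow check above, with illustrative option values: m_st_rcv_buf_max_size = 65536,
 * m_st_rcv_buf_max_size_slack_percent = 10, max-block-size = 1024. The effective ceiling is then
 * ceil_div(65536 * 110 / 100, 1024) * 1024 = ceil_div(72089, 1024) * 1024 = 71 * 1024 = 72704 bytes;
 * an arriving DATA packet is dropped (and deliberately not ACKed) only if adding its payload to
 * m_rcv_buf.data_size() would exceed that figure. */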
1194
1195void Node::sock_rcv_buf_now_readable(Peer_socket::Ptr sock, bool syn_rcvd_qd_packet)
1196{
1197 /* We are told sock now has a non-empty Receive buffer and is thus Readable. Therefore we
1198 * should soon inform anyone waiting on any Event_sets for sock to become Readable.
1199 *
1200 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
1201 * code.
1202 *
1203 * Also: why do this outside the locked block that likely preceded this to actually write to the
1204 * Receive buffer? Avoid possibility of deadlock, since there
1205 * are two mutexes at play: sock->m_mutex (locked in the likely Receive buffer
1206 * update and in event_set_all_check_delta()) and Event_set::m_mutex (which is locked in
1207 * event_set_all_check_delta()). Different mutexes should always be locked in the same order,
1208 * and other threads lock in the sock->m_mutex/event_set->m_mutex order.
1209 *
1210 * Finally: if this packet was not received in ESTABLISHED but rather in SYN_RCVD and saved
1211 * until ESTABLISHED, then we skip this (syn_rcvd_qd_packet).
1212 * Why? Answer: in this case the socket has not yet been
1213 * given to the user (they need to call accept() or equivalent). Therefore, they could not have
1214 * added it to an Event_set and thus are not interested in Readable status on it. (For
 1215 * background on this queueing, see handle_data_to_syn_rcvd().) */
1216
1217 // Accumulate the event into the Node store (note: not any Event_set yet) (if received during ESTABLISHED).
1218 if ((!syn_rcvd_qd_packet) &&
1219 m_sock_events[Event_set::Event_type::S_PEER_SOCKET_READABLE].insert(sock).second)
1220 {
1221 // Possibly inform the user for any applicable Event_sets right now.
1222 event_set_all_check_delta(true);
1223 /* ^-- defer_delta_check == true: because the only way to get to this method is from
1224 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
1225 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
1226 }
1227} // Node::sock_rcv_buf_now_readable()
1228
1230 boost::shared_ptr<const Data_packet> packet,
1231 size_t data_size,
1232 bool* slide, size_t* slide_size)
1233{
1234 using std::make_pair;
1235
1236 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1237 * flow in that caller first. */
1238
1239 *slide = false;
1240 *slide_size = 0;
1241
1242 // See comment in same spot in handle_data_to_established().
1243 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1244 Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1245 const Sequence_number& seq_num = packet->m_seq_num;
1246
1247 /* Since we may increase rcv_packets_with_gaps size below, we may exceed the limit as described
1248 * in m_rcv_packets_with_gaps doc header. (The limit is due to memory concerns.) Let's compute
1249 * that limit. */
1250 const size_t max_packets_after_unrecvd_packet = sock_max_packets_after_unrecvd_packet(sock);
1251
1252 /* A pre-condition is: The received packet is NOT the first (earliest) unreceived packet we're waiting
1253 * for; in other words it is not the packet at the start of the first gap. So we should save
1254 * the packet into rcv_packets_with_gaps. (This will elsewhere help us, at least, detect if this
1255 * packet comes in again [duplicate]. See sock_categorize_data_to_established().) */
1256#ifndef NDEBUG
1257 const auto insert_result =
1258#endif
1259 rcv_packets_with_gaps.insert
1260 (make_pair(seq_num,
1262 // m_rcv_reassembly_q_data_size untouched because !rexmit_on.
1263 assert(!sock->rexmit_on());
1264 assert(insert_result.second); // If was already there, there's some serious bug in above logic.
1265 // No other part of the invariant is violated, so that's it.
1266
1267 bool first_gap_exists;
1268 // The sequence number of the first datum right after the first unreceived gap.
1269 Sequence_number seq_num_after_first_gap;
1270
1271 rcv_get_first_gap_info(sock, &first_gap_exists, &seq_num_after_first_gap);
1272 assert(first_gap_exists);
1273
1274 /* We would be done here, except we need to protect against rcv_packets_with_gaps growing too
1275 * large. This is explained in detail in the m_rcv_packets_with_gaps doc comment. Long story
1276 * short: if we exceed a certain length in this structure, pretend we have "received" the entire
1277 * first gap, which will allow us to slide the window forward and eliminate all the contiguous
1278 * received packets following this gap, of which there will be at least one
1279 * (rcv_packets_with_gaps.begin()), bringing the structure's size back to the limit. */
1280
1281 if (rcv_packets_with_gaps.size() == max_packets_after_unrecvd_packet + 1)
1282 {
1283 // Use these output knobs to reduce rcv_packets_with_gaps.size() after all to avoid overflow.
1284 *slide = true;
1285 *slide_size = size_t(seq_num_after_first_gap - sock->m_rcv_next_seq_num);
1286
1287 // Register unknown # of packets with N bytes of data, which we are assuming are dropped.
1288 rcv_stats.presumed_dropped_data(data_size);
1289
1290 // Not an error but interesting. Might be too verbose for INFO but what the hell.
1291 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
1292 "Received [" << packet->m_type_ostream_manip << "] with "
1293 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << "); "
1294 "exceeded max gapped packet list size [" << max_packets_after_unrecvd_packet << "]; "
1295 "assuming Dropped; "
1296                   "will fake receiving all [" << *slide_size << "] sequence numbers in the first unreceived gap.");
1297 }
1298 else
1299 {
1300 // Our logic shouldn't be allowing the max to be exceeded by more than 1 at any time; we "wrist-slap" it above at 1.
1301 assert(rcv_packets_with_gaps.size() <= max_packets_after_unrecvd_packet);
1302 }
1303} // Node::sock_track_new_data_after_gap_rexmit_off()
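/* Illustrative sketch (standalone, simplified stand-ins -- not the real Peer_socket types): the rexmit-off
 * bookkeeping above in miniature. Out-of-order packets are keyed by sequence number; when saving one more
 * breaches the limit, the caller is told to "slide" the window past the first gap rather than drop data. */
#include <cstddef>
#include <cstdint>
#include <map>

namespace sketch
{
  using Seq_num = std::uint64_t; // Stand-in for Sequence_number; here 1 sequence number == 1 byte.

  // Returns true and sets *slide_size if the map just exceeded the limit, so the window should slide
  // past the first gap (up to the earliest saved packet), as in the code above.
  inline bool track_after_gap(std::map<Seq_num, std::size_t>& gapped_pkts, // seq # -> payload size
                              Seq_num rcv_next_seq_num,                    // First unreceived seq #.
                              Seq_num seq_num, std::size_t data_size,
                              std::size_t max_pkts_after_unrecvd, std::size_t* slide_size)
  {
    gapped_pkts.emplace(seq_num, data_size); // Save it (also lets us detect future duplicates).
    if (gapped_pkts.size() <= max_pkts_after_unrecvd)
    {
      *slide_size = 0;
      return false; // Within limit; nothing else to do.
    }
    // Pretend the entire first gap was received; the caller will then consume the now-contiguous front.
    *slide_size = std::size_t(gapped_pkts.begin()->first - rcv_next_seq_num);
    return true;
  }
} // namespace sketch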
1304
1306 boost::shared_ptr<Data_packet> packet)
1307{
1308 using std::make_pair;
1309
1310 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1311 * flow in that caller first. */
1312
1313 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1314 Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1315 const Sequence_number& seq_num = packet->m_seq_num;
1316
1317 auto& data = packet->m_data; // NOT const due to the move into Received_packet; see below.
1318 // Save this before we possibly destroy data's contents below (for performance).
1319 const size_t data_size = data.size();
1320
1321 /* Since we will increase rcv_packets_with_gaps size below, we may exceed the limit as
1322 * described in m_rcv_packets_with_gaps doc header. (The limit is due to memory concerns.)
1323 * Let's compute that limit. */
1324 size_t max_packets_after_unrecvd_packet = sock_max_packets_after_unrecvd_packet(sock);
1325
1326 /* Update: Actually, that limit is (as noted in the doc header for Peer_socket::m_rcv_packets_with_gaps, whose
1327 * growth we are constraining here) more of a formality, as in practice things like sender's CWND or
1328   * sender's following our rcv-wnd guidance should keep the size of this retransmission queue much lower than
1329 * the limit that was just computed. However! There IS a retransmission-enabled-exclusive limit we should
1330 * apply here, and it may at times be applied in practice, unlike what we just computed. Namely, consider
1331 * that if we receive N in-order, fully populated (up to max-block-size) DATA packets, and NxMBS exceeds
1332 * max-on-Receive-buffer, then indeed we will drop the overflowing portion and not put into Receive buffer;
1333 * but if we don't receive 1 in-order packet, get the next (N - 1) packets, and then finally get the one
1334 * missing DATA packet, then they will all be delivered to Receive buffer without a problem. (The next in-order
1335 * packet would indeed hit overflow, unless user dequeues some. This only highlights the oddness.)
1336 * Why? Because the above-computed limit is far higher than the equivalent max-on-Receive-buffer configuration
1337 * (typically), so the reassembly queue would be loaded up with stuff without hitting any limit, and the
1338 * code that dequeues from reassembly queue into Receive buffer does not follow any overflow logic (nor can it,
1339 * really, since by that point those DATA packets have long since been ACKed, and we do not renege ACKs).
1340 * Long story short, that is not good, and we should simply apply the max-on-Receive-buffer to not just
1341 * the Receive buffer but to this reassembly queue PLUS the Receive buffer.
1342 *
1343 * Caution! This policy means the rcv-wnd advertisements to the other side must follow this policy too.
1344 *
1345 * OK, make the computation as described. First compute the max-on-Receive-buffer, same as when actually computing
1346 * that when enqueueing that structure. Then subtract how much of it we've used in actual Receive buffer.
1347 * What remains is what's allowed for rcv_packets_with_gaps:
1348 *
1349 * Rbufdata + Rqdata <= Rbufmax <==> Rqdata <= Rbufmax - Rbufdata = S.
1350 * S_blocks = floor(S / max-block-size).
1351   * Ensure Rqcurdata_blocks + 1 <= S_blocks.
1352 *
1353 * This is about right but actually slightly oversimplified, because that limit assumes the data are packed
1354 * in max-block-sized packets except possibly the last one. In reality the existing payload of the reassembly queue
1355 * may be not stored so efficiently (who knows how stuff got packetized or supplied by user or both?). To compute
1356 * this quite carefully (maybe overkill, but I feel deterministically understood to be correct = a good thing), we
1357 * model it as the queue already storing what it's storing; and we must allow a certain number of packets
1358 * on top of that and no more; and the question is whether that's enough for the incoming 1 DATA packet.
1359 * So then, we want this:
1360 *
1361 * Ensure Rqcurdata_blocks + 1 <= Rqcurdata_blocks + Sleft_blocks.
1362 * Sleft_blocks = # additional packets allowed by policy = floor(Sleft / max-block-size).
1363 * Sleft = max(Rbufmax - Rqcurdata - Rbufdata, 0).
1364 *
1365 * So we're doctoring it: we know Rqcurdata_blocks = rcv_packets_with_gaps.size() are already used; so we will
1366 * allow some # of packets beyond that, and the question is what is that # according to our policy? Well, it's just
1367 * the configured limit minus the used Receive buffer in bytes and minus the sum of rcv_packets_with_gaps's bytes.
1368 * Since we're using bytes there, that's the maximum possible accuracy, without any inefficiency being assumed to
1369 * not exist. Note that we have Rqcurdata* being subtracted from Rqcurdata* on one side, and that may seem like
1370 * those should cancel each other out to zero, but no -- that was the case in the simpler model above, but the more
1371   * realistic one means those are (slightly, potentially) different. */
1372 size_t max_packets_in_reassembly_q
1373 = sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size,
1374 &sock->m_opts.m_st_rcv_buf_max_size_slack_percent);
1375 // We have to momentarily lock sock due to access to sock->m_rcv_buf.
1376 size_t rcv_buf_size;
1377   {
1378     Peer_socket::Lock_guard lock(sock->m_mutex);
1379 rcv_buf_size = sock->m_rcv_buf.data_size(); // This access requires locking.
1380 }
1381 util::subtract_with_floor(&max_packets_in_reassembly_q, rcv_buf_size) && // [sic]
1382 util::subtract_with_floor(&max_packets_in_reassembly_q, sock->m_rcv_reassembly_q_data_size);
1383 // Convert from bytes to max-block-sizes. Note this is the floor of the division (so it is strict).
1384 max_packets_in_reassembly_q /= sock->max_block_size();
1385 /* Okay, we have Sleft in blocks now; add this for direct comparison to the left side, which will be .size() + 1,
1386 * where the 1 is the incoming packet `packet`. Full-circle, this is `Rqcurdata_blocks + Sleft_blocks` from
1387 * the above big comment. */
1388 max_packets_in_reassembly_q += rcv_packets_with_gaps.size();
1389
1390 // The final limit is the lower of the two limits; realistically we expect max_packets_in_reassembly_q to "win."
1391 if (max_packets_in_reassembly_q < max_packets_after_unrecvd_packet)
1392 {
1393 max_packets_after_unrecvd_packet = max_packets_in_reassembly_q;
1394 }
1395 else
1396 {
1397 // Not an error but pretty weird configuration (but too verbose for INFO, if it really does occur).
1398 FLOW_LOG_TRACE("Unexpected Receive buffer limits: safety net [" << max_packets_after_unrecvd_packet << "] <= "
1399 "real limit [" << max_packets_in_reassembly_q << "], but the opposite is typical. "
1400 "See details just below."); // See next log message.
1401 }
1402
1403 if (rcv_packets_with_gaps.size() + 1 > max_packets_after_unrecvd_packet)
1404 {
1405 /* Overflow. Drop this new packet instead of queueing it. Note that this is different
1406 * from the handling of the same situation in the no-retransmit case. In that case, this
1407 * situation is probably more common under loss, since once a packet is considered Dropped by sender, it is NEVER
1408 * re-sent; thus Receiver eventually also considers it Dropped and (instead of dropping
1409 * the new packet, which would be a disastrous policy) simply pretends the gap has been
1410 * filled, thus consolidating the front of rcv_packets_with_gaps. */
1411
1412 // Register one packet of N bytes of acceptable data that we unfortunately have to drop due to overflow.
1414
1415 // This is an error, though not our fault.
1416 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1417 "Received [" << packet->m_type_ostream_manip << "] with "
1418 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << "); "
1419 "exceeded max gapped packet list size [" << max_packets_after_unrecvd_packet << "]; "
1420 "dropping packet.");
1421 return false;
1422 }
1423 // else we can insert into reassembly queue (priority queue by seq. #) rcv_packets_with_gaps.
1424
1425 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
1426 "Enqueueing [" << packet->m_type_ostream_manip << "] payload onto reassembly queue with "
1427 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << ") "
1428 "of size [" << data_size << "]; "
1429 "successfully fit into max gapped packet list size [" << max_packets_after_unrecvd_packet << "]; "
1430 "could have fit [" << (max_packets_after_unrecvd_packet - rcv_packets_with_gaps.size()) << "] more.");
1431
1432 // This decimates `data` but is constant time, much like the buffer enqueueing done elsewhere.
1433#ifndef NDEBUG
1434 const auto insert_result =
1435#endif
1436 rcv_packets_with_gaps.insert
1437 (make_pair(seq_num, // Decimation occurs in here: ------------------v, hence the `&`: -------------v.
1439 sock->m_rcv_reassembly_q_data_size += data_size;
1440 assert(insert_result.second); // If was already there, there's some serious bug in above logic.
1441 // No other part of the invariant is violated, so that's it.
1442
1443 // DO NOT use `data` from this point forward -- it was just emptied by moving into the new Received_packet.
1444
1445 // Register one packet of N bytes of acceptable data that we accepted -- did not drop.
1446 rcv_stats.good_data_accepted_packet(data_size);
1447 // Register one packet of N bytes of acceptable data that we queued for reassembly -- not yet in Receive buffer.
1448 rcv_stats.good_data_first_qd_packet(data_size);
1449
1450 // They've sent reasonable data -- so handle the implications on rcv_wnd recovery (if any).
1451 receive_wnd_recovery_data_received(sock);
1452
1453 return true;
1454} // Node::sock_data_to_reassembly_q_unless_overflow()
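/* Illustrative sketch (standalone; hypothetical parameter names, not the real option identifiers): the
 * retransmission-enabled admission check above, reduced to its arithmetic. The combined budget covers the
 * Receive buffer plus the reassembly queue; whatever is left over, measured in max-block-size blocks, is how
 * many more packets may be queued. E.g., with a 65536-byte budget, 40960 bytes already buffered, 8192 bytes
 * already queued, and 1024-byte blocks: 65536 - 40960 - 8192 = 16384 bytes left, i.e. 16 more packets. */
#include <algorithm>
#include <cstddef>

namespace sketch
{
  // true => enqueue the incoming DATA packet for reassembly; false => drop it (overflow). Contrast with the
  // rexmit-off case, where the window slides instead (that sender never retransmits data it considers Dropped).
  inline bool reassembly_q_can_accept(std::size_t rcv_buf_max_bytes,   // Combined buffer+queue budget.
                                      std::size_t rcv_buf_cur_bytes,   // Bytes now in Receive buffer.
                                      std::size_t reassembly_q_bytes,  // Bytes now queued for reassembly.
                                      std::size_t reassembly_q_pkts,   // Packets now queued.
                                      std::size_t max_block_size,
                                      std::size_t safety_net_max_pkts) // Ratio-based limit (rarely binding).
  {
    const std::size_t used = rcv_buf_cur_bytes + reassembly_q_bytes;
    const std::size_t sleft = (rcv_buf_max_bytes > used) ? (rcv_buf_max_bytes - used) : 0;
    const std::size_t limit_pkts
      = std::min(reassembly_q_pkts + (sleft / max_block_size), safety_net_max_pkts);
    return (reassembly_q_pkts + 1) <= limit_pkts;
  }
} // namespace sketch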
1455
1456void Node::sock_slide_rcv_next_seq_num(Peer_socket::Ptr sock, size_t slide_size, bool reassembly_in_progress)
1457{
1458 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1459 * flow in that caller first. */
1460
1461 // See comment in same spot in handle_data_to_established().
1462 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1463 Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1464 Sequence_number& rcv_next_seq_num = sock->m_rcv_next_seq_num;
1465
1466 /* OK, caller determined that the front of the gap between rcv_next_seq_num and
1467 * seq_num_after_first_gap has been received. Indeed mark this fact by sliding the former to a higher value,
1468 * indicating sliding right of the left edge of the receive window, in TCP terminology. */
1469 rcv_next_seq_num += slide_size; // Use op+= over advance_seq_num(): slide_size is of Sequence_numbers, not bytes.
1470
1471 FLOW_LOG_TRACE("First unreceived packet pointer moved from "
1472 "[" << (rcv_next_seq_num - slide_size) << "] to "
1473 "[" << rcv_next_seq_num << "].");
1474
1475   /* Now update the receive window structure. Maintain invariant described in doc headers
1476    * for m_rcv_packets_with_gaps and related members. Additionally, IF retransmission-related
1477 * reassembly is in progress (presumably, because retransmission is enabled), and if the new packet bridged
1478 * gap to the first seq.-#-contiguous packet(s) in the reassembly queue, then add their data to Receive buffer
1479 * also. */
1480
1481 // Start of range to delete.
1482 const Peer_socket::Recvd_pkt_iter start_contig_it = rcv_packets_with_gaps.begin();
1483 // End of range to delete (just past last element to delete).
1484 Peer_socket::Recvd_pkt_iter end_contig_it;
1485 size_t total_written = 0;
1486
1487 // The following loop is O(n) worst case.
1488 for (end_contig_it = start_contig_it;
1489 /* Search until the infinite gap is found; or the first finite gap is found.
1490 * Note invariant at entry to each loop iteration: rcv_next_seq_num is seq. # just past last received
1491 * packet's data (so for contiguousness, it must equal the 1st seq. # in next packet). */
1492 (end_contig_it != rcv_packets_with_gaps.end()) && (end_contig_it->first == rcv_next_seq_num);
1493 ++end_contig_it)
1494 {
1495 Peer_socket::Received_packet& rcvd_packet = *end_contig_it->second;
1496
1497 if (reassembly_in_progress)
1498 {
1499 /* Receive Buffer can be consumed by user threads (not W) at the same time. Must lock.
1500 * @todo Probably possible to make the critical section smaller.
1501 *
1502 * Conversely, maybe it's better to lock around the entire while () loop, for potentially less
1503 * locking/unlocking while another thread is reading from buffer, which intuitively "feels" churn-y.
1504 * Arguments against: the loop may have 0 iterations, meaning the locking was a waste; also, locking
1505 * once per packet is no worse in aggregate than if we'd received these packets in order without
1506 * needing reassembly -- and that's the much more typical state of affairs; so it's not like we're
1507 * adding some unusually excessive amount of locking/unlocking by locking once per packet during
1508 * reassembly. */
1509 size_t written;
1510 size_t buf_size;
1511 {
1512 Peer_socket::Lock_guard lock(sock->m_mutex);
1513
1514 /* Reassemble! This is constant-time. Note we don't check for overflow here, but that's because we
1515 * checked for it cleverly in first enqueueing this in rcv_packets_with_gaps
1516 * (see sock_data_to_reassembly_q_unless_overflow()). */
1517 written = sock->m_rcv_buf.feed_buf_move(&rcvd_packet.m_data, std::numeric_limits<size_t>::max());
1518 // rcvd_packet.m_data is now empty.
1519 buf_size = sock->m_rcv_buf.data_size();
1520 }
1521 total_written += written;
1522
1523 // Similarly to when receiving a first-gap-filling (or just in-order, if there is no gap) DATA packet:
1524 rcv_stats.good_data_delivered_packet(written);
1525 rcv_stats.buffer_fed(buf_size);
1526
1527 assert(written != 0);
1528 }
1529
1530 advance_seq_num(&rcv_next_seq_num, rcvd_packet.m_size);
1531
1532 FLOW_LOG_TRACE("First unreceived packet pointer moved again to "
1533 "[" << rcv_next_seq_num << "]; packet subsumed by this move.");
1534   } // for (keep encountering contiguous packets)
1535
1536 // The following, according to STL requirements, is O(k + log n), where k is # erased; thus O(n) worst case.
1537 rcv_packets_with_gaps.erase(start_contig_it, end_contig_it); // Does nothing if end_contig_it == start_contig_it.
1538 sock->m_rcv_reassembly_q_data_size -= total_written;
1539} // Node::sock_slide_rcv_next_seq_num()
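/* Illustrative sketch (standalone, simplified stand-ins; shows only the reassembly-enabled path): the
 * slide-and-drain loop above in miniature. After the first gap is filled (or faked as filled via slide_size),
 * every packet now contiguous with the window's left edge is appended to the Receive buffer and erased from
 * the gap map, stopping at the next gap. */
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

namespace sketch
{
  using Seq_num = std::uint64_t; // 1 sequence number == 1 byte, as in NetFlow DATA.

  inline void slide_and_drain(Seq_num* rcv_next_seq_num, std::size_t slide_size,
                              std::map<Seq_num, std::vector<unsigned char>>* gapped_pkts, // seq # -> payload
                              std::vector<unsigned char>* rcv_buf)
  {
    *rcv_next_seq_num += slide_size; // The first gap is now considered received.

    auto it = gapped_pkts->begin();
    // Consume packets that start exactly at the window's left edge; the first mismatch is the next gap.
    while ((it != gapped_pkts->end()) && (it->first == *rcv_next_seq_num))
    {
      rcv_buf->insert(rcv_buf->end(), it->second.begin(), it->second.end()); // "Reassemble."
      *rcv_next_seq_num += it->second.size();
      it = gapped_pkts->erase(it);
    }
  }
} // namespace sketch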
1540
1542{
1543 /* The limit itself is not an option but rather computed from other options to be
1544 * more dynamic. Let N be the desired max ratio of rcv_packets_with_gaps.size() * max-block-size
1545 * to the max Receive buffer size, expressed in percent. Then the max
1546 * rcv_packets_with_gaps.size() value is N% * <max Receive buffer size> / max-block-size / 100%.
1547 * N is the option m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent. */
1548 return uint64_t(sock->opt(sock->m_opts.m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent)) *
1549 uint64_t(sock->opt(sock->m_opts.m_st_rcv_buf_max_size)) /
1550 uint64_t(sock->max_block_size()) /
1551 100;
1552}
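/* Worked example of the formula above, with hypothetical option values: if
 * m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent = 200, m_st_rcv_buf_max_size = 65536, and
 * max_block_size() = 1024, then the cap is 200 * 65536 / 1024 / 100 = 128 packets saved past the first
 * unreceived packet. */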
1553
1555 bool* first_gap_exists, Sequence_number* seq_num_after_first_gap)
1556{
1557 // If false, all received packets are followed by all unreceived ones. Otherwise there's at least 1 gap.
1558 *first_gap_exists = !sock->m_rcv_packets_with_gaps.empty();
1559 // If true, then this is the sequence number of the first datum right after that first gap.
1560 if (*first_gap_exists)
1561 {
1562 *seq_num_after_first_gap = sock->m_rcv_packets_with_gaps.begin()->first;
1563 }
1564}
1565
1566void Node::async_acknowledge_packet(Peer_socket::Ptr sock, const Sequence_number& seq_num, unsigned int rexmit_id,
1567 size_t data_size)
1568{
1569 // We are in thread W.
1570
1571 // Plenty of info logged in caller, so don't re-log.
1572 FLOW_LOG_TRACE("Accumulating for acknowledgment.");
1573
1574 // Register one packet with N bytes of data (not necessarily acceptable data).
1575 sock->m_rcv_stats.total_to_send_ack_packet(data_size);
1576
1577 const size_t acks_pending_before_this = sock->m_rcv_pending_acks.size();
1578
1579 static_assert(std::is_aggregate_v<Peer_socket::Individual_ack>,
1580 "We want it to be direct-initializable.");
1581 static_assert((!std::is_copy_constructible_v<Peer_socket::Individual_ack>)
1582 && (!std::is_copy_assignable_v<Peer_socket::Individual_ack>),
1583 "We want it to be noncopyable but rather passed-around via its ::Ptr.");
1584
1585 /* Just the starting sequence number sufficient to identify a single packet. The time point saved
1586 * here is subtracted from time_now() at ACK send time, to compute the artificial delay introduced
1587 * by ACK delaying (explained just below). This helps other side calculate a more accurate RTT by
1588    * subtracting the ACK delay from its RTT measurement. */
1589   sock->m_rcv_pending_acks.push_back
1590     (Peer_socket::Individual_ack::Ptr
1591 (new Peer_socket::Individual_ack{ seq_num, rexmit_id, Fine_clock::now(), data_size }));
1592
1593 /* m_rcv_pending_acks now stores at least one packet to acknowledge. We can acknowledge it
1594 * immediately (modulo UDP layer availability of course). However, suppose there is a fast stream
1595 * of packets coming in, such that several DATA packets were read in within one
1596 * low_lvl_recv_and_handle() call. Then each DATA packet will result in one ACK packet.
1597 * This introduces a ton of overhead, as the header is quite large given that the payload is just
1598 * a Sequence_number. Instead we would want to pack all the DATA packets' acknowledgments into
1599 * one ACK packet (unless it overflows, in which case create more ACK packets as needed). So we
1600 * only accumulate the individual acknowledgments here; we will possibly send the actual ACK(s) in
1601 * perform_accumulated_on_recv_tasks(), which runs at the end of low_lvl_recv_and_handle() (or its
1602 * bro, the async part of async_wait_latency_then_handle_incoming()).
1603 *
1604 * Caveat: The above is rock-solid if the different DATA packets being acked were contiguous to
1605 * each other chronologically. What if there is another type of packet between some two of these
1606 * DATAs? Well, it depends on what it is. Ignoring the misbehaving/duplicate/whatever packets
1607 * (SYN, for example) -- which will just be discarded basically -- let's consider the
1608 * possibilities. If the packet is ACK, then it is irrelevant; NetFlow (like TCP) is full-duplex
1609 * (actually more so, since there's no DATA+ACK piggy-backing), therefore the micro-ordering of
1610 * traffic in opposite directions is irrelevant. If the packet is RST, then that means the socket
1611 * will get closed (no longer ESTABLISHED) before we get a chance to send any of the individual
1612 * acknowledgments. However, that is more or less OK; if the other side sent RST, then they won't
1613    * accept any ACKs we may send them anyway. The only other possibility has to do with graceful close,
1614 * but that is not yet implemented.
1615 * @todo Revisit this when graceful close is implemented. (Preliminary idea: force immediate ACK
1616 * handling when FIN/etc. detected? Or something.) */
1617
1618 if (m_socks_with_accumulated_pending_acks.insert(sock).second)
1619 {
1620 /* First acknowledgment to be accumulated in this handler (low_lvl_recv_and_handle() or
1621 * async part of async_wait_latency_then_handle_incoming()). So mark down whether at that time there were
1622 * already timer-delayed acknowledgments pending (and how many). See
1623 * sock_perform_accumulated_on_recv_tasks() for details on delayed ACKs. */
1624 sock->m_rcv_pending_acks_size_at_recv_handler_start = acks_pending_before_this;
1625 }
1626 // else already had registered pending acknowledgment in this handler.
1627} // Node::async_acknowledge_packet()
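/* Illustrative sketch (standalone; hypothetical names, std::chrono standing in for Fine_clock): the purpose
 * of the time stamp saved above. When the ACK finally goes out, the elapsed delay is reported alongside the
 * acknowledgment; the original sender then subtracts that reported delay from its raw measurement, so our
 * deliberate ACK delaying does not inflate its RTT estimate. (On the wire the delay is encoded in a coarser
 * unit; the sketch ignores that detail.) */
#include <chrono>

namespace sketch
{
  using Clock = std::chrono::steady_clock;

  struct Pending_ack
  {
    Clock::time_point m_received_when; // Saved when the DATA packet arrived, as in Individual_ack above.
  };

  // Receiver side, at ACK send time (delayed-ACK timer fired, or an immediate ACK was forced).
  inline Clock::duration ack_delay(const Pending_ack& ack)
  {
    return Clock::now() - ack.m_received_when;
  }

  // Sender side, upon receiving the acknowledgment: correct the raw measurement.
  inline Clock::duration corrected_rtt(Clock::duration raw_rtt, Clock::duration reported_ack_delay)
  {
    return raw_rtt - reported_ack_delay;
  }
} // namespace sketch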
1628
1630{
1631 using boost::chrono::milliseconds;
1632 using boost::chrono::microseconds;
1633 using boost::chrono::duration_cast;
1634 using boost::chrono::round;
1635 using std::vector;
1636
1637 // We are in thread W.
1638
1639 // For background see Node::perform_accumulated_on_recv_tasks().
1640
1641 // For brevity and speed:
1642 vector<Peer_socket::Individual_ack::Ptr>& pending_acks = sock->m_rcv_pending_acks;
1643
1644 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
1645 {
1646 // For example, we got DATA and then RST on the same socket almost simultaneously.
1647 FLOW_LOG_TRACE("Was about to perform accumulated acknowledgment tasks on [" << sock << "] but skipping because "
1648 "state is now [" << sock->m_int_state << "].");
1649 return;
1650 }
1651
1652 // Check explicit pre-condition.
1653 assert(!pending_acks.empty());
1654
1655 /* Deal with any accumulated acknowledgments. Naively, we'd simply call async_low_lvl_ack_send()
1656 * here, which would take pending_acks and bundle them up into as few as possible ACK
1657 * packets and send them off.
1658 *
1659 * However, we potentially instead use delayed ACKing as in typical TCP implementations (based on
1660 * various standard RFCs). The idea is that a few DATA packets have come in around the same time,
1661 * but not close enough to be handled in one receive handler. So upon detecting the first DATA
1662 * packet in the steady state, start a timer; until it fires accumulate more packets in
1663 * pending_acks; and when it fires finally assemble and flush (send) the ACK(s). Something else may trigger
1664 * the flushing of the ACK(s) ahead of this timer or even immediately.
1665 *
1666 * These are situations where we must short-circuit the timer and send the ACK(s)
1667 * immediately:
1668 *
1669 * 1. From TCP (RFC 5681-4.2), which says that an ACK should be generated for at
1670 * least every second full-sized (data size = MSS) incoming data segment. The reasoning is
1671 * two-fold: causing bursty sending by the receiver of the ACKs; and slowing down slow start
1672 * in Reno (and others) congestion control. The latter is not really a problem for us (since
1673 * ACKs are not cumulative but selective and handled as such by our congestion control logic);
1674 * but the former is definitely an easily demonstrable issue. @todo This paragraph is difficult
1675 * to understand right now. There might be 1 or more unintentional meaning inversions, wherein
1676    * I mean to say X is good, but instead say X is bad, or vice versa, or at least it's unclear. Research;
1677 * rephrase.
1678 *
1679 * 2. Also from TCP (RFC 5681-3.2), which says that an ACK should be
1680 * immediately generated upon detecting an out-of-order data segment. This is to inform
1681 * congestion control of any loss event as soon as possible (Fast Recovery algorithm).
1682 *
1683 * Note that TCP RFCs don't account for the implementation detail that several packets can be
1684 * received "simultaneously" (in one handler in our case), nor for selective ACKs (in this
1685 * context), so when they say we must send an ACK for every 2 incoming segments at least, we do
1686 * not take this literally. Instead, we just say that if (here, after a full receive handler has
1687 * run) there are at least 2 full blocks' worth of pending acknowledgments (there could be many
1688 * more in theory) and/or there's an out-of-order DATA packet, then we send immediate ACK(s), thus
1689 * following the spirit of the rules in the RFC. The spirit of the rule is to short-circuit the
1690 * timer the moment at least 2 full packets can be acknowledged.
1691 *
1692 * We detect both of these situations below and act accordingly. We also start the delayed ACK
1693 * timer, if necessary, otherwise. Oh, and there's a mode to disable delayed ACKs.
1694 *
1695 * @todo We may also force immediate ACKing during graceful shutdown. Revisit when graceful
1696 * shutdown is implemented.... */
1697
1698 const Fine_duration delayed_ack_timer_period = sock->opt(sock->m_opts.m_st_delayed_ack_timer_period);
1699
1700 bool force_ack = delayed_ack_timer_period == Fine_duration::zero(); // Delayed ACKs disabled.
1701
1702 if (force_ack)
1703   {
1704     FLOW_LOG_TRACE
1705 ("Delayed [ACK] feature disabled on [" << sock << "]; forcing immediate [ACK]. "
1706 "Receive window state: [" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1707 "| " << sock->m_rcv_packets_with_gaps.size() << ":{...}.");
1708 }
1709 else if (!sock->m_rcv_packets_with_gaps.empty())
1710 {
1711 /* Scan to see if there was an out-of-order DATA packet. That is to say, have we received a
1712 * DATA packet -- i.e., have we queued a pending acknowledgment in this receive handler -- that
1713 * follows at least one unreceived packet in the sequence number space.
1714 *
1715 * There is a gap in the received sequence number space, so this is potentially possible. Scan
1716 * only the DATA packets (acknowledgments) accumulated in THIS handler (since previous ones
1717 * have already been checked, and unreceived gaps can't just appear out of nowhere later). If
1718      * any is past the first gap, it qualifies. (Checking against the first gap suffices: if it's past any gap, it's
1719 * past the first gap.) */
1720     Peer_socket::Individual_ack::Const_ptr ack;
1721     for (size_t ack_idx = sock->m_rcv_pending_acks_size_at_recv_handler_start;
1722 ack_idx != pending_acks.size(); ++ack_idx)
1723 {
1724 ack = pending_acks[ack_idx];
1725 if (ack->m_seq_num > sock->m_rcv_next_seq_num)
1726 {
1727 force_ack = true;
1728 break;
1729 }
1730 }
1731
1732 if (force_ack)
1733     {
1734       FLOW_LOG_TRACE
1735 ("On [" << sock << "] "
1736 "received out-of-order packet [" << ack->m_seq_num << ", size " << ack->m_data_size << ", "
1737 "rexmit " << ack->m_rexmit_id << "]; "
1738 "forcing immediate [ACK]. "
1739 "Receive window state: [" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1740 "| " << sock->m_rcv_packets_with_gaps.size() << ":{...}.");
1741 }
1742 }
1743 if (!force_ack)
1744 {
1745 // No out-of-order stuff. See if there are at least N * max-block-size bytes pending to be acknowledged.
1746
1747 const size_t limit // Default 2.
1748 = sock->opt(sock->m_opts.m_st_max_full_blocks_before_ack_send) * sock->max_block_size();
1749 size_t bytes = 0;
1750 for (Peer_socket::Individual_ack::Const_ptr ack : pending_acks)
1751 {
1752 bytes += ack->m_data_size;
1753 if (bytes >= limit)
1754 {
1755 force_ack = true;
1756 break;
1757 }
1758 }
1759
1760 if (force_ack)
1761 {
1762 FLOW_LOG_TRACE("On [" << sock << "] "
1763 "accumulated at least [" << limit << "] bytes to acknowledge; "
1764 "forcing immediate [ACK].");
1765 }
1766 }
1767
1768 // OK; force_ack is set finally.
1769
1770 if (force_ack)
1771 {
1772 /* Yep, must send ACK(s) now. There are two possibilities. One, a delayed ACK timer may
1773 * already be running. If so, we should cancel it and send immediately. If the cancel fails
1774 * (returns 0 tasks canceled), then it was already queued to fire very soon, so we should
1775 * just let the ACKing happen that way instead of sending immediately.
1776 *
1777 * Two, a timer is not running, so we shouldn't cancel and should just send immediately.
1778 *
1779 * How to determine if timer is currently running? If
1780 * m_rcv_pending_acks_size_at_recv_handler_start == 0, then the timer was either never scheduled
1781 * (only scheduled when pending_acks.empty()) or was triggered and handled before the current
1782 * handler; therefore it is not running. Otherwise, there were pending acks to send, yet they
1783 * were not sent by the end of the last handler, which means the timer must be running.
1784 *
1785 * (There may be some corner case I'm not imagining such that the timer was running even while
1786 * m_rcv_pending_acks_size_at_recv_handler_start == 0, but even then the worst that will happen is
1787 * that we will perform the ACKing here, not cancel that wait, and that timer will
1788 * harmlessly expire with the timer handler doing nothing.) */
1789
1790 if (sock->m_rcv_pending_acks_size_at_recv_handler_start != 0)
1791 {
1792 FLOW_LOG_TRACE("On [" << sock << "] "
1793 "canceling delayed [ACK] timer due to forcing "
1794 "immediate [ACK]; would have fired "
1795 "in [" << round<milliseconds>(sock->m_rcv_delayed_ack_timer.expiry() - Fine_clock::now()) << "] "
1796 "from now.");
1797
1798 if (sock->m_rcv_delayed_ack_timer.cancel() == 0)
1799 {
1800 /* Unlikely but legitimate; timer was queued to trigger very soon, so we could not
1801 * cancel it. No problem -- just let the ACKing happen per timer. Log INFO due to
1802 * rarity of this situation. */
1803 FLOW_LOG_INFO("On [" << sock << "] "
1804 "tried to cancel delayed [ACK] timer while "
1805 "forcing [ACK], but it was already just about to fire.");
1806 force_ack = false;
1807 }
1808 } // if (m_rcv_pending_acks_size_at_recv_handler_start != 0) [timer was running]
1809
1810 // If still forcing immediate ACK, finally do it.
1811 if (force_ack)
1812 {
1813 async_low_lvl_ack_send(sock);
1814 assert(pending_acks.empty());
1815 }
1816 } // if (force_ack)
1817 else // if (!force_ack)
1818 {
1819 /* There are pending individual acks but no reason to send them off right now. The only
1820 * remaining question is whether we need to schedule the delayed ACK timer to send them
1821 * later. That depends on whether the timer is already running. If
1822 * m_rcv_pending_acks_size_at_recv_handler_start == 0, then the timer was either never scheduled
1823 * or was triggered and handled before the current handler; therefore it is not running. So
1824 * in that case we should start it, as we've just received our first ackable DATA since
1825 * we've sent off our last ACK. If m_rcv_pending_acks_size_at_recv_handler_start != 0, then the
1826      * timer must be running, because there were pending acks to send, yet they were not sent by
1827 * the end of the last handler (which would have caused this very code to schedule the
1828 * timer).
1829 *
1830 * (There may be some corner case I'm not imagining such that the timer was running even while
1831 * m_rcv_pending_acks_size_at_recv_handler_start == 0, but even then it can't possibly be set to
1832 * the right time [which is S_DELAYED_ACK_TIMER_PERIOD for now], so we need to re-set it
1833 * anyway. [Re-setting the expiry time will cancel that running timer wait. Even if that
1834 * somehow fails, the worst case is that the ACK(s) will be sent prematurely.]) */
1835
1836 if (sock->m_rcv_pending_acks_size_at_recv_handler_start == 0)
1837 {
1838 // First individual acknowledgment accumulated: start countdown to send the next batch of acknowledgments.
1839
1840 sock->m_rcv_delayed_ack_timer.expires_after(delayed_ack_timer_period);
1841
1842 FLOW_LOG_TRACE("On [" << sock << "] "
1843 "scheduled delayed [ACK] timer to fire "
1844 "in [" << round<milliseconds>(delayed_ack_timer_period) << "].");
1845
1846 // When triggered or canceled, call this->async_low_lvl_ack_send(sock, <error code>).
1847 sock->m_rcv_delayed_ack_timer.async_wait([this, socket_id, sock](const Error_code& sys_err_code)
1848 {
1849 async_low_lvl_ack_send(sock, sys_err_code);
1850 });
1851 // ^-- defer_delta_check == false: for similar reason as in send_worker_check_state() calling send_worker().
1852 }
1853 // else the timer is already started, so just accumulating onto pending_acks is enough. Done.
1854 } // if (!force_ack)
1855
1856 // Register the current # of DATA packets to acknowledge. Note that we're near the end of current handler.
1857 sock->m_rcv_stats.current_pending_to_ack_packets(pending_acks.size());
1858} // Node::sock_perform_accumulated_on_recv_tasks()
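/* Illustrative sketch (standalone; hypothetical parameter names): the force-vs-delay decision implemented
 * above, reduced to a predicate. An immediate ACK is warranted when delayed ACKs are disabled, when an ack
 * accumulated in this handler is for an out-of-order packet, or when at least N full blocks' worth of data is
 * pending acknowledgment; otherwise the pending acks wait for the delayed-ACK timer. (The sketch ignores the
 * timer-cancellation race handled above: if the running timer cannot be canceled, the ACK simply goes out via
 * the timer path.) */
#include <cstddef>

namespace sketch
{
  inline bool should_force_immediate_ack(bool delayed_acks_disabled,
                                         bool acked_out_of_order_packet,         // Any new ack past the first gap?
                                         std::size_t pending_ack_bytes,          // Sum over pending individual acks.
                                         std::size_t max_full_blocks_before_ack, // E.g., 2, per RFC 5681's spirit.
                                         std::size_t max_block_size)
  {
    return delayed_acks_disabled
           || acked_out_of_order_packet
           || (pending_ack_bytes >= (max_full_blocks_before_ack * max_block_size));
  }
} // namespace sketch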
1859
1860void Node::log_rcv_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging) const
1861{
1862 using std::vector;
1863 using std::string;
1864 using boost::algorithm::join;
1865
1866 // We're in thread W.
1867
1868 // For brevity and a little speed:
1869 const Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1870
1871 // force_verbose_info_logging => log the most detail, as INFO (if INFO logging enabled).
1872
1873 auto const logger_ptr = get_logger();
1874 if (((!logger_ptr) || (!logger_ptr->should_log(log::Sev::S_DATA, get_log_component()))) &&
1875 (!(force_verbose_info_logging && logger_ptr->should_log(log::Sev::S_INFO, get_log_component()))))
1876 {
1877     // Can't print entire Receive-window data structure, but can print a summary, if TRACE enabled.
1878     FLOW_LOG_TRACE
1879       ("Receive window state for [" << sock << "]: "
1880 "[" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1881 "| " << rcv_packets_with_gaps.size() << ":{...}.");
1882 return;
1883 }
1884 // else
1885
1886 /* Construct full printout of the packets we've received past the first unreceived gap.
1887 *
1888 * Very verbose and slow! Even so, if it gets beyond a certain size it's absurd, so skip some in
1889 * that case even though DATA logging is sanctioned. (That amount of data cannot really be useful
1890 * in any case.) */
1891
1892 vector<string> pkt_strs;
1893 pkt_strs.reserve(rcv_packets_with_gaps.size());
1894
1895 const size_t MAX_TO_SHOW = 100;
1896 bool skipped_some = false;
1897 size_t count = 0;
1898
1899 for (Peer_socket::Recvd_pkt_const_iter pkt_it = rcv_packets_with_gaps.begin();
1900 pkt_it != rcv_packets_with_gaps.end();
1901 ++pkt_it)
1902 {
1903 const bool last_iteration = (count == rcv_packets_with_gaps.size() - 1);
1904
1905 if ((!skipped_some) && (count > MAX_TO_SHOW) && (!last_iteration))
1906 {
1907 // First packet past the limit we can print. Start skipping mode.
1908 skipped_some = true;
1909 ++count;
1910 continue;
1911 }
1912     // else either we are in skipping mode from before, or we are not in skipping mode.
1913
1914 string pkt_str;
1915
1916 if (skipped_some)
1917 {
1918 // We are in skipping mode from before.
1919 if (!last_iteration)
1920 {
1921 // Since it's not the last iteration, skip: print nothing.
1922 ++count;
1923 continue;
1924 }
1925       // else we are in skipping mode from before, and this is the last iteration. Print the placeholder.
1926 pkt_str = "[...skipped...] ";
1927 }
1928 // Either we are not in skipping mode (just print the thing) or we are and it's last iteration (also print it).
1929
1930 Sequence_number start, end;
1931 get_seq_num_range(pkt_it, &start, &end);
1932
1933 util::ostream_op_to_string(&pkt_str, '[', start, ", ", end, ')');
1934 pkt_strs.push_back(pkt_str);
1935
1936 ++count;
1937 } // for (packets in rcv_packets_with_gaps)
1938
1939   FLOW_LOG_WITH_CHECKING
1940     (force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_DATA,
1941 "Receive window state for [" << sock << "]: "
1942 "[" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1943 "| " << rcv_packets_with_gaps.size() << ":{" << join(pkt_strs, " ") << "}.");
1944} // Node::log_rcv_window()
1945
1947 boost::shared_ptr<const Ack_packet> ack)
1948{
1949 // We are in thread W.
1950
1951 /* packet is an ACK, so its payload consists of at least m_rcv_wnd (the current advertised Receive
1952 * buffer space on the receiver) and packet->m_rcv_acked_packets, which is basically a list of ZERO or
1953 * more sequence numbers, each of which represents a packet we'd (hopefully) sent that the
1954 * receiver has received. Naively we'd just handle the window update and each individual ack here
1955 * in a loop, then inform congestion control, etc. etc. However there is an optimization to make.
1956 * Suppose in the calling low_lvl_recv_and_handle() or async-part-of-async_wait_latency_then_handle_incoming()
1957 * there are several more ACKs for this socket sock that will be received. This may well happen in
1958 * high traffic; for instance the sender may have had too many individual acks for one ACK and
1959 * thus sent several; or maybe the UDP net-stack had a few packets ready by the time boost.asio was
1960    * free in thread W. In this case, it is better to collect all the individual acks in these
1961 * several ACKs, and then handle them all at the same time. Why? Answer: it will update our
1962 * sender state (what's ACKed, what's dropped) entirely in one go instead of doing it in two or
1963 * more steps. Because congestion control activities ("on drop event," "on acknowledgment") are
1964 * performed after handling all the available acks, it gives a truer, simpler picture to the
1965 * congestion control module, when compared to giving it one picture and then almost instantly
1966 * giving it another. Another way to think of it is simply that since the different ACKs arrived
1967 * at the same time, and all an ACK is is a collection of individual acks that could fit into the
1968 * ACK packet, then conceptually this is no different from being one super-ACK with all the
1969 * individual acks contained in it. Therefore it is at least not worse.
1970 *
1971 * (In addition, m_rcv_wnd also affects the decision on whether to send more data over the wire,
1972 * as can_send() is part of that same algorithm.)
1973 *
1974 * Caveat: The above is rock-solid if the different ACKs being combined were contiguous to each
1975 * other chronologically. What if there is another type of packet between some two of these ACKs?
1976 * Well, it depends on what it is. Ignoring the misbehaving/duplicate/whatever packets (SYN, for
1977 * example) -- which will just be discarded basically -- let's consider the possibilities. If
1978 * the packet is DATA, then it is irrelevant; NetFlow (like TCP) is full-duplex (actually more so,
1979 * since there's no DATA+ACK piggy-backing), therefore the micro-ordering of traffic in opposite
1980 * directions is irrelevant. If the packet is RST, then that means the socket will get closed (no
1981 * longer ESTABLISHED) before we get a chance to process any of the individual acknowledgments.
1982 * However, that is more or less OK; if the other side sent RST, then they won't accept any
1983 * further data we may send after processing the acknowledgments anyway. The only other
1984    * possibility has to do with graceful close, but that is not yet implemented.
1985 * @todo Revisit this when graceful close is implemented. (Preliminary idea: accumulate DATA and
1986 * FIN/etc. packets and always handle them after handling ACKs. Then the DATA/FIN stream will not
1987 * have a chance to disrupt (by initiating closing the connection) the ACK handling, while the ACK
1988 * handling should have no bearing on the DATA/FIN stream.)
1989 *
1990 * So, let's accumulate the individual acks in packet->m_rcv_acked_packets into a big
1991 * sock->m_rcv_acked_packets to be handled from perform_accumulated_on_recv_tasks() at the end of the
1992 * current handler. Similarly save m_rcv_wnd into sock->m_pending_rcv_wnd. To let that method
1993 * know sock has a new m_pending_rcv_wnd and possibly non-empty sock->m_rcv_acked_packets, insert sock
1994 * into m_socks_with_accumulated_acks. */
1995
1996 /* Note: We're not setting the actual sock->m_snd_remote_rcv_wnd until
1997 * perform_accumulated_on_recv_tasks().
1998 *
1999 * Also note: the latest ACK to arrive in this receive handler will contain the most up-to-date
2000 * rcv_wnd value (previous ones are overwritten by this). */
2001 sock->m_snd_pending_rcv_wnd = ack->m_rcv_wnd;
2002
2003 // It's a (ref-counted) pointer copy. Note there may be 0 elements there, if it's just an m_rcv_wnd update alone.
2004 sock->m_rcv_acked_packets.insert(sock->m_rcv_acked_packets.end(), // Append.
2005 ack->m_rcv_acked_packets.begin(), ack->m_rcv_acked_packets.end());
2006 m_socks_with_accumulated_acks.insert(sock); // May already be in there.
2007
2008 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
2009 "Received and accumulated [" << ack->m_type_ostream_manip << "] with "
2010 "[" << ack->m_rcv_acked_packets.size() << "] individual acknowledgments "
2011 "and rcv_wnd = [" << ack->m_rcv_wnd << "]; total for this socket in this "
2012 "receive handler is [" << sock->m_rcv_acked_packets.size() << "] individual acknowledgments.");
2013
2014 sock->m_snd_stats.received_low_lvl_ack_packet(ack->m_rcv_acked_packets.empty());
2015} // Node::handle_ack_to_established()
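/* Illustrative sketch (standalone, simplified stand-ins): the accumulation performed above. Individual acks
 * from every ACK packet seen during one receive handler are appended to a single per-socket list; the
 * advertised rcv_wnd is simply overwritten, so only the latest value survives; and the socket is remembered
 * for one combined pass at the end of the handler. */
#include <cstddef>
#include <set>
#include <vector>

namespace sketch
{
  struct Individual_ack_info { /* seq #, rexmit ID, ACK delay, ... */ };

  struct Socket_ack_state
  {
    std::vector<Individual_ack_info> m_accumulated_acks; // Stand-in for sock->m_rcv_acked_packets.
    std::size_t m_pending_rcv_wnd = 0;                   // Stand-in for sock->m_snd_pending_rcv_wnd.
  };

  inline void accumulate_ack(Socket_ack_state* sock,
                             std::set<Socket_ack_state*>* socks_with_accumulated_acks,
                             const std::vector<Individual_ack_info>& acks_in_this_packet,
                             std::size_t rcv_wnd_in_this_packet)
  {
    sock->m_pending_rcv_wnd = rcv_wnd_in_this_packet; // Latest ACK's window wins.
    sock->m_accumulated_acks.insert(sock->m_accumulated_acks.end(),
                                    acks_in_this_packet.begin(), acks_in_this_packet.end());
    socks_with_accumulated_acks->insert(sock); // Handled once, at the end of the current receive handler.
  }
} // namespace sketch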
2016
2018{
2019 using std::min;
2020 using std::vector;
2021 using boost::tuple;
2022 using boost::unordered_set;
2023 using boost::chrono::round;
2024 using boost::chrono::milliseconds;
2025 using boost::chrono::seconds;
2026
2027 /* This is a complex method that does many things. Therefore readability is hard to accomplish, as the logic
2028 * makes sense when writing it, but the big picture is hard to see when reading it. The necessary heavy commenting
2029 * further increases the size and therefore (along that dimension) decreases readability. For these reasons,
2030 * many logically distinct parts were placed into helper methods -- not to increase code reuse but to help
2031 * the aforementioned consideration. */
2032
2033 // We are in thread W.
2034
2035 log_accumulated_acks(sock);
2036 // Below TRACE messages omit most of the just-logged detail, since it's already logged now.
2037
2038 // For brevity and a little speed:
2039 using Acks = vector<Ack_packet::Individual_ack::Ptr>;
2040 Acks& acked_packets = sock->m_rcv_acked_packets;
2041 /* To not put already-handled acknowledgments up for handling again in the next run of this method
2042 * (which would be wrong), we must clear acked_packets before exiting this method. To be safe,
2043 * make sure acked_packets.clear() runs no matter how this method exits. */
2044 util::Auto_cleanup cleanup = util::setup_auto_cleanup([&]() { acked_packets.clear(); });
2045
2046 /* Handle all the acknowledgments we've received in this receive handler. Background on the
2047 * accumulation tactic is in handle_ack_to_established(). As explained in that method, some
2048 * packet between the first and last ACK received in this handler may have changed state away from
2049 * ESTABLISHED. For example, there could have been an RST. Check for that. */
2050 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
2051 {
2052 // Rare/interesting enough for INFO.
2053 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2054 "Accumulated [ACK] packets with [" << acked_packets.size() << "] "
2055 "individual acknowledgments, but state is now [" << sock->m_int_state << "]; ignoring ACKs forever.");
2056 return;
2057 }
2058 // else OK. Handle the accumulated acknowledgments.
2059 assert(sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED);
2060
2061 /* The individual acknowledgments are (sequence number, ACK delay in unit X, retransmission ID)
2062 * triples, where the latter is always zero unless retransmission is enabled. Let's handle each
2063 * one by updating m_snd_flying_pkts* (i.e., removing that packet from m_snd_flying_pkts*) and
2064 * informing congestion control. Before continuing reading the method please look at the large
2065 * comments for Peer_socket::m_snd_flying_pkts_by_{sent_when|seq_num} (drawing a diagram might also help).
2066 *
2067 * Before continuing, quick discussion of corner cases:
2068 *
2069 * Any two given such triples may have equal sequence number/retransmission ID entries. This
2070 * means that during the last ACK delay timer or boost.asio handler, while accumulating the
2071 * acknowledgments for this ACK packet, the receiver received the same packet twice (duplicate).
2072 * (This can happen due to network misbehavior; and due to ACK loss and other conditions when
2073 * retransmits are enabled.) Call 2 such packets P1 and P2, where P1 was received first and thus
2074 * appears earlier in acked_packets. How do we handle this?
2075 *
2076 * Suppose instead of being in the same ACK, P1 and P2 were in different ACKs that arrived in that
2077 * order, P1 and P2 (something that certainly could happen depending on how the delayed ACK timer
2078 * works out). That situation is basically the same (except that if they're in one ACK there's
2079 * the added guarantee that we KNOW what P1 is acknowledging arrived to the receiver before
2080 * what P2 was acknowledging did, which is even more solid knowledge). Therefore, it makes sense
2081 * to simply handle each acknowledgment in the ACK in the order they're listed in acked_packets.
2082 * The 2nd, 3rd, etc. occurrence will thus be treated the same way as if it arrived in a later
2083 * ACK. */
2084
2085 /* Congestion control: First see introduction to this topic in class Congestion_control_strategy
2086 * doc header. Then resume here.
2087 *
2088 * Since information stored in ACKs is of paramount importance to how congestion control views the
2089 * pipe, congestion control is very relevant in this method: this method is the main (but not
2090 * only) source of events for m_snd_cong_ctl.
2091 *
2092 * These ACK-based events are of interest to m_snd_cong_ctl:
2093 *
2094 * - on_acks(N, M): N bytes in M packets have just been converted from In-flight to
2095 * Acknowledged. Note that these packets have NOT been previously Acknowledged or considered
2096 * Dropped (they are In-flight just before the ACK).
2097 * - This should also be immediately preceded with M on_individual_ack(N', T, CWND) events, where N'
2098 * is the # of bytes in the individual acked packet; and T is the RTT of the packet, and CWND is the
2099 * # of bytes in cwnd that was used when the acked data pkt was sent.
2100 * In the rest of the discussion I omit this event, as it can be thought of as part of
2101 * on_acks() for purposes of the discussion.
2102 * - on_loss_event(N', M'): N' bytes in M' packets have just been converted from In-flight to
2103 * Dropped.
2104 *
2105 * The basic idea is to determine which of these events are implied by the acks passed to this
2106 * method, inform m_snd_cong_ctl, and then check if the new m_snd_cong_ctl->congestion_window_bytes()
2107 * value (a/k/a CWND) -- if it has changed -- allows us to now send more bytes (if we have any).
2108 *
2109 * An important decision (and one sadly not very explicitly exposed [perhaps as an exercise to the
2110 * reader, or to avoid being too rigid] in the various TCP RFCs) is how to group these events and
2111 * in what order. In other words, do we call on_acks(N, 1) for each acknowledged packet? Do we
2112 * then check for drops and call on_loss_event(N', M') immediately, or wait to process all acked
2113 * packets first?
2114 *
2115 * The answer we choose is simple. First, scan all individual (i.e., for each sent packet) acks
2116 * given to us and update m_snd_flying_pkts_by_seq_num (the "scoreboard"). While doing so keep track of
2117 * the cumulative N and M. Having done that, we will also expose zero or more In-flight packets
2118 * as Dropped. (In this method, a given packet is exposed as Dropped if the total number of
2119 * acknowledged packets AFTER that packet exceeds a constant like 2. So basically if the acks we
2120 * process here make that counter exceed that limit for a given packet P, P is Dropped and removed
2121 * from m_snd_flying_pkts_by_seq_num.) So after the ack scanning phase, tally up all packets now
2122 * considered Dropped, which gives us N' and M'.
2123 *
2124 * Finally, call on_loss_event(N', M') (assuming N' and M' are not zero). And then call
2125    * on_acks(N, M) (assuming N and M are not zero).
2126 *
2127 * Let's justify this. First, assume it's correct to tally these things up and then just
2128 * call each method once. Is the "report loss, report acks" order right? Yes. Intuitively,
2129 * m_snd_cong_ctl wants to know about events in the chronological order they occur. While the Drop(s)
2130 * are detected at the same time as the Ack(s), the actual packet dropping INFERRED from the
2131 * Ack(s) occurred in the past; we're only deducing it now. The received Acks are in fact for
2132 * packets AFTER the now-Dropped packets. Hence this is the right order.
2133 *
2134 * Now the only remaining thing is to justify combining the ack and drop events in one (each). For
2135 * acknowledgments, it's straightforward: so far, most Congestion_control_strategy modules
2136 * don't need to know about each individual ack, so for simplicity/efficiency we can just combine
2137 * them. (However, some algorithms do need it; e.g., FAST would need it; still, many don't.
2138 * Other modules, like Send_bandwidth_estimator, may also care about individual acks.)
2139 *
2140 * What about the drop events? Why combine all the drops into one? Should we include all the
2141 * drops into the one? To answer, I use as a reference DCCP CCID 2 RFC 4341 (standards track)
2142 * which describes a protocol similar to ours and implies the following model. Basically, over
2143 * time, the pipe experiences a series of 0 or more congestion events (more accurately loss
2144 * events). Two loss events cannot overlap in this implied model. Thus any given Dropped packet
2145 * belongs to exactly one loss event. Here is how the RFC (section 5) more or less formally
2146 * defines whether 2 packets belong to one event: "As in TCP, two losses [...] are considered part
2147 * of a single congestion event when the second packet was sent before the loss [...] of the first
2148 * packet was detected." Presumably the text also assumes that the "second" packet was
2149 * found to be dropped either at the same or later time as the "first" packet was found to be
2150 * dropped (otherwise the text makes no sense, as the very earliest Dropped packet would be in the
2151    * same congestion event as the very last Dropped packet in a very long session). Let's build an
2152 * algorithm inductively based on this definition.
2153 *
2154 * At first there are no loss events. We get a group of acks which render another group of
2155 * packets P1, P2, ... (in order of increasing sequence number) Dropped. Certainly P1 is in a
2156 * loss event; call it L1. P2 was found to be dropped at the same or later time as P1; and it was
2157 * obviously sent before L1 was detected (which was NOW; call it T1). So P2 is in loss event L1.
2158 * Similarly, so is P3, P4, .... Now let's say some time passes and we get more acks and thus
2159 * dropped packets P7, P8, P9, .... Suppose P7 was sent before T1 (but found Dropped at T2 > T1),
2160 * which is quite possible (e.g., T2 could be just after T1). Then by the above definition P7 is
2161 * in loss event L1 (no new loss event). P8 could be in the same situation. In fact, all Dropped
2162 * packets from this ack group may be in L1. Suppose, conversely, that P9 was sent AFTER T1. By
2163 * the above definition, it is part of a new loss event L2, detected at T2. Now P10, is certainly
2164 * in L2 as well, since it was sent before T2, obviously. Thus we can, for each Dropped packet P,
2165 * determine whether it's part of the preceding loss event or part of a new one.
2166 *
2167 * Intuitively, it makes sense as well. If, say, we got 5 dropped packets at the same time, and
2168 * informed Congestion_control_classic (Reno) with 5 calls to on_loss_event(), then CWND would get
2169 * halved 5 times! Intuitively that's not right (and way too conservative). More likely the 5
2170 * packets belong to the same congestion or loss event, so CWND should only be halved once. Then
2171 * the only question is how to group packets into separate loss events. The above algorithm,
2172 * roughly speaking, considers two packets as part of the same loss event if they're within an RTT
2173 * of each other (indeed RFC 4341 says one can use the SRTT to approximate the above algorithm,
2174 * although we choose to use the exact definition instead).
2175 *
2176 * Therefore the final algorithm is justified and is as follows:
2177 *
2178 * 0. Before the current method is ever called, set time stamp m_snd_last_loss_event_when =
2179 * -infinity.
2180 * 1. Scan all acknowledgments, updating m_snd_flying_pkts* and m_snd_flying_bytes.
2181 * Keep track of total acknowledgment stats (bytes and packets). (Inform side modules like
2182 * Send_bandwidth_estimator with any required individual ack info like RTTs.)
2183 * Ignore acks of packets not in m_snd_flying_pkts* (not In-flight).
2184 * 2. Tally up which packets are exposed as Dropped by the above m_snd_flying_pkts* updates.
2185 * Keep track of total loss stats (bytes and packets). However, when doing the latter ignore
2186 * any packet P for which P.m_sent_when < m_snd_last_loss_event_when.
2187 * 3. If at least 1 packet exposed as Dropped in step 2, call
2188 * m_snd_cong_ctl->on_loss_event(...stats...); and set m_snd_last_loss_event_when to the current time,
2189 * marking this the start of a new loss event.
2190 * 4. If at least 1 packet exposed as Acknowledged in step 1, call
2191 * m_snd_cong_ctl->on_acks(...stats...). */
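 /* A minimal distillation of the grouping rule in steps 2-3, as an illustrative predicate (not part of this
  * method; parameter names are hypothetical):
  *
  *   // Per step 2: a freshly exposed Dropped packet counts toward a *new* loss event unless it was
  *   // sent before the previous loss event was detected (in which case it folds into that event and
  *   // is excluded from the stats passed to on_loss_event()).
  *   bool counts_toward_new_loss_event(const Fine_time_pt& packet_sent_when,
  *                                     const Fine_time_pt& last_loss_event_when)
  *   {
  *     return packet_sent_when >= last_loss_event_when;
  *   }
  *
  * Hence a burst of drops detected within one RTT of each other typically yields a single on_loss_event()
  * (one CWND reduction under Reno-style control), not one per dropped packet. */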
2192
2193 // Set up work state and save certain "before" values.
2194
2195 // For RTT at least. Use steady, high-res clock. Use one coherent value for entire method to simulate simultaneity.
2196 const Fine_time_pt time_now = Fine_clock::now();
2197
2198 // For brevity and a little speed:
2199 const bool rexmit_on = sock->rexmit_on();
2200 auto& snd_stats = sock->m_snd_stats;
2201 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
2202 /* These guys are only stored in Peer_socket (instead of creating locally here) for a bit of performance. Reuse now.
2203 * Note that clear() is a very fast operation; it will essentially just set the internal element count to 0. */
2204 auto& pkts_marked_to_drop = sock->m_snd_temp_pkts_marked_to_drop;
2205 pkts_marked_to_drop.clear();
2206
2207 // To check, at the end, whether we've changed can_send() false => true.
2208 const bool could_send_before_acks = can_send(sock);
2209 // To check, at the end, whether we've changed snd_deqable() false => true.
2210 const bool had_rexmit_data_before_acks = !sock->m_snd_rexmit_q.empty();
2211
2212 /* Includes each order number (unique packet ID) for which the packet was acknowledged.
2213 * Used for Drop_timer events to register at the bottom; and also to feed the priority queue high_ack_count_q
2214 * (explained below in detail). */
2215 unordered_set<Peer_socket::order_num_t> flying_now_acked_pkts;
2216
2217 // These are the N, M arguments to on_acks() described just above in big comment.
2218 size_t clean_acked_bytes = 0;
2219 size_t clean_acked_packets = 0;
2220
2221   /* These are the individual T, N', CWND (RTT, acked_bytes, sent_cwnd_bytes) arguments to pass to on_individual_ack() described
2222 * just above in big comment. We will be accumulating these across all the acks in the loop below. */
2223 using Clean_acked_packet = tuple<Fine_duration, size_t, size_t>;
2224 vector<Clean_acked_packet> clean_acked_packet_events;
2225 clean_acked_packet_events.reserve(min(acked_packets.size(), snd_flying_pkts_by_when.size())); // Small optimization.
2226
2227 /* Handle each acknowledgment in the order that the corresponding packet was received by other
2228 * side (earliest to latest) per above discussion. */
2229 for (const Ack_packet::Individual_ack::Const_ptr ack : acked_packets)
2230 {
2231 /* Use helper to classify this individual ack as one of the following:
2232 * - Malformed/illegal. => error_ack is true. Else:
2233 * - Legal but referring to an already-acknowledged packet, or arriving too late. => dupe_or_late is true.
2234 * - The packet being acknowledged is unknown. => flying_pkt_it == past_oldest() (a/k/a end()).
2235 * - The packet being acknowledged is known. => flying_pkt_it points to that acked packet.
2236 * - Legal and acking a not-yet-acked packet, arriving in time. => dupe_or_late is false.
2237 * => flying_pkt_it points to that acked packet.
2238 * Note: The helper takes care of snd_stats updating, closing socket on error, and relevant logging. */
2239
2241 bool dupe_or_late;
2242
2243 const bool error_ack = !categorize_individual_ack(socket_id, sock, ack, &dupe_or_late, &flying_pkt_it);
2244 if (error_ack)
2245 {
2246 return; // Fatal error for entire socket (malformed ack, etc.). Socket is closed; all logged; bail out now.
2247 }
2248 // else
2249
2250 // Note these may never be initialized.
2251 Fine_duration round_trip_time;
2252 Peer_socket::Sent_packet::Ptr flying_pkt;
2253 const Peer_socket::Sent_packet::Sent_when* sent_when;
2254
2255 // Compute RTT, assuming we ID'ed the original DATA. (RTT logged even if we still throw away the ack just below.)
2256 if (flying_pkt_it != snd_flying_pkts_by_when.past_oldest())
2257 {
2258 // Use helper to compute RTT and, as a side effect, get `Sent_when* sent_when` set to point to the appropriate structure.
2259 flying_pkt = flying_pkt_it->second;
2260 round_trip_time = compute_rtt_on_ack(flying_pkt, time_now, ack, &sent_when); // It logs details.
2261 } // Done computing (if possible) RTT and logging it.
2262
2263 if (dupe_or_late)
2264 {
2265 continue; // Do NOT return! There may well be valid individual acks after it. All logged; move on to the next one.
2266 }
2267
2268 // else it's an in-time acking of DATA packet that has not yet been acked (is considered In-flight)!
2269 assert(!dupe_or_late);
2270 // The following is guaranteed by helper above, since !dupe_or_late. Hence, also, flying_pkt, sent_when, RTT set.
2271 assert(flying_pkt_it != snd_flying_pkts_by_when.past_oldest());
2272 assert(flying_pkt);
2273
2274 // Update SRTT, etc.
2275 new_round_trip_time_sample(sock, round_trip_time);
2276
2277 /* Similarly, inform congestion control (see big comment at top of method). Some strategies
2278 * use individual acks to gauge the pipe's properties. Save the info to
2279 * later call on_individual_ack(). Why not just call
2280 * it here? Answer: Congestion_control_strategy interface specifies that
2281 * on_individual_ack() must be called AFTER on_loss_event() (which can only be called once
2282 * we've fully updated snd_flying_pkts, thus handled all acks). It also specifies that
2283 * snd_flying_pkts must be updated to reflect the handled ack. So we have no choice but
2284 * to save it. (@todo Performance?) */
2285 const size_t bytes_acked = flying_pkt->m_size;
2286 const size_t cwnd_bytes = sent_when->m_sent_cwnd_bytes;
2287 clean_acked_packet_events.emplace_back(round_trip_time, bytes_acked, cwnd_bytes);
2288
2289 // Maintain invariant. Packet acknowledged, so remove from In-flight packet list and related structures.
2290 snd_flying_pkts_erase_one(sock, flying_pkt_it);
2291
2292 // Bona fide In-flight->Acknowledged data; accumulate to inform congestion control below.
2293 clean_acked_bytes += bytes_acked;
2294 ++clean_acked_packets;
2295
2296 /* If we got here, then it is in fact what we consider a valid acknowledgment of packet
2297 * sent at time sent_when. Therefore, we should increment m_acks_after_me for any packet that has NOT
2298 * been acknowledged that was sent earlier than sent_when. (Later we'd consider Dropped any
2299 * packets for which this value is too high, as in TCP Fast Recovery/Retransmit.) Note that if
2300 * retransmission is off, that's the same as all packets with a lower first sequence number.
2301 * However if retransmission is on, then a packet may have a lower sequence number but be sent
2302 * later. Thus we use sent_when and not seq_num.
2303 *
2304 * Naively, we could just have a for () loop here to increment all such data members. However
2305 * that's inefficient -- order O(k * n), where k = acked_packets.size() and n =
2306 * snd_flying_pkts*.size(), in the worst case. Moreover, some of the Sent_packet structs in
2307 * which we increment m_acks_after_me may be acknowledged and thus erased from snd_flying_pkts*
2308 * in subsequent iterations of the for () loop we are in, wasting that work.
2309 *
2310 * So instead we count the individual acks in a hash map that maps sent_when to the number of
2311 * times (in this ACK) that sequence number's packet was validly acknowledged. This is O(k)
2312 * amortized total. Then elsewhere we use that hash map to more efficiently update m_acks_after_me
2313 * where appropriate. In addition, this hash map is used to register certain Drop_timer
2314 * events at the end of the method. */
2315
2316 /* Note that we track these by "order number"; each sent packet (no matter if retransmitted or
2317 * not) gets a unique order number, higher than all previous. Since no two packets will have
2318 * the same order number, we keep a set of order numbers. */
2319 flying_now_acked_pkts.insert(sent_when->m_order_num);
2320 } // for (all acked_packets)
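 /* To make the accumulation above concrete, a hypothetical example (values invented for illustration):
  * say this ACK set cleanly acknowledged packets with order numbers {17, 19, 22}, sizes
  * {1024, 1024, 512} bytes, and measured RTTs {40ms, 42ms, 38ms}. At this point we would have:
  *   clean_acked_bytes == 2560; clean_acked_packets == 3;
  *   clean_acked_packet_events == { (40ms, 1024, cwnd_17), (42ms, 1024, cwnd_19), (38ms, 512, cwnd_22) };
  *   flying_now_acked_pkts == { 17, 19, 22 };
  * and those three packets are no longer in snd_flying_pkts*. Congestion control, the bandwidth
  * estimator, and the Drop_timer consume these accumulators below, only after loss detection, per the
  * ordering constraints discussed at the top of the method. */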
2321
2322 /* snd_flying_pkts* is updated w/r/t removing the In-flight-now-acked packets. Now, realize that
2323 * for a given packet P still In-flight, if packets sent BEFORE it have just become acked, intuitively
2324 * it raises the probability P has been lost and should be considered Dropped. In fact, as explained in
2325 * helper categorize_pkts_as_dropped_on_acks(), if one finds the latest-sent such packet P, then all
2326 * packets sent before it should be dropped as well. So, let's find this latest-sent P: */
2327 const Peer_socket::Sent_pkt_ordered_by_when_iter last_dropped_pkt_it
2328 = categorize_pkts_as_dropped_on_acks(sock, flying_now_acked_pkts);
2329
2330 /* OK, so P and all In-flight packets sent before it must be dropped. This helper takes all the actions
2331 * necessary (or at least records data we use to take such actions below) w/r/t all those packets.
2332 * Namely: erases them from snd_flying_pkts*; accumulates packet and bytes counts to do with these
2333 * dropped packets; saves the packet IDs for Drop timer purposes into pkts_marked_to_drop. */
2334 size_t dropped_pkts;
2335 size_t dropped_bytes;
2336 size_t cong_ctl_dropped_bytes;
2337 size_t cong_ctl_dropped_pkts;
2338 if (!drop_pkts_on_acks(sock, last_dropped_pkt_it,
2339 &cong_ctl_dropped_pkts, &cong_ctl_dropped_bytes,
2340 &dropped_pkts, &dropped_bytes, &pkts_marked_to_drop))
2341 {
2342 return; // Already closed/logged/etc. (too many retransmissions probably).
2343 }
2344
2345 // As long promised since the top of this method, let congestion control (and B/W estimator) know what happened!
2346
2347 /* Bandwidth estimation: It can be useful to estimate the available outgoing bandwidth (available
2348 * meaning the total bandwidth of the empty pipe minus any other traffic other than this
2349 * connection [NetFlow or otherwise] currently occupying this pipe). Mostly it's useful for certain
2350 * congestion control strategies like Congestion_control_classic_with_bandwidth_est, but it may be
2351 * good information to have if only for the user's general information. Therefore we keep an
2352 * independent m_snd_bandwidth_estimator regardless of the congestion control strategy in use.
2353 * Like Congestion_control_strategy, it updates its state based on events. It currently cares
2354 * about at least one event: on_acks(N), where N is the number of bytes acknowledged. This is
2355 * very similar to the on_acks(N, M) event for congestion control (see above). None of the other
2356 * aspects of the above discussion (such as loss events) apply to m_snd_bandwidth_estimator. */
2357
2358 // Note that the order is as required by Congestion_control_strategy() API: loss, individual acks, consolidated acks.
2359
2360 // Report loss event info to congestion control.
2361 if (dropped_pkts != 0)
2362 {
2363 // @todo Might be too verbose to keep it as INFO!
2364 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2365 "Considering Dropped: [" << dropped_bytes << "] bytes = [" << dropped_pkts << "] packets.");
2366
2367 if (cong_ctl_dropped_pkts != 0) // Again, cong_ctl_dropped_pkts != dropped_pkts, potentially.
2368 {
2369 // New loss event!
2370 assert(cong_ctl_dropped_bytes != 0); // Empty blocks not allowed (should have been eliminated by now).
2371
2372 FLOW_LOG_INFO("cong_ctl [" << sock << "] update: loss event: "
2373 "Dropped [" << cong_ctl_dropped_bytes << "] bytes "
2374 "= [" << cong_ctl_dropped_pkts << "] packets.");
2375
2376 sock->m_snd_cong_ctl->on_loss_event(cong_ctl_dropped_bytes, cong_ctl_dropped_pkts);
2377 sock->m_snd_last_loss_event_when = Fine_clock::now();
2378
2379 // As a silver lining, we probably got some nice new acknowledgments following that drop.
2380 }
2381 }
2382 else
2383 {
2384 assert(dropped_pkts == 0);
2385 assert(cong_ctl_dropped_pkts == 0);
2386 }
2387
2388 if (clean_acked_packets != 0)
2389 {
2390 assert(clean_acked_bytes != 0); // Empty blocks not allowed (should have been eliminated by now).
2391 assert(!clean_acked_packet_events.empty());
2392
2393 // Report individual (clean) acks to congestion control.
2394 for (const auto& [rtt, bytes, cwnd_bytes] : clean_acked_packet_events)
2395 {
2396 FLOW_LOG_TRACE("cong_ctl [" << sock << "] update: clean individual acknowledgment: "
2397 "[" << sock->bytes_blocks_str(bytes) << "] with RTT [" << round<milliseconds>(rtt) <<
2398 "] and sent_cwnd_bytes [" << cwnd_bytes << "].");
2399
2400 sock->m_snd_cong_ctl->on_individual_ack(rtt, bytes, cwnd_bytes);
2401 }
2402
2403 FLOW_LOG_TRACE("cong_ctl/bw_est [" << sock << "] update: clean acknowledgments: "
2404 "[" << sock->bytes_blocks_str(clean_acked_bytes) << "] = "
2405 "[" << clean_acked_packets << "] packets.");
2406
2407 // Report the totality of (clean) acks to congestion control and bandwidth estimator.
2408 sock->m_snd_bandwidth_estimator->on_acks(clean_acked_bytes);
2409 sock->m_snd_cong_ctl->on_acks(clean_acked_bytes, clean_acked_packets);
2410 }
2411
2412 /* For debugging it can be useful to log socket state right after loss and handling everything.
2413 * Do so but only if the last time we so logged was some time ago; this is a CPU-intensive
2414 * operation.
2415 *
2416 * Also, register dropped data in snd_stats. */
2417 if (dropped_pkts != 0)
2418 {
2419 // Register that we have converted N bytes over M packets from In-flight to Dropped.
2420 snd_stats.dropped_data(dropped_bytes, dropped_pkts);
2421
2422 const seconds MIN_TIME_BETWEEN_LOGS(1);
2423 const Fine_duration since_last_loss_sock_log = Fine_clock::now() - m_last_loss_sock_log_when;
2424
2425 if (since_last_loss_sock_log > MIN_TIME_BETWEEN_LOGS)
2426 {
2427 FLOW_LOG_INFO("Will log socket state on loss, because last such loss-driven logging was "
2428 "[" << round<milliseconds>(since_last_loss_sock_log) << " >"
2429 " " << MIN_TIME_BETWEEN_LOGS << "] ago.");
2430 sock_log_detail(sock);
2431 m_last_loss_sock_log_when = Fine_clock::now();
2432 }
2433 else
2434 {
2435 FLOW_LOG_INFO("Will NOT log socket state on loss, because last such loss-driven logging was "
2436 "[" << round<milliseconds>(since_last_loss_sock_log) << " <="
2437 " " << MIN_TIME_BETWEEN_LOGS << "] ago.");
2438 }
2439 }
2440
2441 // Log the send window state after the above changes (if at least TRACE enabled).
2442 log_snd_window(sock);
2443
2444 /* Handle possible effect of above activities on the Drop Timer. (It may get disabled or restarted anew.)
2445 * Why not just do this right when we erase the associated packets from snd_flying_pkts*? Answer: We don't want to
2446 * trigger disruptive behavior like possibly retransmitting everything in the middle of all that accounting
2447 * which is not yet complete. Now it's complete, so it's the right time to handle this.
2448 *
2449 * Recall that snd_flying_pkts* have been updated and no longer contain the associated packets' info. */
2450
2451 const Drop_timer::Ptr drop_timer = sock->m_snd_drop_timer;
2452 drop_timer->start_contemporaneous_events();
2453
2454 for (const auto pkt_order_num : flying_now_acked_pkts)
2455 {
2456 drop_timer->on_ack(pkt_order_num);
2457 drop_timer->on_packet_no_longer_in_flight(pkt_order_num);
2458 }
2459 for (const auto pkt_order_num : pkts_marked_to_drop)
2460 {
2461 drop_timer->on_packet_no_longer_in_flight(pkt_order_num);
2462 }
2463
2464 drop_timer->end_contemporaneous_events();
2465
2466 /* As advertised, handle the rcv_wnd update: the latest ACK we are handling here contains the
2467 * latest info about the Receive buffer space on the other side that is available. */
2468 if (sock->m_snd_pending_rcv_wnd != sock->m_snd_remote_rcv_wnd)
2469 {
2470 FLOW_LOG_TRACE("Other side advertised "
2471 "rcv_wnd change [" << sock->m_snd_remote_rcv_wnd << "] => [" << sock->m_snd_pending_rcv_wnd << "].");
2472 sock->m_snd_remote_rcv_wnd = sock->m_snd_pending_rcv_wnd;
2473 /* Why have this intermediate m_snd_pending_rcv_wnd thing at all then? Answer: can_send(),
2474 * checked at the start of this method and saved into could_send_before_acks, uses the "before
2475 * handling the ACKs" state, which should not yet include the receive window update. Then
2476 * since we update m_snd_remote_rcv_wnd after that is computed, but before can_send() is
2477 * re-checked just below, we are able to see if the ACKs have changed can_send() from false to
2478 * true. */
2479
2480 /* Register whether after this window update, if we had a packet to send and no data In-flight,
2481 * we would be able to send at least one full DATA packet or not (i.e., can_send() would return
2482 * true). That is, register whether Receive window is ~0. */
2483 sock->m_snd_stats.updated_rcv_wnd(sock->m_snd_remote_rcv_wnd < sock->max_block_size());
2484 }
2485
2486 /* We've received ACKs and thus have quite likely reduced the number of bytes we
2487 * consider In-flight. Moreover we may have increased CWND. Moreover we may have added packets
2488 * to retransmit queue (if retransmission is on). Moreover we may have increased m_snd_remote_rcv_wnd.
2489 * Therefore can_send() may now return true while at the beginning of the method it returned
2490 * false; and similarly for snd_deqable(). So have send_worker() check and send more if possible.
2491 * See Node::send() for discussion of overall strategy on this topic. */
2492 if ((!could_send_before_acks) || (rexmit_on && (!had_rexmit_data_before_acks)))
2493 {
2494 send_worker(sock, true);
2495 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2496 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2497 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2498 }
2499} // Node::handle_accumulated_acks()
2500
2503 bool* dupe_or_late, Peer_socket::Sent_pkt_ordered_by_when_iter* acked_pkt_it)
2504{
2505 assert(dupe_or_late);
2506 assert(acked_pkt_it);
2507
2508 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
2509 * as of this writing. It figures out whether the given individual ack is invalid, valid but duplicate/late, or
2510 * valid and on-time. Results go into the return value and *dupe_or_late and *acked_pkt_it. */
2511
2512 /* Now to discuss what happens when an ACK is received, with a seemingly valid sequence number
2513 * (i.e., in [m_snd_init_seq_num + 1, m_snd_next_seq_num - 1] range) -- but the corresponding
2514 * packet is not in m_snd_flying_pkts_by_seq_num. What does this mean? One, unlikely, possibility is
2515 * that it's a fake/wrong acknowledgment, not pertaining to any packet we'd sent but in the range
2516 * of sequence numbers we did send (in other words, the sequence number is in the right range but
2517 * doesn't correspond to a first sequence number of a packet we'd really sent). Unfortunately we
2518 * have no way to detect that fully, since it's not in m_snd_flying_pkts_by_seq_num, and that's basically the only
2519 * place we store packet boundaries of sent packets. Suppose we eliminate that possibility.
2520 *
2521 * Then the only remaining possibility is that this acknowledgment is a duplicate of a previous
2522 * one, which had caused us to remove that packet from m_snd_flying_pkts_by_seq_num. So, how DO we handle
2523 * a duplicate acknowledgment? We already know they got the packet, as we've already measured RTT
2524 * from the previous copy of this ack, so there's nothing useful for us. Conclusion: ignore
2525 * duplicate acknowledgments.
2526 *
2527 * Note that the above discussion pertains to a dupe ack where both the sequence number and the
2528 * retransmission ID are the same as a previous one. If the retransmission ID is different (only
2529 * legal when retransmission is enabled), that's a different situation -- the acknowledgment is
2530 * not duplicate but rather acknowledging a different send attempt for the same-numbered packet.
2531 * That is less of a corner case and is handled below explicitly.
2532 *
2533 * Sent, unacknowledged packets are eventually considered Dropped. In terms of our data structures
2534 * they are handled just like acknowledged ones. Therefore, an acknowledgment of such a Dropped
2535 * packet may arrive. This is a "late" acknowledgment. It is treated just like a duplicate
2536 * acknowledgment (in fact, there is no way to tell them apart). (Note that a packet is still
2537 * considered Dropped even if retransmission is on -- it's just that in that case it's also queued
2538 * on the retransmission queue to be re-sent when possible.)
2539 *
2540 * Another caveat is that two acknowledgments that are duplicates of each other can get
2541 * mis-ordered and thus arrive in opposite order. Thus the one with the longer one-way time would
2542 * yield the higher RTT, while the shorter one would get ignored. However, RTT measurement is an
2543 * art, not a science, so this is acceptable.
2544 *
2545 * @todo Acknowledgments themselves could actually be identified with something other
2546 * than sequence numbers and retransmission IDs; e.g., with reflected sender time stamps. Then
2547 * one could do fancier stuff... but let's not overdo it for now. */
2548
2549 // For brevity and a little speed:
2550 const bool rexmit_on = sock->rexmit_on();
2551 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
2552 auto& snd_flying_pkts_by_seq = sock->m_snd_flying_pkts_by_seq_num;
2553 auto& snd_stats = sock->m_snd_stats;
2554
2555 // First sequence number in acknowledged packet.
2556 const Sequence_number& seq_num = ack->m_seq_num;
2557 // Retransmission ID (0 = first attempt, 1 = 1st retransmission, 2 = 2nd, ...).
2558 const unsigned int rexmit_id = ack->m_rexmit_id;
2559 assert(rexmit_on || (rexmit_id == 0)); // Should be guaranteed by deserialization.
2560
2561 // Register one individual acknowledgment of unknown # of bytes of data (may or may not be acceptable).
2562 snd_stats.received_ack();
2563
2564 /* Ensure it's within the range of sequence numbers we've already sent.
2565 * Note that this doesn't really guarantee its validity. It could be in that range but still
2566 * not correspond to any packet we'd actually sent. We try to detect that below. */
2567
2568 if (!util::in_open_open_range(sock->m_snd_init_seq_num, seq_num, sock->m_snd_next_seq_num))
2569 {
2570 /* Either the other side is an a-hole, or somehow a socket_id was reused from a recent
2571 * connection, which we do try to avoid like the plague. Therefore, send them an RST and
2572 * abort connection. If they send more data packets to this port (which is quite possible;
2573 * many could already be on the way), they'll get more RSTs still. */
2574
2575 // Interesting/rare enough to log a WARNING.
2576 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
2577 "Received [ACK]; "
2578 "acknowledgment [" << seq_num << ", ...) is outside (ISN, snd_next) "
2579 "range (" << sock->m_snd_init_seq_num << ", " << sock->m_snd_next_seq_num << ").");
2580
2581 // Register one individual acknowledgment of unknown # of bytes of data (not acceptable due to error).
2582 snd_stats.error_ack();
2583
2584 /* Close connection in our structures (inform user if necessary as well). Pre-conditions
2585 * assumed by call: sock in m_socks and sock->state() == S_OPEN (yes, since m_int_state ==
2586 * S_ESTABLISHED); 3rd arg contains the reason for the close (yes). This will empty the Send
2587 * and Receive buffers. That is OK, because this is the abrupt type of close (error). */
2588 rst_and_close_connection_immediately(socket_id, sock,
2590 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2591 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2592 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2593 return false; // Other out-params are meaningless.
2594 }
2595 // else within sane range.
2596
2597 // Check if the sequence number matches that of one of the packets we've sent and want acknowledged.
2598 *acked_pkt_it = snd_flying_pkts_by_when.find(seq_num);
2599 if (*acked_pkt_it == snd_flying_pkts_by_when.past_oldest()) // A/k/a end().
2600 {
2601 /* No such packet. Assuming no foul play/dumbassery, it's probably a duplicate acknowledgment
2602 * (i.e., we've already sent and got the ack, removing that packet from snd_flying_pkts*)
2603 * or a late acknowledgment (i.e., we've already sent and eventually considered Dropped the
2604 * packet, removing it from snd_flying_pkts*).
2605 *
2606 * There is a corner case if retransmission is on. Suppose we sent packet P, consider it
2607 * Dropped (removing it from snd_flying_pkts*), and thus we place it on retransmission
2608 * queue. Suppose there is not enough CWND space to send it right away, so while it's pending
2609 * on that queue, we now get a late ack for it. Ideally in this case we'd remember it was in
2610 * retransmission queue, remove it from there, and basically act as if we hadn't removed it
2611 * from snd_flying_pkts* and got the ack for it. Instead we're just going to ignore this
2612 * information and needlessly retransmit. So why do this? Answer: It is troublesome to
2613 * design and code this. The part where we wouldn't retransmit it is fairly straightforward
2614 * and is a nice @todo. However acting as if it was normally ACKed after all is complex; for
2615 * instance, since we thought it was Dropped, we already informed m_cong_ctl of the loss event
2616 * -- how can we undo that in a clean way? It does not seem worth it. Again, checking
2617 * and updating the retransmission queue, though, is a nice @todo (but would ideally need fast
2618 * lookup into that linked list so not totally trivial).
2619 *
2620 * So, let's say that the concession described in the previous paragraph is OK.
2621 *
2622 * Could also be invalid. We only know seq_num (one boundary of packet), so how do we detect
2623 * it's invalid? One case where we know it's invalid is if this left boundary happens to be
2624 * straddled by a sequence number range in an element of snd_flying_pkts_by_seq. That would mean
2625 * that the same sequence number is in two different packets, which is in no way legal.
2626 * Example: we sent [5, 10), then received ACK with [7, ...). 7 is inside [5, 10) and is
2627 * thus illegal. */
2628
2629 /* Here's the technique we use. snd_flying_pkts_by_seq.upper_bound(S) gets the first packet
2630 * [U1, U2) such that U1 > S. Let prev(P) denote the packet preceding P in
2631 * snd_flying_pkts_by_seq; let prev([U1, U2)) = [L1, L2). Note that [U1, U2) may not exist
2632 * -- i.e., nothing after S is in the map. If so, [U1, U2) == snd_flying_pkts_by_seq.end(). Even
2633 * in that case [L1, L2) = prev([U1, U2)) MAY still exist; it is the last element of
2634 * snd_flying_pkts_by_seq in that situation.
2635 *
2636 * Given that, here are all the situations that mean P is straddled by a packet:
2637 *
2638 * - S inside [U1, U2) or any packet after it.
2639 * - Impossible. U1 > S by definition; so S is not inside any packet at U1 or later.
2640 * - S inside [L1, L2).
2641 * - Possible. We know S > L1, since otherwise S <= L1, which means we can't be inside this
2642 * if (and we are), or snd_flying_pkts_by_seq.upper_bound(S) == [L1, L2) (not true, since
2643 * snd_flying_pkts_by_seq.upper_bound(S) == [U1, U2), which != [L1, L2)). So, since S > L1,
2644 * we must check for S < L2. If true, S is straddled.
2645 * - S inside some packet [K1, K2) before [L1, L2).
2646 * - Impossible. Suppose S is inside [K1, K2) immediately preceding [L1, L2). Then
2647 * snd_flying_pkts_by_seq.upper_bound(S) == [L1, L2). But we already know
2648 * snd_flying_pkts_by_seq.upper_bound(S) == [U1, U2) (which != [L1, L2)). So that's
2649 * impossible. Repeat this logic for all packets [K1, K2) preceding [L1, L2) to show that
2650 * it can't be straddled by any of those either.
2651 *
2652 * Therefore, S is straddled by a packet if and only if:
2653 * - prev(snd_flying_pkts_by_seq.upper_bound(S)) exists; call it [L1, L2); and
2654 * - S < L2.
2655 *
2656 * This can be further restated as:
2657 * - snd_flying_pkts_by_seq.upper_bound(S) != snd_flying_pkts_by_seq.begin(); and
2658 * - (letting [L1, L2) = prev(snd_flying_pkts_by_seq.upper_bound(S)))
2659 * S < L2.
2660 *
2661 * So check for that. */
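 /* Worked example of the straddle check below (hypothetical scoreboard): suppose snd_flying_pkts_by_seq
  * currently holds [5, 10) and [15, 20).
  *   - seq_num = 7: upper_bound(7) -> [15, 20); prev exists -> [L1, L2) = [5, 10); 7 < 10, so 7 is
  *     straddled ==> illegal ack ==> RST/close below.
  *   - seq_num = 12: upper_bound(12) -> [15, 20); prev -> [5, 10); 12 >= 10, so no straddle ==> treated
  *     as a duplicate/late (or undetectably invalid) ack and ignored further below.
  *   - seq_num = 3: upper_bound(3) -> [5, 10), which == begin(), so no prev exists ==> no straddle. */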
2662
2663 // Find U.
2664 Peer_socket::Sent_pkt_ordered_by_seq_const_iter pkt_it = snd_flying_pkts_by_seq.upper_bound(seq_num);
2665 // Check that prev(U) exists.
2666 if (pkt_it != snd_flying_pkts_by_seq.begin())
2667 {
2668 // prev(U) = L exists. Compute L.
2669 --pkt_it;
2670 // Compute [L1, L2), and check for straddling: S < L2. pkt_it->second points into snd_flying_pkts_by_when.
2671 Sequence_number l1, l2;
2672 get_seq_num_range(pkt_it->second, &l1, &l2);
2673
2674 assert(l1 < seq_num); // Sanity-check of upper_bound().
2675 if (seq_num < l2)
2676 {
2677 // Straddles. Other side is sending us bad stuff. As above, warn and RST/close.
2678
2679 // Register one individual acknowledgment of unknown # of bytes of data (not acceptable due to error).
2680 snd_stats.error_ack();
2681
2682 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
2683 "Received [ACK]; "
2684 "acknowledgment [" << seq_num << ", ...) is at least partially inside "
2685 "packet [" << l1 << ", " << l2 << ").");
2686 rst_and_close_connection_immediately(socket_id, sock, error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE, true);
2687 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2688 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2689 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2690 return false; // Other out-params are meaningless.
2691 }
2692 // else if (seq_num >= l2) { It's past [L1, L2); does not straddle. }
2693 }
2694 // else { Legit because there is no packet L that could possibly straddle seq_num. }
2695
2696 /* OK, so NOW do we know it's a duplicate/late acknowledgment? Well, no. Suppose we sent packet
2697 * [5, 10) and get ACK with [5, ...). That's fine. So we erase [5, 10) from
2698 * snd_flying_pkts_by_seq. Now say we get ACK with [7, ...). Well, that's in the
2699 * [m_snd_next_seq_num, m_snd_next_seq_num) range certainly; and it doesn't get straddled by
2700 * any member of snd_flying_pkts_by_seq. Yet it's certainly invalid: we never sent (and could've
2701 * never sent) [7, ...). We can't know that, however, since [5, 10) is gone from
2702 * snd_flying_pkts_by_seq. Is this OK? More or less, yes. What do we do with a duplicate/late
2703 * acknowledgment just below? We log and ignore it. That doesn't seem harmful. NORMALLY
2704 * when something is invalid we'd RST and close connection, but here we can't know we should
2705 * do that; however ignoring it still seems fine and better than doggedly inventing data
2706 * structures to detect this corner case.
2707 *
2708 * What about m_snd_cong_ctl? Should we report this in m_snd_cong_ctl->on_acks()?
2709 * No. on_acks() specifically documents that it wants info on
2710 * In-flight->Acknowledged acknowledgments, not duplicates. (Briefly,
2711 * that's because it's measuring sent data in the pipe; acknowledgment duplication has unclear
2712 * implications about what it's acknowledging; it is unlikely that it represents more pipe
2713 * being available than if only one acknowledgment had been received. In any case this should
2714 * hopefully be pretty rare and thus not too significant either way.)
2715 *
2716 * Same reasoning for not counting it in m_snd_bandwidth_estimator->on_acks(). */
2717
2718 // Per above discussion, ignore duplicate (or maybe invalid, but we can't know/assume that) acknowledgment.
2719
2720 // Register one individual acknowledgment of unknown # of bytes of data (late, dupe, or maybe invalid).
2721 snd_stats.late_or_dupe_ack();
2722
2723 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2724 "Acknowledged packet [" << seq_num << ", ...) is duplicate or late (or invalid). "
2725 "RTT unknown. Ignoring.");
2726
2727 // Ensure out-params indicating a dupe/late ack such that the packet being acked is not known.
2728 *dupe_or_late = true;
2729 assert(*acked_pkt_it == snd_flying_pkts_by_when.past_oldest()); // A/k/a end().
2730 return true;
2731 } // if (seq_num is not in snd_flying_pkts*) // i.e., duplicate/late acknowledgment with unknown acked packet.
2732 // else if (seq_num IS in snd_flying_pkts*): *acked_pkt_it points to snd_flying_pkts_by_when[seq_num].
2733 assert(*acked_pkt_it != snd_flying_pkts_by_when.past_oldest());
2734
2735 // It's an ack of sequence number we'd sent, but if retransmission is on it may not be of the one we LAST sent.
2736
2737 const Peer_socket::Sent_packet& acked_pkt = *((*acked_pkt_it)->second);
2738 const unsigned int acked_rexmit_id = rexmit_on ? acked_pkt.m_packet->m_rexmit_id : 0;
2739 Sequence_number seq_num_end; // Get sequence number just past last datum in packet.
2740 get_seq_num_range(*acked_pkt_it, 0, &seq_num_end);
2741
2742 // Note that both rexmit_id and acked_rexmit_id are guaranteed 0 at this point if !rexmit_on.
2743
2744 if (rexmit_id > acked_rexmit_id)
2745 {
2746 // This is entirely illegal. Can't acknowledge a packet copy we hadn't sent yet.
2747 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
2748 "Acknowledged packet [" << seq_num << ", " << seq_num_end << ") "
2749 "rexmit_id [" << int(rexmit_id) << "] "
2750 "exceeds highest sent rexmit_id [" << int(acked_rexmit_id) << "].");
2751 rst_and_close_connection_immediately(socket_id, sock, error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE, true);
2752 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2753 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2754 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2755 return false; // Other out-params are meaningless.
2756 }
2757 // else if (rexmit_id <= acked_rexmit_id)
2758
2759 if (rexmit_id != acked_rexmit_id)
2760 {
2761 assert(rexmit_id < acked_rexmit_id);
2762
2763 /* This is legal: it's possible we had sent packet P, considered it Dropped, retransmitted it
2764 * (thus incrementing rexmit_id), and have now received a late acknowledgment of the
2765 * PREVIOUS attempt to send P (before retransmission). We could actually consider this
2766 * entirely equivalent to simply getting the last attempt acked. In fact I specifically kept
2767 * an array for m_sent_when, so that we can even compute accurate RTT. Yet, at least for now,
2768 * I am going to ignore such an acknowledgment. Reasons:
2769 *
2770 * - The RTT may be an outlier affected by some random event; we considered it Dropped, so
2771 * if those heuristics are generally sound, getting a late ack is suspicious.
2772 *
2773 * - Suppose I do take the RTT and report to congestion control, use for SRTT computation,
2774 * and remove from snd_flying_pkts*. I've in effect recorded a loss but then also
2775 * reported a successful retransmission, even though the ack is not for the retransmission
2776 * but more like a correction on the original loss. That's potentially fine, but chances
2777 * are I will soon receive the ack for the latest transmission, which is what I was really
2778 * expecting. That one will now be considered a late ack and will be ignored, even though
2779 * that RTT is actually probably more accurate, since chances are it arrived before the
2780 * retransmission would've been considered Dropped as well. So, basically, we're kind of
2781 * trying to use the "two wrongs make a right" philosophy, which seems messy.
2782 *
2783 * - Earlier in the method, I mentioned that if we detect P as dropped and queue it for
2784 * retransmission but get P acked *before* we get a chance to retransmit, then we consider
2785 * that ack as late and ignore it (and will still retransmit P). The reasons for that are
2786 * given in that comment. However, given that we made that decision, it would seem
2787 * strange to follow a different philosophy just because we did happen to get to
2788 * retransmit P. That would be inconsistent.
2789 *
2790 * - Keeping it in perspective, it should be fairly rare that a packet we considered Dropped
2791 * is acked after all. So it is perhaps not worth the trouble to go crazy about this
2792 * corner case.
2793 *
2794 * Nevertheless, a @todo would be to experimentally measure the effect of this policy and
2795 * decide whether it is sound. In that case also consider the aforementioned "P is acked
2796 * after queued for retransmission but before retransmitted" corner case. */
2797
2798 // Register one individual acknowledgment of unknown # of bytes of data (late).
2799 snd_stats.late_or_dupe_ack();
2800
2801 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2802 "Acknowledged packet [" << seq_num << ", " << seq_num_end << ") "
2803 "order_num [" << acked_pkt.m_sent_when[rexmit_id].m_order_num << "] "
2804 "rexmit_id [" << int(rexmit_id) << "] "
2805 "is less than highest sent [" << int(acked_rexmit_id) << "]. Ignoring.");
2806
2807 // Ensure out-params indicating a dupe/late ack of a specific known sent packet.
2808 *dupe_or_late = true;
2809 assert(*acked_pkt_it != snd_flying_pkts_by_when.past_oldest()); // A/k/a end().
2810 return true;
2811 }
2812 // else
2813 assert(rexmit_id == acked_rexmit_id);
2814
2815 // Do not log this mainstream case; only the exceptions above. RTT will probably be logged separately.
2816
2817 // Register one individual acknowledgment of N bytes of data (converts from In-flight to Acknowledged).
2818 snd_stats.good_ack(acked_pkt.m_size);
2819
2820 // Ensure out-params indicating an in-time, first ack of a specific known sent packet.
2821 *dupe_or_late = false;
2822 assert(*acked_pkt_it != snd_flying_pkts_by_when.past_oldest()); // A/k/a end().
2823 return true;
2824} // Node::categorize_individual_ack()
2825
2827 const Fine_time_pt& time_now,
2829 const Peer_socket::Sent_packet::Sent_when** sent_when) const
2830{
2831 using boost::chrono::milliseconds;
2832 using boost::chrono::round;
2833
2834 Fine_duration round_trip_time;
2835
2836 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
2837 * as of this writing. It computes the RTT implied by the given individual ack and also returns the Sent_when
2838 * (which contains info on when the original packet was sent) structure as an out-param. */
2839
2840 /* RTT subtleties:
2841 *
2842 * How long did the other side, upon receiving the acked packet, wait before sending this
2843 * containing ACK with that individual acknowledgment? Why do we care? For RTT. Why do we
2844 * want RTT? To measure how long it takes for a sent packet to reach the receiver (one-way trip
2845 * time, or OWTT). Since measuring OWTT is quite hard/impossible due to lack of absolute clock
2846 * synchronization between us and the receiver, RTT/2 is used as the next best way to get OWTT.
2847 * We can measure RTT by subtracting our recorded packet send time from the current time (ACK
2848 * receipt time). However, the ACK delay introduced by the receiver to reduce ACK overhead has
2849 * nothing to do with OWTT; it just (randomly, from the other side's point of view) inflates the RTT.
2850 * Thus we subtract the ACK delay from the RTT to get the actual RTT we use for congestion control, etc. */
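 /* A tiny numeric illustration with invented values: say the acked DATA packet was sent at our local
  * time t = 1000ms, this ACK arrives at time_now = 1080ms, and the receiver reports it held the
  * individual acknowledgment for 30ms (ack_delay) before bundling it into this ACK. Then:
  *   round_trip_time = 1080ms - 1000ms - 30ms = 50ms; estimated OWTT ~= RTT/2 = 25ms.
  * Without subtracting ack_delay we would attribute the receiver's deliberate ack batching (30ms of the
  * raw 80ms) to the network path. */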
2851
2852 const unsigned int rexmit_id = ack->m_rexmit_id;
2853 // Get the RTT for the transmission attempt that is actually being acknowledged (always 0 if retransmission off).
2854 *sent_when = &(flying_pkt->m_sent_when[rexmit_id]);
2855 const Peer_socket::order_num_t order_num = (*sent_when)->m_order_num;
2856
2857 /* ?second-resolution value (ack_delay) subtracted from max-resolution values. If ack_delay is
2858 * also in the max-resolution time unit, then there is no loss of precision. Otherwise we lose
2859 * precision by subtracting a number with fewer significant digits from one with more
2860 * significant digits. So Ack_delay_time_unit should ideally be Fine_duration, for precise RTT
2861 * values (especially for queueing delay-based congestion control algorithms); however that
2862 * decision is discussed elsewhere (Low_lvl_packet). */
2863 const auto& ack_delay = ack->m_delay;
2864 round_trip_time = time_now - (*sent_when)->m_sent_time - ack_delay;
2865
2866 if (round_trip_time.count() < 0)
2867 {
2868 /* Because this combines measurements on both sides, and each may have some error (plus or
2869 * minus a few hundred microseconds, possibly), and the result can be quite close to zero in
2870 * extremely low-latency situations, this may come out to be negative. So assume zero and
2871 * log a TRACE message at most.
2872 *
2873 * @todo Should we put also a ceiling on the RTT?
2874 * @todo For the floor, maybe it's better to use a higher guess than zero? */
2875 FLOW_LOG_TRACE("Acknowledged packet [" << ack->m_seq_num << ", ...) "
2876 "order_num [" << order_num << "] has negative "
2877 "RTT [" << round_trip_time << "]; assuming zero. "
2878 "Sent at [" << (*sent_when)->m_sent_time << "]; "
2879 "received at [" << time_now << "]; "
2880 "receiver-reported ACK delay [" << ack_delay << "].");
2881 round_trip_time = Fine_duration::zero();
2882 }
2883 FLOW_LOG_TRACE("Acknowledged packet [" << ack->m_seq_num << ", ...) "
2884 "order_num [" << order_num << "] "
2885 "has RTT [" << round<milliseconds>(round_trip_time) << "] "
2886 "(ACK delay [" << round<milliseconds>(ack_delay) << "]).");
2887
2888 return round_trip_time;
2889} // Node::compute_rtt_on_ack()
2890
2893 const boost::unordered_set<Peer_socket::order_num_t>& flying_now_acked_pkts)
2894{
2895 using std::priority_queue;
2896
2897 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
2898 * as of this writing. The background is that once a set of individual acks has been processed in the sense that
2899 * sock->m_snd_flying_pkts* (which tracks In-flight outbound DATA packets) has been updated by removing the
2900 * acked packets (they are no longer In-flight), it's time to also recategorize certain further In-flight
2901 * packets as Dropped -- the intuition being that once N packets sent LATER than a given packet P have been
2902 * acked, it's highly probable that P has been Dropped by the network. This method determines the packets to drop
2903 * in that fashion.
2904 *
2905 * Now, as explained below, when ack set S causes packet set P' to be Dropped, this (possibly null) set P'
2906 * always has the following form: there is some particular packet P which is the most-recently-sent one
2907 * that is in P'; and therefore ALL other In-flight packets sent before P must be dropped too and also are in P'.
2908 * Thus P is necessary/sufficient to specify P'. Thus this method simply finds and returns a thing pointing to P. */
2909
2910 // For brevity and a little speed:
2911 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
2912
2913 /* OK, snd_flying_pkts* has been updated, in that we've removed any Sent_packet entries
2914 * corresponding to valid acknowledgments in this ACK. As promised elsewhere we should also update
2915 * the remaining Sent_packets' m_acks_after_me entries and erase any Sent_packets that we consider
2916 * Dropped due to too high m_acks_after_me values. (As in TCP Fast Retransmit/Recovery, an
2917 * unacknowledged packet is considered Dropped based on the heuristic that a few packets with
2918 * higher sequence numbers have been acknowledged. Except since we have Sent_when, that should be
2919 * even better than using sequence number ordering as TCP would.)
2920 *
2921 * (Warning: below and nearby, I make pseudo-code-y leaps, such as saying flying_now_acked_pkts stores
2922 * Sent_whens when really it stores order_nums; just bear with me by trusting that it makes the logic
2923 * easier to explain, and that the actual structure in code is sufficiently
2924 * similar to the wording here to not make a salient difference in practice.)
2925 *
2926 * Consider the two structures we have now. snd_flying_pkts_by_when (call
2927 * it F) is a collection of Sent_packets, each with Sent_packet::m_acks_after_me, ordered by decreasing
2928 * Sent_when. flying_now_acked_pkts (call it C) is an unordered collection that contains each Sent_when
2929 * (i.e., reference to a send-packet attempt) that has been ACKed. That is, flying_now_acked_pkts tells us
2930 * by Sent_when which exact send attempts from the past are acknowledged in this set of accumulated acks.
2931 *
2932 * Even less formally -- just for sanity's sake -- F are In-flight packets; C are just-acked packets that were
2933 * very recently in F. C may be interleaved among F if viewed in increasing Sent_when order:
2934 * e.g., [ F F F F C F C F C C ] (where F represents a still-In-flight send attempt, or an F element;
2935 * C a just-acked send attempt, thus a C element; and the order is from earlier/lower Sent_when to
2936 * later/higher Sent_when).
2937 *
2938 * Note that, conceptually, the key sets (Sent_when values) in F and C are disjoint,
2939 * since each send attempt has a unique Sent_when value (because it at least consists of a unique m_order_num).
2940 * How do we correctly yet efficiently increment m_acks_after_me (call it A) for each
2941 * element in F to represent the new ackage? First observe that if F[t].A is incremented by N, then
2942 * F[prev(t)].A should be incremented by N PLUS the number of acks for all packets sent at times in range
2943 * (prev(t), t), where prev(t) is the element of C with the next lower (earlier) Sent_when.
2944 * Consider the example scoreboard above, [ F F F F C F# C F* C C ]. F*.A is incremented by 2, because
2945 * plainly there are two Cs after it. Therefore, the preceding F, which is F#,
2946 * is also incremented by 2; plus another 1, because there is another C (recall, simply another acknowledgment)
2947 * between F# and F*. And so it goes for all the Fs. Side/sanity note: The range is (prev(t), t), not
2948 * [prev(t), t), simply because F and C are disjoint; and prev(t) by definition is in F (hence not in C, hence
2949 * no ack for that seq. #).
2950 *
2951 * This suggests a simple inductive algorithm, wherein the latest F element's F[t].A is incremented by I, which
2952 * is the count of C elements with Sent_when > t; memorize I; now for each progressively older F[t],
2953 * count C elements in (t, next(t)) and increment F[t].A by I += <that count>. Repeat until all Fs incremented.
2954 * Ultimately I = # of new valid, acknowledgments. (Recall: scoreboard cannot begin with any Cs, [C C ... ], as
2955 * such a C would be acking a non-In-flight send attempt, so a dupe, and we specifically eliminate dupes from
2956 * consideration before inserting into C.) So that's O(F.size()) increment operations.
2957 *
2958 * OK, but how do we get this "count of acks between t and next(t)"? Let t be the last element of
2959 * F. For it, that count is the count of all keys > t in C (i.e., the total # of acks for all
2960 * packets sent after t). Let the lowest such key (Sent_when value) be `s`. Now let t' = prev(t) as before.
2961 * For t', the count of acks sent in (t', t) is the count of all elements in C with keys
2962 * in (s', s), where s' is again the lowest key > t'. Having counted that, set s = s', and repeat for each key t' of F.
2963 *
2964 * Of course, for that to be practical, C would need to be sorted by Sent_when. Since in reality it's not sorted,
2965 * we could first sort it in O(k log k) operations, worst-case, k = C.size(). More convenient, however, is to
2966 * construct a priority queue (heap) from C; then keep popping the Sent_whens down to and
2967 * including s at each step. That's O(k) to make the heap and O(k log k) total time spent
2968 * popping it.
2969 *
2970 * The above explanation strikes me as somewhat cryptic, but hopefully the code will clarify it; I was just
2971 * trying to explain why the code works. */
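 /* Worked instance of the induction just described, using the example scoreboard above (hypothetical;
  * earliest -> latest Sent_when): [ F F F F C F# C F* C C ]. Walking newest -> oldest:
  *   - F*: two acked attempts (the trailing two Cs) were sent after it, so I = 2; F*.m_acks_after_me += 2.
  *   - F#: one more C lies between F# and F*, so I = 2 + 1 = 3; F#.m_acks_after_me += 3.
  *   - Each earlier F: one more C lies before F#, so I = 3 + 1 = 4; each gets m_acks_after_me += 4.
  * Total elements popped off the priority queue = 4 = number of new clean acks; hence the O(k log k)
  * bound mentioned above. */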
2972
2973 /* Make heap out of flying_now_acked_pkts; top()/pop() will return the element with the highest (latest) Sent_when.
2974 * Just store the Sent_when values directly in the heap; std::pair::operator<() will do
2975 * the right thing since no element's Sent_when equals another element's Sent_when (they were
2976 * stored in a uniquely-keyed dictionary in the first place).
2977 *
2978 * Let cur_sent_pkt be the element of snd_flying_pkts_by_sent_when we're currently
2979 * considering, and it starts at F.newest() and progresses accordingly through F.
2980 * Then, invariant: high_ack_count_q contains the acks for all send attempts P where
2981 * P.m_sent_when < cur_sent_pkt.m_sent_when. In particular, high_ack_count_q.top() < cur_sent_pkt.m_sent_when. */
2982 priority_queue<Peer_socket::order_num_t>
2983 high_ack_count_q(flying_now_acked_pkts.begin(), flying_now_acked_pkts.end());
2984
2985 // Invariant: this will be the m_acks_after_me increment applied to the just-considered packet in snd_flying_pkts*.
2986 using ack_count_t = Peer_socket::Sent_packet::ack_count_t;
2987 ack_count_t ack_increment_after_me = 0;
2988
2989 // As explained above, start with the first (latest send time) unacked packet and go forward (earlier and earlier).
2990 Peer_socket::Sent_pkt_ordered_by_when_iter last_dropped_pkt_it;
2991 for (last_dropped_pkt_it = snd_flying_pkts_by_when.newest();
2992 last_dropped_pkt_it != snd_flying_pkts_by_when.past_oldest();
2993 ++last_dropped_pkt_it) // Up to k repetitions.
2994 {
2995 Peer_socket::Sent_packet& cur_sent_pkt = *(last_dropped_pkt_it->second);
2996 const Peer_socket::Sent_packet::Sent_when& cur_pkt_sent_when = cur_sent_pkt.m_sent_when.back();
2997
2998 /* We will increment cur_sent_pkt.m_acks_after_me by ack_increment_after_me + X, where X is
2999 * the total number of acks for packets with send times between cur_pkt_sent_when and the
3000 * cur_pkt_sent_when in the last loop iteration (or infinity if this is the first loop
3001 * iteration). The high_ack_count_q invariant we maintain is that high_ack_count_q holds the
3002 * ack counts for all packets with Sent_when values EXCEPT those >= the previous
3003 * iteration's cur_pkt_sent_when. Therefore, we need only find all elements of high_ack_count_q
3004 * whose Sent_whens are > our cur_pkt_sent_when. Since high_ack_count_q.top() is always the ack
3005 * count with the highest sent_when in that structure (priority queue), we just pop and sum
3006 * until high_ack_count_q.top() < cur_pkt_sent_when. */
3007
3008 // We've just assigned cur_sent_pkt, breaking invariant; pop until it holds again.
3009 while ((!high_ack_count_q.empty()) &&
3010 // Compare order numbers -- they are always unique.
3011 (high_ack_count_q.top() > cur_pkt_sent_when.m_order_num))
3012 {
3013 // Found acked packet with sent_when > cur_pkt_sent_when (but < previous iteration's cur_pkt_sent_when).
3014 ++ack_increment_after_me; // So add that packet's ack.
3015
3016 // And remove it, bringing the next highest entry to the top. O(log k).
3017 high_ack_count_q.pop(); // Note this maintains the invariant that defines high_ack_count_q.
3018 }
3019 // Note we've maintained the invariant defining ack_increment_after_me.
3020
3021 // Hence this many more acks for packets after us have occurred within this ack set.
3022 cur_sent_pkt.m_acks_after_me += ack_increment_after_me;
3023
3024 if (cur_sent_pkt.m_acks_after_me > S_MAX_LATER_ACKS_BEFORE_CONSIDERING_DROPPED)
3025 {
3026 /* Ah ha! For this packet we've exceeded the limit -- we will consider it Dropped. What
3027 * about the next (meaning, earlier-sent) unacknowledged packets? Observe that packets with
3028 * earlier send times MUST (if we were to continue the loop in this manner) end up with
3029 * equal or larger cur_sent_pkt.m_acks_after_me. (Intuitively: any acknowledgment after
3030 * packet P is also after any packet preceding P in the sent_when ordering.) Therefore, we
3031 * can break out of the loop and consider Dropped ALL packets from last_dropped_pkt_it to
3032 * snd_flying_pkts_by_when.past_oldest(). Yay! */
3033
3034 auto const logger_ptr = get_logger();
3035 if (logger_ptr && logger_ptr->should_log(log::Sev::S_TRACE, get_log_component()))
3036 {
3037 Sequence_number cur_pkt_seq_num, cur_pkt_seq_num_end;
3038 get_seq_num_range(last_dropped_pkt_it, &cur_pkt_seq_num, &cur_pkt_seq_num_end);
3039
3041 ("Unacknowledged packet [" << cur_pkt_seq_num << ", " << cur_pkt_seq_num_end << ") "
3042 "order_num [" << cur_pkt_sent_when.m_order_num << "] has "
3043 "had [" << cur_sent_pkt.m_acks_after_me << "] acknowledgments "
3044 "for later packets; considering it and "
3045 "all unacknowledged packets sent earlier as Dropped.");
3046 }
3047
3048 break;
3049 }
3050 // else
3051
3052 // ack_increment_after_me and high_ack_count_q invariants hold, so the next iteration can proceed.
3053 } // for (all elements in snd_flying_pkts_by_when, in decreasing m_sent_when order: newest -> oldest)
3054
3055 return last_dropped_pkt_it;
3056} // Node::categorize_pkts_as_dropped_on_acks()
3057
3059 const Peer_socket::Sent_pkt_ordered_by_when_iter& last_dropped_pkt_it,
3060 size_t* cong_ctl_dropped_pkts, size_t* cong_ctl_dropped_bytes,
3061 size_t* dropped_pkts, size_t* dropped_bytes,
3062 std::vector<Peer_socket::order_num_t>* pkts_marked_to_drop)
3063{
3064 // using boost::next; // Still ambiguous for some reason (in clang at least).
3065
3066 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
3067 * as of this writing. The background is that once a set of individual acks has been processed in the sense that
3068 * sock->m_snd_flying_pkts* (which tracks In-flight outbound DATA packets) has been updated by removing the
3069 * acked packets (they are no longer In-flight), it's time to also recategorize certain further In-flight
3070 * packets as Dropped -- the intuition being that once N packets sent LATER than a given packet P have been
3071 * acked, it's highly probable that P has been Dropped by the network. This method does that (dropping
3072 * all such packets P) and certain related tasks such as tracking the associated loss event(s) for congestion
3073 * control.
3074 *
3075 * Now, as explained elsewhere, when ack set S causes packet set P' to be Dropped, this (possibly null) set P'
3076 * always has the following form: there is some particular packet P which is the most-recently-sent one
3077 * that is in P'; and therefore ALL other In-flight packets sent before P must be dropped too and also are in P'.
3078 * Thus P is necessary/sufficient to specify P'. The last_dropped_pkt_it argument points to P and is determined
3079 * elsewhere and used by this helper. */
3080
3081 // For brevity and a little speed:
3082 const bool rexmit_on = sock->rexmit_on();
3083 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3084 auto& snd_stats = sock->m_snd_stats;
3085
3086 /* Pre-condition: all elements starting with (inclusive) last_dropped_pkt_it (within
3087 * snd_flying_pkts_by_when) should be considered Dropped. If last_dropped_pkt_it ==
3088 * snd_flying_pkts_by_when.past_oldest() a/k/a end(), then none should be considered
3089 * Dropped (i.e., no m_acks_after_me became high enough).
3090 *
3091 * Given that, we have a number of tasks remaining:
3092 *
3093 * 1. Count the total # of packets and bytes now considered Dropped and pass this to congestion control.
3094 * Omit those packets/bytes heuristically determined to belong to a loss event detected in an earlier
3095 * call, namely those for which m_sent_when < m_snd_last_loss_event_when.
3096 * 2. (If retransmission is enabled) Queue those Dropped packets for retransmission in retransmission queue.
3097 * 3. Erase the Dropped packets from snd_flying_packets*.
3098 *
3099 * For (non-asymptotic) performance, ideally we want to traverse snd_flying_pkts_by_when just once,
3100 * computing what's needed for these. Drilling down a bit:
3101 *
3102 * (2) and (3) are simple and involve walking over the Dropped range that has been computed (pre-condition above)
3103 * and adding-elsewhere or erasing those elements, respectively, though (2) must be done in chronological order
3104 * (increasing Sent_when).
3105 *
3106 * (1) is a matter of walking in anti-chronological (decreasing Sent_when) order over that same range, until
3107 * a certain Sent_when threshold is found, and stopping there.
3108 *
3109 * Thus, the kitchen-sink algorithm emerges: walk through Dropped range in decreasing Sent_when order, so
3110 * from last_dropped_pkt_it along snd_flying_pkts_by_when. Accumulate bytes/packets for (1), but stop
3111 * accumulating once m_snd_last_loss_event_when is reached w/r/t m_sent_when. Erase from snd_flying_pkts*
3112 * (carefully, since we are walking along one of them), for (3). And add to the retransmission queue, but in
3113 * reverse order versus the walking order, for (2). */
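 /* A small concrete walk-through with invented numbers: suppose snd_flying_pkts_by_when holds 6 packets
  * totaling 6000 bytes, and last_dropped_pkt_it points at the 3rd-newest one; then that packet plus the
  * 3 older ones (4 packets, 4000 bytes, say) are to be Dropped. "Before" = (6 pkts, 6000 bytes) is
  * recorded just below; after the erasing loop, "after" = (2 pkts, 2000 bytes); the subtraction yields
  * dropped_pkts = 4, dropped_bytes = 4000. Of those, only the ones sent since the last recorded loss
  * event (or all of them, if there has been none) count toward cong_ctl_dropped_*, i.e. toward the NEW
  * loss event reported to congestion control. */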
3114
3115 *dropped_pkts = snd_flying_pkts_by_when.size(); // We will just compute the final value by subtracting "after."
3116 *dropped_bytes = sock->m_snd_flying_bytes; // Ditto.
3117
3118 *cong_ctl_dropped_bytes = 0;
3119 *cong_ctl_dropped_pkts = 0;
3120 bool loss_event_finished = false;
3121
3122 /* We want to add to retransmission queue (if retransmission is on). We also want to traverse
3123 * snd_flying_pkts_by_when in forward newest->oldest order (for convenience and also to efficiently compute
3124 * cong_ctl_dropped_*). However we want to retransmit in reverse order (oldest->newest). So we
3125 * put the packets to retransmit in the latter order into snd_rexmit_q, at the end of the latter.
3126 * So, if it was [ABC], and we dropped [DEF], then we want to insert to yield [ABCFED] (ABC->ABCFED).
3127 * list<>::insert(it, v) will insert `v` before *it and return iterator to just-inserted element.
3128 * So we can memorize the latter and pass it in as `it` in the next insert(), rinse, repeat.
3129 * In the above example: ABC->ABC(D)->ABC(E)D->ABC(F)ED. // () is inserted element.
3130 * ^ ^ ^ // ^ is "fulcrum": insertion point for insertion following next ->.
3131 *
3132 * snd_rexmit_q_fulcrum_it, the insertion point, is so named due to being the "fulcrum" between the old and
3133 * new parts of snd_rexmit_q. History: Used to use a local new list<> here which would be spliced onto
3134 * the real queue at the end; but IMO this is more elegant (and probably a bit speedier). */
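 /* Minimal sketch of the insert-at-fulcrum trick with a plain std::list<char> (purely illustrative;
  * not the actual queue type): start with q = [A B C] and fulcrum = q.end(); the walk below encounters
  * dropped packets newest -> oldest, say D, then E, then F:
  *   fulcrum = q.insert(fulcrum, 'D'); // q = [A B C D], fulcrum -> D
  *   fulcrum = q.insert(fulcrum, 'E'); // q = [A B C E D], fulcrum -> E
  *   fulcrum = q.insert(fulcrum, 'F'); // q = [A B C F E D], fulcrum -> F
  * std::list::insert() places the new element before *fulcrum and returns an iterator to it, so the
  * dropped packets end up appended in oldest -> newest retransmission order after the pre-existing
  * entries, as in the ABC -> ABCFED example above. */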
3135 auto& snd_rexmit_q = sock->m_snd_rexmit_q;
3136 decltype(sock->m_snd_rexmit_q)::iterator snd_rexmit_q_fulcrum_it = snd_rexmit_q.end();
3137
3138 // We are to fill this up, so it should not have anything yet.
3139 assert(pkts_marked_to_drop->empty());
3140
3141 auto pkt_it = last_dropped_pkt_it;
3142 while (pkt_it != snd_flying_pkts_by_when.past_oldest())
3143 {
3144 // We can't just ++pkt_it later on, because we are going to erase() at pkt_it soon, invalidating it.
3145 auto next_pkt_it = boost::next(pkt_it);
3146 // Now see end of loop body.
3147
3148 // Accumulate stuff for passing into congestion control at the end.
3149
3150 const Peer_socket::Sent_packet::Ptr sent_pkt = pkt_it->second;
3151 const Peer_socket::Sent_packet::Sent_when& sent_when = sent_pkt->m_sent_when.back();
3152
3153 if (!loss_event_finished)
3154 {
3155 if (// This packet is part of a PREVIOUS loss event if: there has been a loss event before this...
3156 (sock->m_snd_last_loss_event_when != Fine_time_pt())
3157 // ...AND this packet was sent before that event was detected.
3158 && (sent_when.m_sent_time < sock->m_snd_last_loss_event_when))
3159 {
3160 /* This is the first packet encountered to be part of a previous loss event. If
3161 * retransmission is off, this will also cause the loop to exit. */
3162 loss_event_finished = true;
3163 }
3164 else
3165 {
3166 // Only got here if this packet and all Dropped packets after it are part of a new loss event.
3167 *cong_ctl_dropped_bytes += sent_pkt->m_size;
3168 ++(*cong_ctl_dropped_pkts);
3169 }
3170 }
3171 // else { Already found end of new loss event, if any, so no need to keep looking for it. }
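    /* Concretely (made-up times): suppose the last loss event was detected at t=100, and the Dropped packets,
     * walked newest->oldest, were sent at t=107, t=103, t=96, t=92. The first two count toward the NEW loss
     * event (cong_ctl_dropped_*); at t=96 we hit a packet sent before t=100, so it -- and everything older --
     * belongs to the PREVIOUS loss event, and we stop accumulating (loss_event_finished = true). */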
3172
3173 // Add to retransmission queue if applicable.
3174
3175 if (rexmit_on)
3176 {
3177 if (!ok_to_rexmit_or_close(sock, pkt_it, true)) // Ensure not too many retransmissions already.
3178 /* ^-- defer_delta_check == true: because the only way to get to this method is from
3179 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
3180 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
3181 {
3182 return false; // Already closed/logged/etc.
3183 }
3184 // else
3185
3186 /* Save a ref-counted pointer (to what includes packet data) in retransmission queue. We'll soon remove such
3187 * a pointer from snd_flying_pkts*, lowering the ref-count again. In other words, we are moving the sent-packet
3188 * object from snd_flying_pkts* to snd_rexmit_q (Dropped -> In-flight).
3189 *
3190 * Insert at the same position each time to ultimately arrange them in the reversed order that we want. */
3191 snd_rexmit_q_fulcrum_it = snd_rexmit_q.insert(snd_rexmit_q_fulcrum_it, sent_pkt);
3192 ++sock->m_snd_rexmit_q_size;
3193 }
3194
3195 /* Finally, we can erase it from snd_flying_pkts* and adjust snd_flying_bytes.
3196 * Will NOT invalidate other iterators into snd_flying_pkts_by_when.
3197 *
3198     * Also, save its order number in the pkts_marked_to_drop out-param, as advertised. */
3199
3200 static_assert
3202 "Scoreboard must not get otherwise changed when a packet is erased.");
3203 pkts_marked_to_drop->push_back(sent_when.m_order_num);
3204 snd_flying_pkts_erase_one(sock, pkt_it);
3205
3206 pkt_it = next_pkt_it;
3207 } // while (pkt_it != snd_flying_pkts_by_when.past_oldest())
3208
3209 // Includes ALL Dropped packets (not just ones from new loss event, if any), so != cong_ctl_dropped_pkts.
3210 *dropped_pkts -= snd_flying_pkts_by_when.size(); // Subtract "after" from "before" to get dropped count.
3211 *dropped_bytes -= sock->m_snd_flying_bytes; // Similar.
3212
3213 if (*cong_ctl_dropped_pkts != 0)
3214 {
3215 // Register that we've detected a NEW loss event (not the same as dropped_data() -- see that elsewhere).
3216 snd_stats.loss_event();
3217 }
3218
3219 return true;
3220} // Node::drop_pkts_on_acks()
3221
3223{
3224 using boost::algorithm::join;
3225 using boost::chrono::symbol_format;
3226 using std::string;
3227 using std::vector;
3228 using std::transform;
3229 using std::ostream;
3230
3231 // We are in thread W.
3232
3233 // This helper of handle_accumulated_acks() just logs the individual acks about to be processed.
3234
3235 // For brevity and a little speed:
3236 using Ack = Ack_packet::Individual_ack;
3237 using Acks = vector<Ack::Ptr>;
3238 const Acks& acked_packets = sock->m_rcv_acked_packets;
3239
3240 auto const logger_ptr = get_logger();
3241 if (logger_ptr && logger_ptr->should_log(log::Sev::S_DATA, get_log_component())) // Very verbose and slow!
3242 {
3243    // Prepare serialization of m_rcv_acked_packets for DATA-level logging; quite verbose and slow!
3244 vector<string> ack_strs(acked_packets.size());
3245 transform(acked_packets.begin(), acked_packets.end(), ack_strs.begin(),
3246 [](Ack::Const_ptr ack) -> string
3247 {
3248 return util::ostream_op_string('[', ack->m_seq_num, ", ", int(ack->m_rexmit_id), ", ",
3249 symbol_format,
3250 ack->m_delay, ']'); // "ns," not "nanoseconds."
3251 });
3252 const string ack_str = join(ack_strs, " ");
3253
3254 FLOW_LOG_DATA_WITHOUT_CHECKING("NetFlow worker thread working on [" << sock << "]. "
3255 "Accumulated [ACK] packets with "
3256 "acknowledgments [seq_num, rexmit_id, delay]: "
3257 "[" << ack_str << "].");
3258 } // if (DATA)
3259 else
3260 {
3261 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
3262 "Accumulated [ACK] packets with "
3263 "[" << acked_packets.size() << "] individual acknowledgments.");
3264 }
3265
3266 if (sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED)
3267 {
3268 log_snd_window(sock);
3269 }
3270 // else { Why is this possible? Answer: See handle_accumulated_acks() for explanation near similar check. }
3271} // Node::log_accumulated_acks()
3272
3273void Node::drop_timer_action(Peer_socket::Ptr sock, bool drop_all_packets)
3274{
3275 using std::list;
3276 using boost::prior;
3277
3278 // We are in thread W.
3279
3280 // Since we call m_snd_drop_timer->done() when exiting ESTABLISHED, this should hold.
3281 assert(sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED);
3282
3283 // For brevity and a bit of speed:
3284 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3285 auto& snd_flying_pkts_by_seq = sock->m_snd_flying_pkts_by_seq_num;
3286
3287 // Timer must not be running if there are no In-flight packets. Thus it should not have fired.
3288 assert(!snd_flying_pkts_by_when.empty());
3289
3290 /* Drop Timer fired and is telling us to consider Dropped some packets. If drop_all_packets, then
3291   * it's all of them. Otherwise it's just the earliest-sent unacknowledged packet
3292   * (the last element of m_snd_flying_pkts_by_sent_when, whose iteration order is newest-first). */
3293
3294 // Log details of the In-flight packets before we change things.
3295 log_snd_window(sock);
3296
3297 const bool rexmit_on = sock->rexmit_on();
3298 // To check, at the end, whether we've changed can_send() false => true.
3299 const bool could_send_before_drops = can_send(sock);
3300 // To check, at the end, whether we've changed snd_deqable() false => true.
3301 const bool had_rexmit_data_before_drops = !sock->m_snd_rexmit_q.empty();
3302 // Will store ID of the one packet to drop; reserved value 0 will mean ALL packets are dropped.
3303 Peer_socket::order_num_t packet_marked_to_drop_or_drop_all;
3304
3305 // Used below for congestion control.
3306 size_t cong_ctl_dropped_bytes = 0;
3307 size_t cong_ctl_dropped_pkts = 0;
3308
3309 if (drop_all_packets)
3310 {
3311 cong_ctl_dropped_bytes = sock->m_snd_flying_bytes;
3312 cong_ctl_dropped_pkts = snd_flying_pkts_by_when.size();
3313
3314 // Queue them for retransmission, to be sent as soon as CWND provides enough space (could even be immediately).
3315 if (rexmit_on)
3316 {
3317 // Order is from earliest-sent to latest-sent (retransmission in the same order as transmission).
3318 for (Peer_socket::Sent_pkt_by_sent_when_map::Reverse_iterator pkt_it = snd_flying_pkts_by_when.oldest();
3319 pkt_it != snd_flying_pkts_by_when.past_newest();
3320 ++pkt_it)
3321 {
3322 // The forward iterator F pointing to same list element as reverse iterator R is prior(R.base()). Google it.
3323 if (!ok_to_rexmit_or_close(sock, prior(pkt_it.base()), false)) // Ensure not too many retransmissions already.
3324 /* ^-- defer_delta_check == false: because we were invoked from a timer event. Therefore, we will NOT perform
3325 * event_set_all_check_delta(false) before the boost.asio handler exits. Therefore boost.asio
3326 * may sleep (block) before event_set_all_check_delta(false). Therefore that would delay
3327 * delivery of the event to the user. Therefore force the delta check immediately. See
3328 * Node::m_sock_events doc header for details. */
3329 {
3330 return; // Already closed/logged/etc.
3331 }
3332 // else
3333
3334 sock->m_snd_rexmit_q.push_back(pkt_it->second); // Only a ref-counted pointer copy (constant time).
3335 }
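      /* Re the prior(R.base()) dance above -- a minimal standalone illustration (hypothetical values):
       *   std::list<int> l{10, 20, 30};
       *   auto r = l.rbegin();              // Reverse iterator; *r == 30.
       *   auto f = boost::prior(r.base());  // Forward iterator to the SAME element; *f == 30.
       * That is, R.base() points one past R's element, so stepping back once lands on it. */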
3336 sock->m_snd_rexmit_q_size += cong_ctl_dropped_pkts;
3337 }
3338 // else { Just drop it. }
3339
3340 // Update our image of the pipe. For efficiency we use clear() instead of doing it one-by-one above.
3341
3342 // Update byte count.
3343 snd_flying_pkts_updated(sock, snd_flying_pkts_by_when.newest(), snd_flying_pkts_by_when.past_oldest(), false);
3344 snd_flying_pkts_by_when.clear();
3345 snd_flying_pkts_by_seq.clear();
3346
3347 packet_marked_to_drop_or_drop_all = 0; // Means drop all.
3348 }
3349 else
3350 {
3351 // Get the packet that was sent before all the others.
3352 const Peer_socket::Sent_pkt_ordered_by_when_iter& oldest_pkt_it = prior(snd_flying_pkts_by_when.past_oldest());
3353 Peer_socket::Sent_packet::Ptr oldest_pkt = oldest_pkt_it->second;
3354
3355 cong_ctl_dropped_bytes = oldest_pkt->m_size;
3356 cong_ctl_dropped_pkts = 1;
3357
3358 // Queue it for retransmission, to be sent as soon as CWND provides enough space (could even be immediately).
3359 if (rexmit_on)
3360 {
3361 if (!ok_to_rexmit_or_close(sock, oldest_pkt_it, false)) // Ensure not too many retransmissions already.
3362 // ^-- false <= Same as comment above.
3363 {
3364 return; // Already closed/logged/etc.
3365 }
3366 // else
3367
3368 sock->m_snd_rexmit_q.push_back(oldest_pkt); // Only a ref-counted pointer copy (constant time).
3369 ++sock->m_snd_rexmit_q_size;
3370 }
3371 // else { Just drop it. }
3372
3373 // Remember it short-term for the Drop_timer consolidated book-keeping below...
3374 packet_marked_to_drop_or_drop_all = oldest_pkt->m_sent_when.back().m_order_num;
3375
3376 // ...and in fact mark that packet Dropped (update our image of the pipe).
3377 snd_flying_pkts_erase_one(sock, oldest_pkt_it);
3378 }
3379
3380 /* Deal with congestion control. For introduction to the general topic see the large comment
3381 * near the top of handle_accumulated_acks().
3382 *
3383 * Since a Drop Timeout implies a large loss event, the congestion control module must be
3384 * informed. It may adjust the congestion window (used in can_send() and controlling how many
3385 * packets we are allowed to have In-flight at a time), probably downward.
3386 *
3387 * Also, this is a new loss event. Why? (For detailed explanation of what a loss event is, and
3388 * how we keep track of them, see that large comment in handle_accumulated_acks(). It
3389 * may be required to understand the rest of this paragraph.) Certainly this Drop is part of some
3390 * loss event by definition, but is it a new loss event, or merely the previous one (if such
3391 * exists)? Well, a Drop Timeout is, in practice, at least 1 second (which is likely 4 times a
3392 * pretty large RTT of 250 msec) and can also be estimated to be 3 * SRTT. In other words it is
3393 * probably much larger than SRTT, and certainly is at least a little larger than SRTT. Therefore
3394 * most likely any packet(s) Dropped by this DTO were sent after the last loss event (if any) was
3395 * detected. Hence this DTO event is a new loss event. We could explicitly check for this, but
3396 * it seems unnecessarily complex and intuitively unnecessary.
3397 *
3398 * Per handle_accumulated_acks(), when a new loss event is seen, m_snd_last_loss_event_when
3399 * is set to NOW. */
3400
3401 // @todo Arguable if it should be INFO or TRACE. We'll see.
3402 FLOW_LOG_INFO("cong_ctl [" << sock << "] update: Drop Timeout event: "
3403 "Dropped [" << cong_ctl_dropped_bytes << "] bytes = [" << cong_ctl_dropped_pkts << "] packets.");
3404
3405 // MUST call this after, not before, updating m_snd_flying_{packets|bytes} per method doc.
3406 sock->m_snd_cong_ctl->on_drop_timeout(cong_ctl_dropped_bytes, cong_ctl_dropped_pkts);
3407 sock->m_snd_last_loss_event_when = Fine_clock::now();
3408
3409 // Register that there was a timeout, and that bytes were converted from In-flight to Dropped.
3410 sock->m_snd_stats.drop_timeout();
3411 sock->m_snd_stats.dropped_data(cong_ctl_dropped_bytes, cong_ctl_dropped_pkts);
3412
3413 // Now log the "after."
3414 log_snd_window(sock);
3415
3416 // Since we've changed snd_flying_pkts*, Drop_timer events have occurred. Cleanly handle them all in one go.
3417
3418 const Drop_timer::Ptr drop_timer = sock->m_snd_drop_timer;
3419 drop_timer->start_contemporaneous_events();
3420
3421 /* Handle possible effect of above activities on the Drop Timer. (It may get disabled or restarted anew.)
3422 * Why not just do this right when we erase the associated packets from snd_flying_pkts*? We don't want to
3423 * trigger disruptive behavior like possibly retransmitting everything in the middle of all that accounting
3424 * which is not yet complete. Now it's complete, so it's the right time to handle this.
3425 *
3426 * Recall that snd_flying_pkts* have been updated and no longer contain the associated packet(s)'s info. */
3427 if (packet_marked_to_drop_or_drop_all == 0)
3428 {
3429 // Note that this is equivalent to calling ...packet_no_longer_in_flight(P) for all P -- just faster.
3430 drop_timer->on_no_packets_in_flight_any_longer();
3431 }
3432 else // if (packet_marked_to_drop_or_drop_all refers to, in fact, a specific packet)
3433 {
3434 drop_timer->on_packet_no_longer_in_flight(packet_marked_to_drop_or_drop_all);
3435 /* Could also call on_no_packets_in_flight_any_longer() if now none is In-flight, but performance-wise that'd
3436 * be the same; ...packet_no_longer_in_flight() will check the same condition anyway. So don't bother. */
3437 }
3438
3439 drop_timer->end_contemporaneous_events();
3440
3441 /* We've definitely reduced the number of packets we consider In-flight. We may also have added
3442 * packets to retransmission queue (if retransmission is on). Therefore can_send() may now return
3443 * true while at the beginning of the method it returned false; snd_deqable() may now return true
3444 * similarly. So have send_worker() check and send more if possible. See Node::send() for
3445 * discussion of overall strategy on this topic. */
3446 if ((!could_send_before_drops) || (rexmit_on && (!had_rexmit_data_before_drops)))
3447 {
3448 send_worker(sock, false);
3449 // ^-- defer_delta_check == false: for similar reason as in send_worker_check_state() calling send_worker().
3450 }
3451} // Node::drop_timer_action()
3452
3454{
3455 using std::min;
3456 using std::max;
3457 using boost::ratio;
3458 using boost::ratio_subtract;
3459 using boost::ratio_string;
3460 using boost::chrono::round;
3461 using boost::chrono::milliseconds;
3462 using boost::chrono::microseconds;
3463 using boost::chrono::seconds;
3464
3465 // We are in thread W.
3466
3467 // For brevity and a bit of speed:
3468 Fine_duration& srtt = sock->m_snd_smoothed_round_trip_time;
3469 Fine_duration& rtt_var = sock->m_round_trip_time_variance;
3470 Fine_duration& dto = sock->m_snd_drop_timeout;
3471 const Fine_duration& rtt = round_trip_time;
3472
3473 /* An ACK has supplied the given round_trip_time for a specific packet. We are to update the
3474 * smoothed RTT for the socket which is an estimate for the smooth "current" RTT for the socket.
3475 * Use RFC 6298 algorithm for SRTT calculation.
3476 *
3477 * RFC 6298 specifies the formula in "seconds." Of course it need not be seconds; it can be any
3478 * unit. We leave the unit we use unspecified, except to say that we will use the unit of
3479 * Fine_duration, which is the duration type of Fine_clock, which is the highest-resolution clock
3480 * available in the OS/hardware. Since, where possible, we keep using Fine_duration without
3481 * truncation to compute round_trip_time, assuming we don't introduce any unit conversions
3482 * (truncations, roundings) in the below code, the SRTT will maintain those units as well.
3483 * boost::chrono::duration will specifically cause compile failures if we don't explicitly specify
3484 * every truncation-inducing operation (duration_cast<>, round<>, etc.).
3485 *
3486 * BTW, this "unspecified" unit is probably nanoseconds.
3487 *
3488 * Note that the units used do NOT guarantee any particular clock granularity. E.g., I can give
3489 * you the time in milliseconds, but if I always say it in multiples of 1000 milliseconds, then I
3490 * may be working with milliseconds, but the resolution is 1 sec. */
3491
3492 if (srtt == Fine_duration::zero())
3493 {
3494 // First RTT measurement; initialize according to algorithm.
3495 srtt = rtt;
3496 rtt_var = rtt / 2;
3497
3498 // Truncate results to millisecond representation for readability.
3499 FLOW_LOG_TRACE("First SRTT calculation for [" << sock << "]: "
3500 "srtt = [" << round<milliseconds>(srtt) << " = " << srtt << "]; "
3501 "rtt_var = [" << round<milliseconds>(rtt_var) << " = " << rtt_var << "]; "
3502 "rtt = [" << rtt << "].");
3503 }
3504 else // if (SRTT was defined before this sample.)
3505 {
3506 // Subsequent RTT measurements.
3507
3508 // @todo Per last paragraph of RFC 6298-5, we MAY want to clear srtt/rtt_var afer multiple RTOs or maybe idleness.
3509 // (RTO = Retransmission Timeout, though we call it a Drop Timeout more accurately [we don't necessarily
3510 // retransmit on loss in NetFlow, unlike TCP].)
3511
3512 const Fine_duration prev_srtt = srtt;
3513 const Fine_duration prev_rtt_var = rtt_var;
3514
3515 /* Reason I used ratio<> instead of floating point constants: I don't want to use floating
3516 * points in production code that much. I don't necessarily trust it for consistent behavior across platforms...
3517 * and in general I just find integers more predictable/easier to reason about in most contexts of net_flow.
3518 * Reason I used ratio<> instead of just having separate integer constants for numerators and
3519 * denominators: I'd rather have ratio<> do the arithmetic for me (at compile time to boot!). */
3520 using Alpha = ratio<1, 8>; // 1/8, per RFC.
3521 using One_minus_alpha = ratio_subtract<ratio<1>, Alpha>;
3522 using Beta = ratio<1, 4>; // 1/4, per RFC.
3523 using One_minus_beta = ratio_subtract<ratio<1>, Beta>;
3524 // Now I can use X::num and X::den, such that X is the ratio X::num/X::den.
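    // (For instance, One_minus_alpha above works out to ratio<7, 8> at compile time: num == 7, den == 8.)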
3525
3526 // Compute |srtt - rtt|.
3527 Fine_duration abs_srtt_minus_rtt = srtt - rtt;
3528 if (abs_srtt_minus_rtt.count() < 0)
3529 {
3530 abs_srtt_minus_rtt = -abs_srtt_minus_rtt;
3531 }
3532
3533 // Update the results per RFC.
3534 rtt_var
3535 = rtt_var * One_minus_beta::num / One_minus_beta::den
3536 + abs_srtt_minus_rtt * Beta::num / Beta::den;
3537 srtt
3538 = srtt * One_minus_alpha::num / One_minus_alpha::den
3539 + rtt * Alpha::num / Alpha::den;
3540
3541 // Truncate results to millisecond representation for readability.
3542 FLOW_LOG_TRACE("Next SRTT calculation for [" << sock << "]: "
3543 "srtt = [" << round<milliseconds>(srtt) << " = " << srtt << "]; "
3544 "rtt_var = [" << round<milliseconds>(rtt_var) << " = " << rtt_var << "]; "
3545 "rtt = [" << rtt << "]; "
3546 "prev_srtt = [" << prev_srtt << "]; "
3547 "prev_rtt_var = [" << prev_rtt_var << "]; "
3548 "alpha = " << (ratio_string<Alpha, char>::prefix()) << "; "
3549 "(1 - alpha) = " << (ratio_string<One_minus_alpha, char>::prefix()) << "; "
3550 "beta = " << (ratio_string<Beta, char>::prefix()) << "; "
3551 "(1 - beta) = " << (ratio_string<One_minus_beta, char>::prefix()) << "; "
3552 "|srtt - rtt| = [" << abs_srtt_minus_rtt << "].");
3553 } // else if (SRTT was defined before this sample)
3554
3555 /* Now compute Drop Timeout (DTO), similar to TCP's RTO (Retransmission Timeout): the minimum
3556 * amount of time we give an In-flight packet to get Acknowledged before considering it Dropped.
3557 * Again we use RFC 6298 for DTO computation.
3558 *
3559 * The formula is DTO = srtt + max(G, K * rtt_var), where K = 4 and G is the "clock
3560 * granularity." Additionally, we are to put a floor of 1 second on DTO. Finally, we are allowed
3561 * to put a ceiling on DTO, as long as that ceiling is at least 60 seconds.
3562 *
3563   * G plays an important part in the RTO calculation algorithm, so we must know it. So what is it?
3564 * We don't know. We do however have a reasonably conservative upper bound; boost.timer
3565 * documentation lists some popular OS+CPU combinations and notes that for none of them does
3566 * high_resolution_timer exceed 5 microseconds. Therefore, let us pick the exceedingly
3567   * conservative G = 500 microseconds = 1/2 millisecond. */
3568
3569 const Fine_duration clock_resolution_at_least = microseconds(500);
3570 const Fine_duration floor = seconds(1);
3571 const Fine_duration ceiling = sock->opt(sock->m_opts.m_dyn_drop_timeout_ceiling);
3572 const unsigned int k = 4;
3573
3574 const Fine_duration prev_dto = dto;
3575 const Fine_duration rtt_var_k = rtt_var * k;
3576 const Fine_duration srtt_plus_var_term = srtt + max(clock_resolution_at_least, rtt_var_k);
3577 dto = max(srtt_plus_var_term, floor);
3578 dto = min(dto, ceiling);
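  /* Quick worked example (made-up round numbers): with prior srtt = 100 ms, prior rtt_var = 20 ms, and a new
   * rtt sample of 140 ms: |srtt - rtt| = 40 ms; rtt_var becomes (3/4)*20 + (1/4)*40 = 25 ms; srtt becomes
   * (7/8)*100 + (1/8)*140 = 105 ms. Then srtt + max(G, 4 * rtt_var) = 105 + 100 = 205 ms, which the 1-second
   * floor raises to 1000 ms (and the ceiling leaves alone, assuming it is configured to at least that). */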
3579
3580 // Truncate results to millisecond representation for readability.
3581 FLOW_LOG_TRACE("Drop Timeout (DTO) calculation: "
3582 "dto = [" << round<milliseconds>(dto) << " = " << dto << "]; "
3583 "rtt_var * k = [" << rtt_var_k << "]; "
3584 "srtt + max(G, rtt_var * k) = [" << srtt_plus_var_term << "]; "
3585 "k = [" << k << "]; "
3586 "floor = [" << floor << "]; ceiling = [" << ceiling << "]; "
3587 "clock_resolution = [" << clock_resolution_at_least << "]; "
3588 "prev_dto = [" << prev_dto << "].");
3589} // void Node::new_round_trip_time_sample()
3590
3591void Node::log_snd_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging) const
3592{
3593 using std::vector;
3594 using std::list;
3595 using std::string;
3596 using boost::algorithm::join;
3597 using boost::prior;
3599 using std::flush;
3600
3601 // We're in thread W.
3602
3603 // For brevity and a little speed:
3604 const auto& snd_flying_pkts_by_seq = sock->m_snd_flying_pkts_by_seq_num;
3605 const auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3606 const size_t num_flying_pkts = snd_flying_pkts_by_seq.size();
3607
3608 // force_verbose_info_logging => log the most detail, as INFO (if INFO logging enabled).
3609
3610 if (snd_flying_pkts_by_seq.empty())
3611 {
3612 // No In-flight packets, so this is brief enough for TRACE as opposed to DATA.
3613 FLOW_LOG_WITH_CHECKING(force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_TRACE,
3614 "Send window state for [" << sock << "]: cong_wnd "
3615 "[" << sock->bytes_blocks_str(sock->m_snd_cong_ctl->congestion_window_bytes()) << "]; "
3616 "sent+acked/dropped "
3617 "[" << sock->m_snd_init_seq_num << ", " << sock->m_snd_next_seq_num << ") "
3618 "unsent [" << sock->m_snd_next_seq_num << ", ...).");
3619 return;
3620 }
3621 // else
3622
3623 auto const logger_ptr = get_logger();
3624  if (((!logger_ptr) || (!logger_ptr->should_log(log::Sev::S_DATA, get_log_component()))) &&
3625      (!(force_verbose_info_logging && logger_ptr && logger_ptr->should_log(log::Sev::S_INFO, get_log_component()))))
3626 {
3627 // Can't print entire In-flight data structure, but can print a summary, if TRACE enabled.
3629 ("Send window state for [" << sock << "]: cong_wnd "
3630 "[" << sock->bytes_blocks_str(sock->m_snd_cong_ctl->congestion_window_bytes()) << "]; "
3631 "sent+acked/dropped [" << sock->m_snd_init_seq_num << ", " << snd_flying_pkts_by_seq.begin()->first << ") "
3632 "in-flight [" << sock->m_snd_flying_bytes << "] bytes: " << num_flying_pkts << ":{...} "
3633 "unsent [" << sock->m_snd_next_seq_num << ", ...).");
3634 return;
3635 }
3636 // else
3637
3638 // Very verbose and slow!
3639
3640 const bool rexmit_on = sock->rexmit_on();
3641
3642 vector<string> pkt_strs;
3643 pkt_strs.reserve(num_flying_pkts);
3644 for (Peer_socket::Sent_pkt_ordered_by_seq_const_iter pkt_it_it = snd_flying_pkts_by_seq.begin();
3645 pkt_it_it != snd_flying_pkts_by_seq.end();
3646 ++pkt_it_it)
3647 {
3648 Sequence_number start, end;
3649 get_seq_num_range(pkt_it_it->second, &start, &end);
3650
3651 Peer_socket::Sent_packet::Const_ptr sent_pkt = pkt_it_it->second->second;
3652
3653 String_ostream pkt_str_os;
3654 pkt_str_os.os() << '[' << start;
3655 if (rexmit_on)
3656 {
3657 pkt_str_os.os() << '[' << int(sent_pkt->m_packet->m_rexmit_id) << '/' << sent_pkt->m_sent_when.back().m_order_num
3658 << "], ";
3659 }
3660 else
3661 {
3662 pkt_str_os.os() << ", ";
3663 }
3664 pkt_str_os.os() << end << ")<" << sent_pkt->m_acks_after_me << "acks" << flush;
3665
3666 pkt_strs.push_back(pkt_str_os.str());
3667 }
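  /* So each entry renders along the lines of "[100[0/7], 1100)<2acks" with retransmission on -- i.e.,
   * [first_seq[rexmit_id/order_num], past_last_seq)<acks_after_me -- or "[100, 1100)<2acks" with it off.
   * (The numbers here are made up purely for illustration.) */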
3668
3670 (force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_DATA,
3671 "Send window state for [" << sock << "]: cong_wnd "
3672 "[" << sock->bytes_blocks_str(sock->m_snd_cong_ctl->congestion_window_bytes()) << "]; "
3673 "sent+acked/dropped [" << sock->m_snd_init_seq_num << ", " << snd_flying_pkts_by_seq.begin()->first << ") "
3674 "in-flight "
3675 "[" << sock->m_snd_flying_bytes << "] bytes: " << num_flying_pkts << ":{" << join(pkt_strs, " ") <<
3676 "} unsent [" << sock->m_snd_next_seq_num << ", ...).");
3677
3678 if (!rexmit_on)
3679 {
3680 return;
3681 }
3682 // else
3683
3684 // Since retransmission is on, also useful to show the packets sorted by when they were sent.
3685
3686 vector<string> pkt_strs_time;
3687 pkt_strs_time.reserve(num_flying_pkts);
3688  // Note: `auto` is deliberately not used here, purely for clarity (to make explicit that this is a reverse iterator; hence also no range-`for`).
3689 for (Peer_socket::Sent_pkt_by_sent_when_map::Const_reverse_iterator pkt_it = snd_flying_pkts_by_when.const_oldest();
3690 pkt_it != snd_flying_pkts_by_when.const_past_newest();
3691 ++pkt_it)
3692 {
3693 Sequence_number start, end;
3694 // The forward iterator F pointing to same list element as reverse iterator R is prior(R.base()) [sic]. Google it.
3695 get_seq_num_range(prior(pkt_it.base()), &start, &end);
3696
3697 Peer_socket::Sent_packet::Const_ptr sent_pkt = pkt_it->second;
3698
3699 string pkt_str;
3701 start, '[', int(sent_pkt->m_packet->m_rexmit_id), '/',
3702 sent_pkt->m_sent_when.back().m_order_num, "], ", end, ")<",
3703 sent_pkt->m_acks_after_me, "acks");
3704 pkt_strs_time.push_back(pkt_str);
3705 }
3706
3707 // Log it only if it is different (only possible if some retransmitted packets are actually involved).
3708 if (pkt_strs_time != pkt_strs)
3709 {
3711 (force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_DATA,
3712 "Sorted by time sent: {" << join(pkt_strs_time, " ") << "}.");
3713 }
3714} // Node::log_snd_window()
3715
3717{
3718 using boost::prior;
3719
3720 const Peer_socket::Sent_pkt_by_seq_num_map& flying_packets = sock->m_snd_flying_pkts_by_seq_num;
3721 if (flying_packets.empty())
3722 {
3723 return Sequence_number(); // Default value. Less than all others.
3724 }
3725 // else
3726
3727  // Get the sequence number of the first datum in the last (highest-sequence-number) In-flight packet.
3728 const Peer_socket::Sent_pkt_by_seq_num_map::value_type& highest_val = *(prior(flying_packets.end()));
3729 Sequence_number seq_num = highest_val.first;
3730
3731 // Advance just past the data in that packet to get what we want.
3732 advance_seq_num(&seq_num, highest_val.second->second->m_size);
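  // E.g. (made-up numbers): if the highest-sequence In-flight packet starts at 1000 and carries 500 bytes of
  // data, the value returned is 1500 -- one past the last In-flight datum.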
3733
3734 return seq_num;
3735}
3736
3738{
3739 // using boost::next; // Still ambiguous for some reason (in clang at least).
3740
3741 auto const logger_ptr = get_logger();
3742 if (logger_ptr && logger_ptr->should_log(log::Sev::S_TRACE, get_log_component()))
3743 {
3744 const Peer_socket::Sent_packet& sent_pkt = *pkt_it->second;
3745 const Peer_socket::order_num_t order_num = sent_pkt.m_sent_when.back().m_order_num;
3746 Sequence_number seq_num, seq_num_end;
3747 get_seq_num_range(pkt_it, &seq_num, &seq_num_end);
3748
3749 if (sock->rexmit_on())
3750 {
3752 ("On [" << sock << "] erasing packet [" << seq_num << ", " << seq_num_end << ") "
3753 "order_num [" << order_num << "] rexmit_id [" << int(sent_pkt.m_packet->m_rexmit_id) << "] from "
3754 "snd_flying_pkts* and friends.");
3755 }
3756 else
3757 {
3759 ("On [" << sock << "] erasing packet [" << seq_num << ", " << seq_num_end << ") "
3760 "order_num [" << order_num << "] from snd_flying_pkts* and friends.");
3761 }
3762 }
3763
3764 // Update byte count.
3765 snd_flying_pkts_updated(sock, pkt_it, boost::next(pkt_it), false);
3766
3767 // Finally erase from main structures.
3768 sock->m_snd_flying_pkts_by_seq_num.erase(pkt_it->first);
3769 sock->m_snd_flying_pkts_by_sent_when.erase(pkt_it);
3770
3771  // Note: As advertised, we do NOT inform sock->m_snd_drop_timer. It is up to the caller to do the right thing there.
3772}
3773
3775 const Sequence_number& seq_num,
3777{
3778 using std::pair;
3779 using std::make_pair;
3780 // using boost::next; // Still ambiguous for some reason (in clang at least).
3781
3782 // For brevity and a bit of speed:
3783 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3784
3785#ifndef NDEBUG
3786 const auto insert_result =
3787#endif
3788 snd_flying_pkts_by_when.insert(make_pair(seq_num, sent_pkt));
3789
3790 // In this map, last added (a/k/a last sent) packet = first in the ordering!
3791 const Peer_socket::Sent_pkt_ordered_by_when_iter& pkt_it = snd_flying_pkts_by_when.begin();
3792 assert(insert_result.second); // Sequence numbers must not repeat ever.
3793 assert(insert_result.first == pkt_it); // Check that just-inserted element is ordered at the start.
3794
3795 snd_flying_pkts_updated(sock, pkt_it, boost::next(pkt_it), true); // Update byte count.
3796
3797 // Accordingly, insert packet (in the form of iterator into the above map) into sequence-number-ordered "scoreboard."
3798#ifndef NDEBUG
3799 const auto insert_result_by_seq =
3800#endif
3801 sock->m_snd_flying_pkts_by_seq_num.insert(make_pair(seq_num, pkt_it));
3802
3803 // Check invariant: Key X is in ..._by_sent_when <=> key X is in ..._by_seq_num.
3804 assert(insert_result_by_seq.second);
3805
3806 /* Caution: As noted in the doc header for this method, note that while we've already inserted sent_pkt into
3807 * snd_flying_pkts_by_when, the actual value of sent_pkt->m_sent_when.back() -- the absolute "when" -- isn't ready.
3808 * It will only be finalized once we actually send off the packet (after pacing, if any), in mark_data_packet_sent().
3809 * Nevertheless, we know the packet will be sent sometime fairly soon; and in fact AFTER all the packets
3810   * following it in snd_flying_pkts_by_when's iterator ordering and in fact BEFORE any packets that
3811 * would be subsequently ahead of it in snd_flying_pkts_by_when's iterator ordering. That is, we can
3812   * place it there now, despite not knowing the _absolute_ time when it will be sent, because we are confident about
3813 * its _relative_ order of when it will be sent vs. all the other packets in that structure, past or future. */
3814
3815 // Everything following this point is logging only.
3816
3817 auto const logger_ptr = get_logger();
3818 if ((!logger_ptr) || (!logger_ptr->should_log(log::Sev::S_TRACE, get_log_component())))
3819 {
3820 return;
3821 }
3822 // else
3823
3824 Sequence_number seq_num_end;
3825 get_seq_num_range(pkt_it, 0, &seq_num_end);
3826 if (sock->rexmit_on())
3827 {
3829 ("On [" << sock << "] pushing packet [" << seq_num << ", " << seq_num_end << ") "
3830 "rexmit_id [" << int(sent_pkt->m_packet->m_rexmit_id) << "] onto snd_flying_pkts and friends.");
3831 }
3832 else
3833 {
3835 ("On [" << sock << "] pushing packet [" << seq_num << ", " << seq_num_end << ") "
3836 "onto snd_flying_pkts and friends.");
3837 }
3838}
3839
3843 bool added)
3844{
3845 // We are in thread W.
3846
3847 if (pkt_begin == pkt_end)
3848 {
3849 return; // Wouldn't do anything anyway, but return here to avoid logging.
3850 }
3851
3852 // For brevity and a bit of speed:
3853 const auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3854 size_t& snd_flying_bytes = sock->m_snd_flying_bytes;
3855
3856 // Optimization for when they effectively clear() snd_flying_pkts* (e.g., possibly on Drop Timeout):
3857 if ((!added)
3858 && (pkt_begin == snd_flying_pkts_by_when.const_newest())
3859 && (pkt_end == snd_flying_pkts_by_when.const_past_oldest()))
3860 {
3861 snd_flying_bytes = 0;
3862 }
3863 else
3864 {
3865 size_t delta_bytes = 0;
3866 for ( ; pkt_begin != pkt_end; ++pkt_begin)
3867 {
3868 delta_bytes += pkt_begin->second->m_size;
3869 }
3870 added ? (snd_flying_bytes += delta_bytes) : (snd_flying_bytes -= delta_bytes);
3871 }
3872
3873 FLOW_LOG_TRACE("cong_ctl [" << sock << "] update: "
3874 "In-flight [" << sock->bytes_blocks_str(snd_flying_bytes) << "].");
3875}
3876
3879 bool defer_delta_check)
3880{
3881 const Peer_socket::Sent_packet& pkt = *pkt_it->second;
3882
3883 Sequence_number seq_num, seq_num_end;
3884 get_seq_num_range(pkt_it, &seq_num, &seq_num_end);
3885
3886 const unsigned int rexmit_id = pkt.m_packet->m_rexmit_id;
3887 FLOW_LOG_TRACE("On [" << sock << "] attempting to queue for retransmission "
3888 "[" << seq_num << ", " << seq_num_end << "] which has been "
3889 "retransmitted [" << rexmit_id << "] times so far.");
3890 if (rexmit_id == sock->opt(sock->m_opts.m_st_max_rexmissions_per_packet))
3891 {
3892 rst_and_close_connection_immediately(socket_id(sock), sock,
3894 return false;
3895 }
3896 // else
3897 return true;
3898}
3899
3901 const Peer_socket_options* opts)
3902{
3903 return connect_with_metadata(to, boost::asio::buffer(&S_DEFAULT_CONN_METADATA, sizeof(S_DEFAULT_CONN_METADATA)),
3904 err_code, opts);
3905}
3906
3908 const boost::asio::const_buffer& serialized_metadata,
3909 Error_code* err_code,
3910 const Peer_socket_options* sock_opts)
3911{
3912 FLOW_ERROR_EXEC_AND_THROW_ON_ERROR(Peer_socket::Ptr, connect_with_metadata, to, serialized_metadata, _1, sock_opts);
3913 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
3914
3917
3918 // We are in thread U != W.
3919
3920 if (!running())
3921 {
3923 return Peer_socket::Ptr();
3924 }
3925 // else
3926
3927 // If it's good enough for DATA packets, it's good enough for metadata in SYN.
3928 if (serialized_metadata.size() > max_block_size())
3929 {
3931 return Peer_socket::Ptr();
3932 }
3933
3934 /* Put the rest of the work into thread W. For justification, see big comment in listen().
3935 * Addendum regarding performance: connect() is probably called more frequently than listen(), but
3936 * I doubt the performance impact is serious even so. send() and receive() might be a different
3937 * story. */
3938
3939 Peer_socket::Ptr sock;
3940 /* Load this->connect_worker(...) onto thread W boost.asio work queue.
3941 * We don't return until it finishes; therefore it is fine to do total & capture. */
3942 asio_exec_ctx_post(get_logger(), &m_task_engine, Synchronicity::S_ASYNC_AND_AWAIT_CONCURRENT_COMPLETION,
3943 [&]() { connect_worker(to, serialized_metadata, sock_opts, &sock); });
3944 // If got here, the task has completed in thread W and signaled us to that effect.
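  /* (For intuition, that synchronous-post pattern is roughly equivalent to this hand-rolled sketch --
   * hypothetical and simplified; the real helper also handles logging and other details:
   *   std::promise<void> done;
   *   boost::asio::post(m_task_engine,
   *                     [&]() { connect_worker(to, serialized_metadata, sock_opts, &sock); done.set_value(); });
   *   done.get_future().wait(); // Capturing by reference is safe: we block right here until completion.
   * ) */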
3945
3946 // connect_worker() indicates success or failure through this data member.
3947 if (sock->m_disconnect_cause)
3948 {
3949 *err_code = sock->m_disconnect_cause;
3950 return Peer_socket::Ptr(); // sock will go out of scope and thus will be destroyed.
3951 }
3952 // else
3953 err_code->clear();
3954 return sock;
3955} // Node::connect_with_metadata()
3956
3957void Node::connect_worker(const Remote_endpoint& to, const boost::asio::const_buffer& serialized_metadata,
3958 const Peer_socket_options* sock_opts,
3959 Peer_socket::Ptr* sock_ptr)
3960{
3961 using boost::asio::buffer;
3962 using boost::asio::ip::address;
3963
3964 assert(sock_ptr);
3965
3966 // We are in thread W. connect() is waiting for us to set *sock_ptr and return.
3967
3968 // Create new socket and set all members that may be immediately accessed by user in thread U after we're done.
3969
3970 auto& sock = *sock_ptr;
3971 if (sock_opts)
3972 {
3973 /* They provided custom per-socket options. Before we give those to the new socket, let's
3974 * validate them (for proper values and internal consistency, etc.). */
3975
3976 Error_code err_code;
3977 const bool opts_ok = sock_validate_options(*sock_opts, 0, &err_code);
3978
3979 // Due to the advertised interface of the current method, we must create a socket even on error.
3980 sock.reset(sock_create(*sock_opts));
3981
3982 // Now report error if indeed options were invalid. err_code is already set and logged in that case.
3983 if (!opts_ok)
3984 {
3985 sock->m_disconnect_cause = err_code;
3986 return;
3987 }
3988 // else
3989 }
3990 else
3991 {
3992 /* More typically, they did not provide per-socket options. So we just pass our global
3993 * template for the per-socket options to the Peer_socket constructor. The only caveat is
3994 * that template may be concurrently changed, so we must lock it. Could do it with opt(), but
3995 * that introduces an extra copy of the entire struct, so just do it explicitly.
3996 *
3997 * Note: no need to validate; global options (including per-socket ones) are validated
3998 * elsewhere when set. */
3999 Peer_socket* sock_non_ptr;
4000 {
4002 sock_non_ptr = sock_create(m_opts.m_dyn_sock_opts);
4003 }
4004 sock.reset(sock_non_ptr);
4005 }
4006
4007 // Socket created; set members.
4008
4009 sock->m_active_connect = true;
4010 sock->m_node = this;
4012 sock->m_remote_endpoint = to;
4013 // Will be sent in SYN to be deserialized by user on the other side. Save here if we must retransmit SYN.
4014 sock->m_serialized_metadata.assign_copy(serialized_metadata);
4015
4016 /* Initialize the connection's send bandwidth estimator (object that estimates available
4017 * outgoing bandwidth based on incoming acknowledgments). It may be used by m_snd_cong_ctl,
4018 * depending on the strategy chosen, but may be useful in its own right. Hence it's a separate
4019 * object, not inside *m_snd_cong_ctl. */
4020 sock->m_snd_bandwidth_estimator.reset(new Send_bandwidth_estimator(get_logger(), sock));
4021
4022 // Initialize the connection's congestion control strategy based on the configured strategy.
4023 sock->m_snd_cong_ctl.reset
4024 (Congestion_control_selector::create_strategy(sock->m_opts.m_st_cong_ctl_strategy, get_logger(), sock));
4025 // ^-- No need to use opt() yet: user doesn't have socket and cannot set_options() on it yet.
4026
4027 /* Tweak: If they specify the "any" IP address as the destination (which means any interface on
4028 * this machine), response traffic will look as though it's coming from the loopback IP address,
4029 * or another specific IP address -- not "any." Thus it will not be able to be properly
4030 * demultiplexed to this socket, since that will be saved at the "any" address in our data
4031 * structures. So that's an error. */
4032 bool ip_addr_any_error = false;
4033 const address& addr = to.m_udp_endpoint.address(); // Short-hand.
4034 if (addr.is_v4())
4035 {
4036 if (addr.to_v4() == util::Ip_address_v4::any())
4037 {
4038 ip_addr_any_error = true;
4039 }
4040 }
4041 else if (addr.is_v6())
4042 {
4043 if (addr.to_v6() == util::Ip_address_v6::any())
4044 {
4045 ip_addr_any_error = true;
4046 }
4047 }
4048 // else a new version of IP! Yay!
4049 if (ip_addr_any_error)
4050 {
4051 // Mark/log error.
4052 Error_code* err_code = &sock->m_disconnect_cause;
4054 return;
4055 }
4056 // else
4057
4058 // Allocate ephemeral local port.
4059
4060 sock->m_local_port = m_ports.reserve_ephemeral_port(&sock->m_disconnect_cause);
4061 if (sock->m_local_port == S_PORT_ANY)
4062 {
4063 // Error already logged and is in sock->m_disconnect_cause.
4064 return;
4065 }
4066 // else
4067
4068 const Socket_id socket_id = Node::socket_id(sock);
4069 FLOW_LOG_INFO("NetFlow worker thread starting active-connect of [" << sock << "].");
4070
4071 if (util::key_exists(m_socks, socket_id))
4072 {
4073    /* This is an active connect (we're initiating the connection). Therefore in particular it
4074 * should be impossible that our local_port() equals an already existing connection's
4075 * local_port(); Port_space is supposed to prevent the same ephemeral port from being handed out
4076 * to more than one connection. Therefore this must be a programming error. */
4077
4078 FLOW_LOG_WARNING("Cannot add [" << sock << "], because such a connection already exists. "
4079 "This is an ephemeral port collision and "
4080 "constitutes either a bug or an extremely unlikely condition.");
4081
4082 // Mark/log error.
4083 Error_code* err_code = &sock->m_disconnect_cause;
4085
4086 // Return port.
4087 Error_code return_err_code;
4088 m_ports.return_port(sock->m_local_port, &return_err_code);
4089 assert(!return_err_code);
4090
4091 return;
4092 } // if (that socket pair already exists)
4093 // else
4094
4095 /* Try the packet send just below again if SYN not acknowledged within a certain amount of time.
4096 * Give up if that happens too many times. Why do this BEFORE sending packet? Because
4097 * this can fail, in which case we don't want a weird situation where we've sent
4098 * the packet but failed to start the retransmit/timeout timers.
4099 * Update: It can no longer fail, so that reasoning is N/A. Not moving, though, because it's still fine here. */
4100 setup_connection_timers(socket_id, sock, true);
4101
4102 /* Initial Sequence Number (ISN) (save before create_syn() uses it).
4103 * Remember it in case we must retransmit the SYN. (m_snd_next_seq_num may have been further increased by then.) */
4104 Sequence_number& init_seq_num = sock->m_snd_init_seq_num;
4105 init_seq_num = m_seq_num_generator.generate_init_seq_num();
4106 /* Setting this now ensures ALL subsequent copies (essentially, every single Sequence_number on this socket's
4107 * local data number line!) will have the same nice metadata (hence nice logging) too.
4108 * The `+ 1` nuance is explained in class Sequence_number doc header, *Metadata* section. */
4109 init_seq_num.set_metadata('L', init_seq_num + 1, sock->max_block_size());
4110 // Sequence number of first bit of actual data.
4111 sock->m_snd_next_seq_num = init_seq_num + 1;
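  // E.g., if the generated ISN happens to be 1000, then the SYN itself logically occupies 1000, and the first
  // byte of actual data will be numbered 1001. (Value made up for illustration.)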
4112
4113 // Make a SYN packet to send.
4114 auto syn = create_syn(sock);
4115
4116 // Fill out common fields and asynchronously send packet.
4117 async_sock_low_lvl_packet_send_paced(sock, Low_lvl_packet::ptr_cast(syn));
4118
4119 /* send will happen asynchronously, and the registered completion handler will execute in this
4120 * thread when done (NO SOONER than this method finishes executing). */
4121
4122  // No more errors: map socket pair to the socket data structure (kind of analogous to a TCP net-stack's TCB structure).
4123 m_socks[socket_id] = sock;
4124
4125 // CLOSED -> SYN_SENT.
4126 sock_set_int_state(sock, Peer_socket::Int_state::S_SYN_SENT);
4127} // Node::connect_worker()
4128
4130 const Peer_socket_options* sock_opts)
4131{
4132 return sync_connect_with_metadata(to, Fine_duration::max(),
4133 boost::asio::buffer(&S_DEFAULT_CONN_METADATA, sizeof(S_DEFAULT_CONN_METADATA)),
4134 err_code, sock_opts);
4135}
4136
4138 const boost::asio::const_buffer& serialized_metadata,
4139 Error_code* err_code, const Peer_socket_options* opts)
4140{
4141 return sync_connect_with_metadata(to, Fine_duration::max(), serialized_metadata, err_code, opts);
4142}
4143
4145 const boost::asio::const_buffer& serialized_metadata,
4146 Error_code* err_code, const Peer_socket_options* sock_opts)
4147{
4149 to, max_wait, serialized_metadata, _1, sock_opts);
4150 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
4151
4152 // We are in thread U != W.
4153
4154 /* This is actually pretty simple. All we want to do is connect(), which is non-blocking, and
4155 * then block until the connection is ready (at least according to our side). Ready means that
4156 * the socket is Writable (since user has no access to the socket yet, nothing can be loading
4157 * data onto the Send buffer, and obviously the congestion window is clear, so it must be
4158 * Writable). Note that, like BSD sockets, we specifically don't consider a socket Writable
4159 * until in ESTABLISHED internal state. */
4160
4161 /* For the "block until Writable" part, create and load the Event_set. Do this before connect(),
4162 * so that if it fails we don't have to then clean up the socket before returning error to user. */
4163
4164 const Event_set::Ptr event_set = event_set_create(err_code);
4165 if (!event_set)
4166 {
4167 assert(*err_code == error::Code::S_NODE_NOT_RUNNING);
4168 return Peer_socket::Ptr(); // *err_code is set.
4169 }
4170 // Now we know Node is running(); and we have event_set.
4171
4172 // We must clean up event_set at any return point below.
4173 Error_code dummy_prevents_throw;
4174 util::Auto_cleanup event_set_cleanup = util::setup_auto_cleanup([&]()
4175 {
4176 // Eat any error when closing Event_set, as it's unlikely and not interesting to user.
4177 event_set->close(&dummy_prevents_throw);
4178 });
4179
4180 const auto sock = connect_with_metadata(to, serialized_metadata, err_code, sock_opts);
4181 if (!sock)
4182 {
4183 return sock; // *err_code is set. It's probably some user error like an invalid destination.
4184 }
4185 // else we have a socket that has started connecting.
4186
4187 /* We must clean up sock (call sock->close_abruptly(&dummy_prevents_throw)) at any return point (including
4188 * exception throw) below, EXCEPT the success case. Because of the latter, we can't use the
4189 * auto_cleanup trick we used on event_set. So, we'll just have to handle sock cleanup
4190 * manually. */
4191
4192 // Add the one event about which we care.
4193 bool result = event_set->add_wanted_socket<Peer_socket>(sock, Event_set::Event_type::S_PEER_SOCKET_WRITABLE,
4194 &dummy_prevents_throw);
4195 assert(result); // Node is running, so there's no way that should have failed.
4196
4197 // Wait for Writable.
4198 result = event_set->sync_wait(max_wait, err_code);
4199 if (!result)
4200 {
4201 if (*err_code == error::Code::S_EVENT_SET_CLOSED)
4202 {
4203 // It's unlikely, but I guess someone could have destroyed Node during the wait (we do allow that during sleep).
4205 }
4206 else
4207 {
4208 // This is quite common and is analogous to POSIX's EINTR semantics (signal interrupted the blocking call).
4209 assert(*err_code == error::Code::S_WAIT_INTERRUPTED);
4210 }
4211
4212 // Clean up (as discussed above).
4213 sock->close_abruptly(&dummy_prevents_throw); // Eat any error; user doesn't care.
4214 return Peer_socket::Ptr(); // *err_code is set.
4215 } // if (sync_wait() failed)
4216 // else we know event_set is still open, and sync_wait() succeeded.
4217
4218 // OK; either that returned 1 event, or 0 events (timeout).
4219 const bool ready = event_set->events_detected(err_code);
4220 /* Node had not been destroyed by the time sync_wait() finished, and we don't allow simultaneous
4221 * ~Node() outside a blocking sleep (see notes in class Node doc header). The only way this
4222 * failed is if Event_set was closed, and that could only happen if Node was destroyed. */
4223 assert(!*err_code);
4224
4225 if (ready)
4226 {
4227 /* Didn't time out; socket is Writable. However, that does not mean it's Writable for "good"
4228 * reasons. If an error was encountered since the original non-blocking connect (e.g., RST
4229 * received; or handshake timeout expired), then it is now Writable, but any operation like
4230 * send() or receive() will immediately yield an error. If that is the case,
4231 * close_connection_immediately() has set user-visible state to S_CLOSED. So let's check for
4232 * it and return an error in that case.
4233 *
4234 * We could also not; pretend socket is ready and let user discover error when trying to
4235 * transmit. However it seems like a good property to help him out. */
4236
4237 if (sock->state() == Peer_socket::State::S_CLOSED)
4238 {
4239 // No need to cleanup socket; it is already closed.
4240
4241 // Return error as above.
4242 *err_code = sock->m_disconnect_cause; // No need to lock; m_disconnect_cause set and can't change later.
4243 return Peer_socket::Ptr();
4244 }
4245 // else it's probably really ready for action.
4246
4247 return sock; // *err_code is success.
4248 }
4249 // else
4250
4251 // Timed out! Clean up socket, as above, and return null with a specific error (as advertised).
4252 sock->close_abruptly(&dummy_prevents_throw);
4254 return Peer_socket::Ptr();
4255} // Node::sync_connect_impl()
4256
4257void Node::setup_connection_timers(const Socket_id& socket_id, Peer_socket::Ptr sock, bool initial)
4258{
4261 using boost::chrono::microseconds;
4262 using boost::chrono::duration_cast;
4263 using boost::weak_ptr;
4264
4265 // We are in thread W.
4266
4267 Fine_duration rexmit_from_now = sock->opt(sock->m_opts.m_st_connect_retransmit_period);
4268
4269 // Finalize the retransmit scheduled task firing time; and update the # retries statistic.
4270 if (!initial)
4271 {
4272 assert(scheduled_task_fired(get_logger(), sock->m_init_rexmit_scheduled_task));
4273
4274 ++sock->m_init_rexmit_count;
4275 /* This is a bit more precise than leaving rexmit_from_now alone, as it counts from when firing was
4276 * actually scheduled, vs. when the timer was actually triggered by boost.asio. The 2nd addend should be a bit
4277 * negative and thus decrease rexmit_from_now a bit. */
4278 rexmit_from_now += scheduled_task_fires_from_now_or_canceled(get_logger(), sock->m_init_rexmit_scheduled_task);
4279 /* @todo RFC 6298 mandates that this must be doubled after each attempt instead of keeping
4280 * the same value. Doesn't mean we should follow it. */
4281 }
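  /* For reference, the RFC-mandated doubling mentioned in the @todo above would look roughly like this
   * (hypothetical sketch; not what we currently do):
   *   rexmit_from_now = sock->opt(sock->m_opts.m_st_connect_retransmit_period) * (1 << sock->m_init_rexmit_count);
   * i.e., 1x, 2x, 4x, ... the configured period on successive attempts. */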
4282
4283 // Firing time is set; start timer. Call that body when task fires, unless it is first canceled.
4284 sock->m_init_rexmit_scheduled_task
4285 = schedule_task_from_now(get_logger(), rexmit_from_now, true, &m_task_engine,
4286 [this, socket_id,
4287 sock_observer = weak_ptr<Peer_socket>(sock)]
4288 (bool)
4289 {
4290 auto sock = sock_observer.lock();
4291 if (sock)
4292 {
4293 handle_connection_rexmit_timer_event(socket_id, sock);
4294 }
4295 // else { Possible or not, allow for this possibility for maintainability. }
4296 });
4297
4298 // Also set up the timeout that will stop these retries from happening.
4299 if (initial)
4300 {
4301 sock->m_connection_timeout_scheduled_task
4303 sock->opt(sock->m_opts.m_st_connect_retransmit_timeout),
4304 true, &m_task_engine,
4305 [this, socket_id,
4306 sock_observer = weak_ptr<Peer_socket>(sock)]
4307 (bool)
4308 {
4309 // We are in thread W.
4310
4311 auto sock = sock_observer.lock();
4312 if (!sock)
4313 {
4314 return; // Possible or not, allow for this possibility for maintainability.
4315 }
4316 // else
4317
4318 FLOW_LOG_INFO("Connection handshake timeout timer [" << sock << "] has been triggered; was on "
4319 "attempt [" << (sock->m_init_rexmit_count + 1) << "].");
4320
4321                    assert((sock->m_int_state == Peer_socket::Int_state::S_SYN_SENT)
4322                           || (sock->m_int_state == Peer_socket::Int_state::S_SYN_RCVD));
4323
4324 // Timeout. Give up. Send RST, in case they do come to their senses -- but it's too late for us.
4325
4326 /* Close connection in our structures and inform user. Pre-conditions
4327 * assumed by call: sock in m_socks and sock->state() == S_OPEN (yes, since m_int_state ==
4328 * S_SYN_SENT/RCVD); err_code contains the reason for the close (yes). */
4329 rst_and_close_connection_immediately(socket_id, sock, error::Code::S_CONN_TIMEOUT, false);
4330 /* ^-- defer_delta_check == false: for similar reason as when calling send_worker() from
4331 * send_worker_check_state(). */
4332 });
4333 } // if (initial)
4334} // Node::setup_connection_timers()
4335
4337{
4338 using util::Blob;
4339
4340 // We are in thread W.
4341
4342  assert((sock->m_int_state == Peer_socket::Int_state::S_SYN_SENT)
4343         || (sock->m_int_state == Peer_socket::Int_state::S_SYN_RCVD));
4344
4345 // Not an error (so not WARNING), but it's rare and interesting enough for INFO.
4346 FLOW_LOG_INFO("Connection handshake retransmit timer [" << sock << "] triggered; was on "
4347 "attempt [" << (sock->m_init_rexmit_count + 1) << "].");
4348
4349 // Try again. Reproduce the SYN or SYN_ACK... but first set up the next timer.
4350
4351 // Setup the next timer before sending packet for the same reason as in the original SYN/SYN_ACK-sending code.
4352 setup_connection_timers(socket_id, sock, false);
4353
4354 /* Send packet.
4355 * @todo More code reuse? Or save the serialized version inside socket and resend here verbatim? */
4356
4357 Low_lvl_packet::Ptr re_syn_base;
4358 if (sock->m_active_connect)
4359 {
4360 auto syn = create_syn(sock);
4361 re_syn_base = Low_lvl_packet::ptr_cast(syn);
4362 }
4363 else
4364 {
4365 // (Subtlety: As of this writing it wouldn't have changed since original SYN_ACK, but safe>sorry.)
4366 sock->m_rcv_last_sent_rcv_wnd = sock_rcv_wnd(sock);
4367
4368 auto syn_ack = create_syn_ack(sock);
4369 re_syn_base = Low_lvl_packet::ptr_cast(syn_ack);
4370 }
4371
4372 // Fill out common fields and asynchronously send packet.
4373 async_sock_low_lvl_packet_send_paced(sock, std::move(re_syn_base));
4374} // Node::handle_connection_rexmit_timer_event()
4375
4377{
4380
4381 // We are in thread W.
4382
4383 /* Cancel any timers. Note that this will NOT prevent a given timer's handler from running.
4384 * It will try to make it run ASAP with operation_aborted error code. However, it may not even
4385 * succeed in that. In particular, if by the time the current handler started the timer handler
4386 * event was already queued inside m_task_engine, then canceling the timer now will not load
4387 * operation_aborted into the handler call; it will instead fire as if the timer really expired
4388 * (which it did). Therefore the timer handler should be careful to check the state of the socket
4389 * and exit if the state is not suitable (in this case, S_CLOSED).
4390 *
4391 * Even so, try to cancel with operation_aborted just to cut down on entropy a bit (at least by
4392 * executing all handlers ASAP).
4393 *
4394 * Update: However, scheduled_task_cancel() will indeed cleanly cancel. `Timer`s are still in direct use
4395 * as well however, so the above still applies to some of the below. */
4396
4397 sock->m_rcv_delayed_ack_timer.cancel();
4398 sock->m_snd_pacing_data.m_slice_timer.cancel();
4399
4400 if (sock->m_init_rexmit_scheduled_task)
4401 {
4402 scheduled_task_cancel(get_logger(), sock->m_init_rexmit_scheduled_task);
4403 sock->m_init_rexmit_scheduled_task = Scheduled_task_handle();
4404 }
4405 if (sock->m_connection_timeout_scheduled_task)
4406 {
4407 scheduled_task_cancel(get_logger(), sock->m_connection_timeout_scheduled_task);
4408 sock->m_connection_timeout_scheduled_task = Scheduled_task_handle();
4409 }
4410 if (sock->m_rcv_in_rcv_wnd_recovery)
4411 {
4412 scheduled_task_cancel(get_logger(), sock->m_rcv_wnd_recovery_scheduled_task);
4413 sock->m_rcv_in_rcv_wnd_recovery = false;
4414 }
4415
4416 if (sock->m_snd_drop_timer)
4417 {
4418 // This Drop_timer guy actually will prevent any callbacks from firing.
4419 sock->m_snd_drop_timer->done();
4420
4421 /* The two `shared_ptr`s (sock and m_snd_drop_timer) point to each other. Nullify this to break the cycle
4422 * and thus avoid memory leak. */
4423 sock->m_snd_drop_timer.reset();
4424 }
4425}
4426
4428{
4429 sock->m_snd_drop_timeout = sock->opt(sock->m_opts.m_st_init_drop_timeout);
4430
4431 const auto on_fail = [this, socket_id, sock](const Error_code& err_code)
4432 {
4433 rst_and_close_connection_immediately(socket_id, sock, err_code, false);
4434 // ^-- defer_delta_check == false: for similar reason as when calling send_worker() from send_worker_check_state().
4435 };
4436 const auto on_timer = [this, socket_id, sock](bool drop_all_packets)
4437 {
4438 drop_timer_action(sock, drop_all_packets);
4439 };
4440
4441 /* Set up the Drop Timer. Basically give it some key fields of sock (DTO value, the In-flight
4442 * queue) and the callbacks to call when events occur, such as the Drop Timer expiring.
4443 * Additionally, when events m_snd_drop_timer wants to know about happen, we will call
4444 * m_snd_drop_timer->on_...(). */
4445 sock->m_snd_drop_timer = Drop_timer::create_drop_timer(get_logger(), &m_task_engine, &sock->m_snd_drop_timeout,
4446 Peer_socket::Ptr(sock), on_fail, on_timer);
4447}
4448
4450 const Function<size_t (size_t max_data_size)>& snd_buf_feed_func,
4451 Error_code* err_code)
4452{
4453 using boost::asio::post;
4454
4455 /* We are in user thread U != W.
4456 * It's important to keep that in mind in this method. In particular, it is absolutely unsafe to
4457 * access m_int_state, which belongs solely to thread W and is never locked. */
4458
4459 // IMPORTANT: The logic here must be consistent with sock_is_writable().
4460
4461 if (!running())
4462 {
4464 return 0;
4465 }
4466 // else
4467
4468 // Pre-condition is that m_mutex is locked already. So EVERYTHING that can be locked, is, including the buffers.
4469
4470 // Pre-condition.
4471 assert(sock->m_state == Peer_socket::State::S_OPEN); // Locked.
4472
4473 if (sock->m_disconnect_cause) // Locked.
4474 {
4475 // Error has been recorded, and we're not CLOSED => we are DISCONNECTING.
4476 assert(sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING);
4477
4478 /* Disconnection is underway. Adding more data to the Send buffer is pointless; we
4479 * don't allow more data to be queued to be sent after an error (though existing buffered data
4480 * may yet be sent... but that's not relevant here). @todo No graceful close yet. */
4481
4482 // Mark in *err_code and log.
4483 FLOW_ERROR_EMIT_ERROR_LOG_INFO(sock->m_disconnect_cause);
4484 return 0;
4485 }
4486 // else
4487
4488  // No fatal error (socket not disconnecting or closed). However, it may still be connecting.
4489
4490 if (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTING)
4491 {
4492 /* Here we draw a line in the sand and refuse to buffer any data. We could easily allow
4493 * buffering data even when still S_CONNECTING. However, I am copying BSD socket semantics
4494 * here, as they do seem to be useful. As a user I don't want to think I've "sent" gobs of data
4495 * while there's little to suggest that there's even anyone listening on the other side. */
4496 err_code->clear();
4497 return 0;
4498 }
4499 // else
4500 assert(sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTED);
4501
4502 const bool was_deqable = snd_deqable(sock); // See below.
4503
4504 /* Write the user-provided data into m_snd_buf; provide the missing argument (max_data_size).
4505 * Round up to a multiple of max-block-size to ensure we never fragment a max-block-size-sized
4506 * chunk of data when they're using unreliable mode! */
4507 const size_t sent = snd_buf_feed_func(sock->max_block_size_multiple(sock->m_opts.m_st_snd_buf_max_size));
4508
4509 // Register that the Send buffer possibly grew.
4510 sock->m_snd_stats.buffer_fed(sock->m_snd_buf.data_size());
4511
4512 /* We've done the minimal thing send() does: added data to the send buffer. Now we may need to
4513 * kick off the actual asynchronous sending of some of these data by thread W. It's important to
4514 * discuss the overall strategy for how that works.
4515 *
4516 * Key question: how does W send low-level packets over UDP? Answer: if there's anything on the
4517 * Send buffer or retransmission queue (if retransmission is enabled), and there is no other
4518 * (congestion control, probably) reason NOT to send packets, then dequeue a packet from
4519 * retransmission queue or Send buffer and send it off to the UDP layer; repeat in a tight loop
4520 * until both Send queues are empty, or there's some reason NOT to send packets (again, congestion
4521 * control). Let's write this in pseudo-code:
4522 *
4523 * DEQ(sock): // Thread W only.
4524 * if (!sendable(sock)):
4525 * return // Slight optimization; perform this first check before locking.
4526 * lock sock // Must lock because sock->m_snd_buf accessible from other threads.
4527 * while (sendable(sock) && deqable(sock)):
4528 * dequeue sock->m_snd_buf -> block
4529 * serialize block into packet
4530 * send packet via UDP
4531 * unlock sock
4532 *
4533 * sendable(sock):
4534 * return <...probably some congestion control condition involving CWND or something>
4535 *
4536 * deqable(sock):
4537 * return !(sock->m_rexmit_q.empty() && sock->m_snd_buf.empty())
4538 *
4539 * When should DEQ(sock) execute? Answer: whenever sendable(sock) and deqable(sock) are true. If
4540 * they're true, but DEQ(sock) doesn't run for time period P, then it's practically like adding
4541 * sleep(P) from the user's point of view. So how do we get DEQ(sock) to execute as soon as those
4542 * conditions are true? Well, running it repeatedly in a thread W tight loop would do it, but
4543 * obviously that's unacceptable.
4544 *
4545 * So consider the initial state after sock enters ESTABLISHED state. sendable(sock) is true;
4546 * deqable(sock) is false. The moment deqable(sock) becomes true, we should execute DEQ(sock); in
4547 * other words in the first sock->send(), as that will add to m_snd_buf. After DEQ(sock) exits,
4548 * there's no need to call DEQ(sock) until again both conditions are true. Therefore, the
4549 * algorithm is: whenever sendable(sock) goes from false to true, and/or deqable(sock) from false
4550 * to true, call DEQ(sock). If inside DEQ(sock) one of the conditions is still false, it will
4551 * quickly return. (Call the latter a NOOP.)
4552 *
4553 * Now we must come up with a scheme that will ensure DEQ(sock) will run very quickly after either
4554 * condition (sendable(sock), deqable(sock)) becomes true; and that will not peg the CPU.
4555 *
4556 * Consider sendable(). Only thread W (transport layer) can determine this value: it depends on
4557 * wholly internal details like packets in-flight and CWND. Therefore sendable(sock) can go
4558 * false->true only in W. Hence W, whenever changing any component that might affect
4559 * sendable(sock) would do:
4560 *
4561 * // ... Something related to sendable(sock) has changed....
4562 * DEQ(sock) // So check and send if possible.
4563 *
4564 * Clearly this calls DEQ(sock) as soon as humanly possible after sendable(sock) becomes true.
4565 * Clearly it wastes no CPU cycles either. OK.
4566 *
4567 * Now consider deqable(). sock->m_snd_buf can only change from empty to non-empty in the
4568 * previous statement (snd_buf_feed_func()). That is in thread U != W. Suppose we write:
4569 *
4570 * SEND(sock, blocks): // Non-W threads only.
4571 * lock sock // Must lock because sock->m_snd_buf accessible from other threads.
4572 * add blocks -> sock->m_snd_buf
4573 * if (sock->m_snd_buf was empty before previous statement)
4574 * // Queue DEQ(sock) for asynchronous execution on thread W as soon as it's free:
4575 * post(W, DEQ(sock))
4576 * unlock sock
4577 *
4578 * Does this call DEQ(sock) as soon as deqable(sock) becomes true? Well, DEQ(sock) can only run
4579 * on thread W, and the enqueuing of blocks can only happen on thread U, and post() will cause
4580 * DEQ(sock) to run as soon as possible. Therefore that's as good as it can be. Is it correct,
4581 * however? The mainstream case is that once "unlock sock" finishes in SEND(), thread W will get
4582 * some free time, execute the just-queued DEQ(), and thus everything works out. OK so far.
4583 *
4584 * Since, however, post() is (obviously) asynchronous and done from thread non-W, there is
4585 * potential for other tomfoolery. First consider competing SEND() calls from other threads.
4586 * Because of locking, they will be entirely sequential even from different threads and thus can
4587 * be considered as all in one thread U != W. Now suppose SEND() placed DEQ() onto W, and another
4588 * SEND() executes before DEQ() executes on W. No problem: since only DEQ() can dequeue the Send
4589 * buffer, and the 1st SEND() made the buffer non-empty, the 2nd SEND() will not affect the DEQ()
4590 * situation, since it cannot make m_snd_buf become non-empty after being empty (was already
4591 * non-empty).
4592 *
4593 * Second consider SEND(sock, blocks) executing while a W handler is executing. Now suppose this
4594 * W handler discovers that sendable() may be affected and thus calls DEQ(sock) as shown above;
4595 * meanwhile SEND() posts DEQ(sock) onto W as well. W will wait until SEND(sock, blocks) exits
4596 * (due to the lock) before executing most of DEQ(sock), but when it does it will be ITS DEQ(sock)
4597 * that executes first (regardless of whether the post from thread U happened first). This
4598 * DEQ(sock) will not be a NOOP, which is great. Now, thread W should exit that handler and
4599 * finally execute SEND()'s posted DEQ() -- which will be a NOOP, because the synchronous
4600 * DEQ(sock) from thread W preempted it.
4601 *
4602 * Is this OK? Most likely. It'll spend some extra CPU cycles on the check in the NOOP, but
4603 * that's it. Now, there is some conceivable way that, maybe, such NOOPs could happen a lot in a
4604 * very busy system and perhaps even "bunch" up to peg the CPU. However, after doing many thought
4605 * experiments, I was unable to come up with anything actually worrying.
4606 *
4607 * The other way deqable(sock) can become true is if m_rexmit_q was empty but becomes non-empty.
4608 * In other words, if we detect packet as Dropped, we will have added it (if retransmission is on)
4609 * to m_rexmit_q. This can only happen on thread W and thus is handled similarly to
4610 * sendable(sock):
4611 *
4612 * // ... Something related to deqable(sock) has changed....
4613 * DEQ(sock) // So check and send if possible.
4614 *
4615 * So this system should be OK. Now let's map the above pseudocode to actual code.
4616 *
4617 * SEND(sock, blocks) is the very method you're reading now (Peer_socket::send() and
4618 * Node::send(), runs in thread U != W). DEQ(sock) is Node::send_worker(sock) (runs in thread
4619 * W). sendable(sock) is Node::can_send(sock). deqable(sock) is Node::snd_deqable(sock).
4620 * post(W, f) is post(Node::m_task_engine, f).
4621 *
4622 * OK, there is one more small caveat. If DEQ(sock) is placed onto W by SEND(sock, blocks),
4623 * then before this DEQ() is executed, thread W may change the state of sock (for example, close
4624 * it). Therefore, DEQ() must also ensure it's operating in a state where it can send data
4625 * (ESTABLISHED at least), and if not, NOOP. Of course if DEQ() is executed synchronously by W,
4626 * then this is unnecessary (since W code wouldn't execute DEQ() directly unless already in a
4627 * proper state for this). So, send_worker_check_state() is actually a little bit more than just
4628 * DEQ(), while send_worker() is just DEQ(). send() posts send_worker_check_state(), while
4629 * thread W executes send_worker() directly. */
4630
4631 if ((!was_deqable) && (sent != 0))
4632 {
4633 // Possibly send_worker() can send packets now (send buffer went from empty to not).
4634 post(m_task_engine, [this, sock]() { send_worker_check_state(sock); });
4635 }
4636
4637 err_code->clear();
4638 return sent;
4639 // Note that sock->m_mutex is unlocked here (and send_worker() will lock it again when it [probably soon] executes).
4640} // Node::send()
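/* Illustrative sketch (standalone example; not part of Flow): the SEND()/DEQ() scheme described
 * in the comment block above, reduced to a generic queue. The user thread enqueues under a lock
 * and post()s the drain task onto the worker's io_context only when the queue went
 * empty -> non-empty; the worker also calls drain() directly whenever its own sendability state
 * changes. All names (Send_queue, drain, can_send) are hypothetical. */
#include <boost/asio.hpp>
#include <deque>
#include <mutex>
#include <string>

struct Send_queue
{
  boost::asio::io_context& m_worker_ctx; // Runs in thread W only.
  std::mutex m_mutex;
  std::deque<std::string> m_blocks;

  // SEND(): may be called from any user thread.
  void enqueue(std::string block)
  {
    bool was_empty;
    {
      std::lock_guard<std::mutex> lock(m_mutex);
      was_empty = m_blocks.empty();
      m_blocks.push_back(std::move(block));
    }
    if (was_empty)
    {
      // Queue went empty -> non-empty: wake the worker. Otherwise a drain is already known to it.
      boost::asio::post(m_worker_ctx, [this]() { drain(); });
    }
  }

  // DEQ(): thread W only; NOOPs if there is nothing to do (e.g., a stale post).
  void drain()
  {
    std::lock_guard<std::mutex> lock(m_mutex);
    while (!m_blocks.empty() && can_send())
    {
      /* ...serialize m_blocks.front() and hand it to the low-level layer... */
      m_blocks.pop_front();
    }
  }

  bool can_send() const { return true; } // Placeholder for congestion/flow-control checks.
};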
4641
4642bool Node::sock_is_writable(const boost::any& sock_as_any) const
4643{
4644 using boost::any_cast;
4645
4646 const Peer_socket::Const_ptr sock = any_cast<Peer_socket::Ptr>(sock_as_any);
4647
4648 Peer_socket::Lock_guard lock(sock->m_mutex); // Many threads can access/write below state.
4649
4650 /* Our task here is to return true if and only if at this very moment calling sock->send() would
4651 * yield either a return value of > 0 OR a non-success *err_code. In other words, send() would
4652 * return "something." This is used for Event_set machinery.
4653 *
4654 * This should mirror send()'s algorithm. @todo Should send() call this, for code reuse?
4655 * Maybe/maybe not. Consider performance when deciding.
4656 *
4657 * - If state is CLOSED, then some sort of error/terminating condition occurred, so send()
4658 * would return 0 and non-success Error_code == sock->m_disconnect_cause. (Writable.)
4659 * - Otherwise, if state is OPEN+DISCONNECTING, then graceful close (@todo implement it) is
4660 * underway; we do not allow more data to be sent (except what's already in Sent buffer), so
4661 * send() would return 0 and non-success Error_code == sock->m_disconnect_cause.
4662 * (Writable.)
4663 * - Otherwise, if state is OPEN+CONNECTED, and there is Send buffer space, send() would return >
4664 * 0 and no error. (Writable.)
4665 * - The other remaining possibilities:
4666 * - OPEN+CONNECTED but no Send buffer space (returns 0, no error). (Not Writable.)
4667 * - OPEN+CONNECTING -- we don't allow accumulating data in Send buffer (returns 0, no error).
4668 * (Not Writable.) */
4669
4670 return (sock->m_state == Peer_socket::State::S_CLOSED)
4671 || (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING)
4672 || ((sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTED)
4673 && snd_buf_enqable(sock));
4674} // Node::sock_is_writable()
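/* Illustrative sketch (standalone example; not part of Flow): the type-erasure idiom used by
 * sock_is_writable() above -- an event-set-like container stores handles as boost::any, and a
 * per-type predicate any_cast<>s each one back to check it. Names (Handle_set, any_active,
 * My_socket) are hypothetical. */
#include <boost/any.hpp>
#include <functional>
#include <memory>
#include <vector>

struct Handle_set
{
  using Predicate = std::function<bool (const boost::any&)>;

  std::vector<boost::any> m_handles; // E.g., each element holds a shared_ptr to some socket type.

  // Returns true if any stored handle satisfies the given per-type predicate.
  bool any_active(const Predicate& is_active) const
  {
    for (const auto& handle : m_handles)
    {
      if (is_active(handle))
      {
        return true;
      }
    }
    return false;
  }
};

// Usage sketch:
//   struct My_socket { bool m_writable = false; };
//   Handle_set set;
//   set.m_handles.push_back(boost::any(std::make_shared<My_socket>()));
//   const bool w = set.any_active([](const boost::any& h)
//     { return boost::any_cast<std::shared_ptr<My_socket>>(h)->m_writable; });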
4675
4676void Node::send_worker_check_state(Peer_socket::Ptr sock)
4677{
4678 // See big comment block in Node::send() first.
4679
4680 // We are in thread W.
4681
4682 /* This method can be thought of as the chunk of the finite state machine that defines what
4683 * happens when the "user called send, adding at least 1 block to the send buffer" (@todo: or any data
4684 * at all, if in reliable mode?) event occurs. Therefore, we will have a switch() that will handle every
4685 * state and decide what should happen when that event fires in that state.
4686 *
4687 * send() placed us onto thread W. When send() did so, m_int_state (which it was not allowed to
4688 * check, as only thread W can access it) was at least ESTABLISHED (since state was
4689 * S_OPEN+S_CONNECTED, ensured via assert()). Therefore, we can eliminate several states with
4690 * assert()s: SYN_SENT, SYN_RCVD. */
4691
4692 switch (sock->m_int_state)
4693 {
4694 case Peer_socket::Int_state::S_ESTABLISHED:
4695 // Mainstream case.
4696 send_worker(sock, false);
4697 /* ^-- defer_delta_check == false: because we were invoked from thread U != W, we are NOT
4698 * invoked from async_low_lvl_recv(). Therefore, we will NOT perform
4699 * event_set_all_check_delta(false) before the boost.asio handler exits. Therefore boost.asio
4700 * may sleep (block) before event_set_all_check_delta(false). Therefore that would delay
4701 * delivery of the Writable event to the user. Therefore force the delta check immediately.
4702 * See Node::m_sock_events doc header for details. */
4703 break;
4704 case Peer_socket::Int_state::S_CLOSED:
4705 // Unlikely but legitimate.
4706 FLOW_LOG_INFO('[' << sock << "] "
4707 "in state [" << sock->m_int_state << "] "
4708 "closed before asynchronous send_worker() could proceed.");
4709 break;
4710 case Peer_socket::Int_state::S_SYN_SENT:
4711 case Peer_socket::Int_state::S_SYN_RCVD:
4712 // Crash. See above reasoning.
4713 FLOW_LOG_WARNING('[' << sock << "] "
4714 "in state [" << sock->m_int_state << "] "
4715 "somehow had send() called on it.");
4716 assert(false);
4717 break;
4718 } // switch (sock->m_int_state)
4719} // Node::send_worker_check_state()
4720
4721void Node::send_worker(Peer_socket::Ptr sock, bool defer_delta_check)
4722{
4723 using boost::asio::buffer;
4724 using boost::next;
4725 using boost::ratio;
4726 using boost::ratio_string;
4727 using boost::chrono::milliseconds;
4728 using boost::chrono::round;
4729 using boost::shared_ptr;
4730 using std::list;
4731
4732 // We are in thread W.
4733
4734 // See big comment block in Node::send() first.
4735
4736 // Pre-condition.
4737 assert(sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED);
4738
4739 /* We are about to potentially send a bunch of DATA packets. Before sending a given packet, we
4740 * will call can_send() which will ask the congestion control module whether there is space in
4741 * what it thinks is the available pipe and return true if so (as well as check rcv_wnd, ensuring
4742 * the receiver's Receive buffer can handle the data once they arrive). However, how it answers
4743 * that question depends on the size of the pipe (m_snd_cong_ctl->congestion_window_bytes(),
4744 * a/k/a CWND). Many (most?) congestion control modules will want to reduce CWND when a
4745 * connection has been idle -- not sending anything, due to no data to be sent in Send buffer --
4746 * for a while. Thus we must call m_snd_cong_ctl->on_idle_timeout() if we've hit Idle Timeout.
4747 *
4748 * The definition of Idle Timeout we use is from TCP RFC 5681-4.1 (and DCCP CCID 2 RFC 4341-5.1).
4749 * It's simple: Idle Timeout is DTO (Drop Timeout) time units since a DATA packet has been last
4750 * sent. While I basically grasp the intuition behind it (if a DTO since even the last-sent
4751 * packet has expired, and no retransmission/further transmission has occurred, then there must
4752 * have been no more data for a while), I can't quite prove to myself that it's exactly right,
4753 * mostly due to the fact that DTO may change over time. It's probably right though, as RFC 4341
4754 * recommends it, even though that protocol is closer to NetFlow than TCP (full selective ACKs).
4755 * Anyway, if we see too many false Idle timeouts, revisit this.
4756 *
4757 * Why check this now? Why not start a proper timer, each time packet is sent, instead and just
4758 * inform m_snd_cong_ctl when it fires? Answer: timer management is somewhat of a pain in the ass
4759 * (as you can see in our other various timers, such as m_snd_drop_timer). Here we have an opportunity
4760 * to simply check the condition and affect CWND right before CWND would be used anyway
4761 * (can_send()). It's simpler, and the performance impact is negligible (it's just a
4762 * Fine_clock::now() call and a comparison). You ask, why not do the same for other timers
4763 * then, in particular the Drop Timer? Answer: for Drop Timer, we really need to know exactly
4764 * when it fires, so that we can Drop In-flight packets right then and possibly send more
4765 * packets (among other things). In this case there is no such requirement; we only care about
4766 * whether the Idle Timeout has tripped when we're about to send something. */
4767
4768 /* To avoid a very close race between DTO and idle timeout, apply a slight factor of > 1 to DTO.
4769 * Using boost::ratio<> instead of a double or something for same reason as in
4770 * new_round_trip_time_sample(). */
4771 using Idle_timeout_dto_factor = ratio<110, 100>;
4772 const Fine_duration idle_timeout
4773 = sock->m_snd_drop_timeout * Idle_timeout_dto_factor::num / Idle_timeout_dto_factor::den;
4774 const Fine_duration since_last_send = Fine_clock::now() - sock->m_snd_last_data_sent_when;
4775
4776 if ((sock->m_snd_last_data_sent_when != Fine_time_pt()) && (since_last_send > idle_timeout))
4777 {
4778 // Arguable if this should be INFO or TRACE. We'll see.
4779 FLOW_LOG_INFO("Idle timeout triggered for [" << sock << "]; "
4780 "last activity [" << round<milliseconds>(since_last_send) << "] ago "
4781 "exceeds idle timeout [" << round<milliseconds>(idle_timeout) << "] "
4782 "= " << (ratio_string<Idle_timeout_dto_factor, char>::prefix()) << " x "
4783 "[" << round<milliseconds>(sock->m_snd_drop_timeout) << "].");
4784 sock->m_snd_cong_ctl->on_idle_timeout();
4785 sock->m_snd_stats.idle_timeout();
4786 }
4787
4788 /* Check networking conditions (presumably congestion control) and flow control (rcv_wnd).
4789 * Ideally this would always be true, but then we'd overwhelm the link when send() is invoked on
4790 * large amounts of data and/or repeatedly. */
4791 if (!can_send(sock))
4792 {
4793 FLOW_LOG_TRACE('[' << sock << "]: "
4794 "Initial check: can_send() is false.");
4795 return;
4796 }
4797 // else can send if there are data to send.
4798
4799 /* Didn't lock sock above, as can_send() depends only on internal state, which is accessed from
4800 * thread W only. This is an optimization to avoid thread contention (with non-W send()s) for the
4801 * lock in the case when congestion control is preventing sends.
4802 *
4803 * Have to lock now, for sock->m_snd_buf access (at least). */
4804
4805 const bool rexmit_on = sock->rexmit_on();
4806 bool writable; // See below.
4807 {
4808 Peer_socket::Lock_guard lock(sock->m_mutex);
4809
4810 // Check whether enough data in retransmission queue or snd_buf to send a packet.
4811 if (!snd_deqable(sock))
4812 {
4813 FLOW_LOG_TRACE('[' << sock << "]: "
4814 "Initial check: can_send() is true, but no data to send.");
4815 return;
4816 }
4817 // else can send >= 1 packet.
4818
4819 // For brevity and a bit of speed:
4820 Socket_buffer& snd_buf = sock->m_snd_buf;
4821 list<Peer_socket::Sent_packet::Ptr>& rexmit_q = sock->m_snd_rexmit_q;
4822 size_t& rexmit_q_size = sock->m_snd_rexmit_q_size;
4823 Sequence_number& snd_next_seq_num = sock->m_snd_next_seq_num;
4824
4825 // @todo Implement graceful close.
4826 assert(sock->m_open_sub_state != Peer_socket::Open_sub_state::S_DISCONNECTING);
4827
4828 FLOW_LOG_TRACE('[' << sock << "]: "
4829 "Initial check: Will send from rexmit queue of size [" << rexmit_q_size << "] and/or "
4830 "Send buffer with total size [" << snd_buf.data_size() << "].");
4831 // Very verbose and CPU-intensive!
4832 FLOW_LOG_DATA("Send buffer data = [\n" << snd_buf << "].");
4833
4834 // Send packets until one or both of can_send() and snd_deqable() become false.
4835 do
4836 {
4837 shared_ptr<Data_packet> data;
4838 Peer_socket::Sent_packet::Ptr sent_pkt;
4839 bool rexmit = false;
4840
4841 /* Record send time. It's only a temporary value for logging, until we
4842 * actually send packet. However, do generate the permanent m_order_num, which is unique. */
4843 Peer_socket::Sent_packet::Sent_when sent_when{ sock_get_new_snd_order_num(sock), Fine_clock::now(), 0 };
4844
4845 /* To provide the best experience on the receiving side, retransmit before sending new data,
4846 * so that Receive buffer on other side receives data as soon as possible. */
4847 if (rexmit_q.empty())
4848 {
4849 // Nothing in retransmission queue, so something is in Send buffer.
4850
4851 // Create low-level DATA packet.
4852 data = Low_lvl_packet::create_uninit_packet<Data_packet>(get_logger());
4853 data->m_rexmit_id = 0; // First (if retransmission is off, only) send attempt.
4854
4855 // Dequeue one block into the packet's data field.
4856
4857 /* Try to dequeue the head block directly into data.m_data. Because we are operating snd_buf
4858 * with block_size_hint == sock->max_block_size(); and because we don't send unless CWND
4859 * allows for at least max_block_size() bytes to be sent, the following should be a
4860 * constant-time operation (a swap of internal buffers) as opposed to a copy. */
4861 snd_buf.consume_buf_move(&data->m_data, sock->max_block_size());
4862
4863 // snd_deqable() returned true, so there must be at least one byte available.
4864 assert(!data->m_data.empty());
4865
4866 // Set sequence number; then advance the next sequence number variable for the next time we do this.
4867 data->m_seq_num = snd_next_seq_num;
4868 advance_seq_num(&snd_next_seq_num, data);
4869
4870 /* We are just about to send the packet. Assume it has been sent. It is not yet Acknowledged
4871 * and not yet Dropped. Therefore it is now In-flight. We should place its info at the back of
4872 * m_snd_flying_pkts_by_sent_when. We must maintain the invariant w/r/t that structure (see comment
4873 * for m_snd_flying_pkts_by_sent_when).
4874 *
4875 * Purpose of keeping these data: at least for comparison against Congestion Window,
4876 * for congestion control. */
4877
4878 // Guarantee that the new sequence number is > all the currently In-flight ones.
4879 assert(data->m_seq_num >= snd_past_last_flying_datum_seq_num(sock));
4880 /* Therefore we will add the following to the end of the map's ordering. Note we've
4881 * incremented m_snd_next_seq_num already, maintaining that member's invariant relationship
4882 * with m_snd_flying_pkts_by_sent_when. */
4883
4884 // New packet: create new metadata object. Record send time. (The latter will be rewritten later.)
4885 sent_pkt = Peer_socket::Sent_packet::Ptr(new Peer_socket::Sent_packet(rexmit_on, data, sent_when));
4886 }
4887 else // if (!rexmit_q.empty())
4888 {
4889 // Get packet and metadata from front of retransmission queue.
4890 rexmit = true;
4891 sent_pkt = rexmit_q.front();
4892
4893 --rexmit_q_size;
4894 rexmit_q.pop_front();
4895
4896 // We'd saved the packet we sent last time -- just need to update some things before resending.
4897 data = sent_pkt->m_packet;
4898
4899 // Retransmitting -- update retransmit count ID (used to match acks to the acked transmit attempt).
4900 ++data->m_rexmit_id;
4901
4902 // Record the send time of this newest attempt. (If pacing enabled this will be rewritten later.)
4903 sent_pkt->m_sent_when.push_back(sent_when);
4904
4905 // Chronologically, no packets sent after this one have been acked yet, as this packet is new.
4906 sent_pkt->m_acks_after_me = 0;
4907 }
4908
4909 /* Note: We have saved Fine_clock::now() as the send time of the packet. However, especially
4910 * if pacing is enabled, we want to record it at the time it is actually sent (pacing may
4911 * delay it). Even if pacing is disabled, CPU pegging may cause a delay in sending (although
4912 * whether that should "count" is a more philosophical question). With pacing, though, since
4913 * pacing spreads out packets over SRTT, and SRTT is measured based on
4914 * Sent_packet::m_sent_when, RTTs artificially become longer and longer if we record the send
4915 * time now. Anyway, this means m_sent_when.back() should be overwritten when the packet is
4916 * actually sent (which should be very soon, unless pacing is enabled).
4917 * See async_sock_low_lvl_packet_send_paced(). */
4918
4919 // data and sent_pkt are ready.
4920
4921 // Add to snd_flying_pkts* and friends; update byte counts.
4922 snd_flying_pkts_push_one(sock, data->m_seq_num, sent_pkt);
4923
4924 /* By adding to m_snd_flying_pkts_by_sent_when (i.e., increasing In-flight byte count), we may have
4925 * affected the result of can_send(). We do check it at the end of the while () body, so OK. */
4926
4927 // Fill out common fields and asynchronously send packet (packet pacing potentially performed inside).
4929
4930 sock->m_snd_stats.data_sent(data->m_data.size(), rexmit);
4931 }
4932 while (can_send(sock) && snd_deqable(sock)); // (there is CWND/rcv_wnd space; and either rexmittable or new data)
4933
4934 FLOW_LOG_TRACE('[' << sock << "]; connection [" << sock << "]: "
4935 "Final check: "
4936 "can_send() == [" << can_send(sock) << "]; "
4937 "snd_deqable() == [" << snd_deqable(sock) << "].");
4938
4939 writable = snd_buf_enqable(sock); // Must do before releasing lock.
4940 } // lock
4941
4942 /* Finally, check if the above has dequeued enough of m_snd_buf for it to accept more data from
4943 * user. If so, sock is certainly now Writable. Therefore we should soon inform anyone waiting
4944 * on any Event_sets for sock to become Writable.
4945 *
4946 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
4947 * code.
4948 *
4949 * Also: why do this outside the above locked block? Same reason as similar code in
4950 * handle_data_to_established(). */
4951 if (writable &&
4953 {
4954 // Possibly inform the user for any applicable Event_sets right now.
4955 event_set_all_check_delta(defer_delta_check);
4956 }
4957
4958 /* @todo After we implement graceful close, if we'd emptied m_snd_buf above, then here we should
4959 * advance the graceful close towards the final situation (m_int_state and m_state both
4960 * S_CLOSED). */
4961} // Node::send_worker()
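/* Illustrative sketch (standalone example; not part of Flow): scaling a duration by a
 * compile-time rational factor, as done above for the idle timeout = 1.1 x DTO computation,
 * without introducing floating point. The function name is hypothetical. */
#include <boost/chrono.hpp>
#include <boost/ratio.hpp>

inline boost::chrono::nanoseconds scaled_timeout(boost::chrono::nanoseconds dto)
{
  using Factor = boost::ratio<110, 100>; // Exactly 1.1, kept in integer arithmetic.
  return dto * Factor::num / Factor::den;
}
// E.g., scaled_timeout(boost::chrono::milliseconds(200)) yields 220ms (expressed in nanoseconds).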
4962
4963bool Node::can_send(Peer_socket::Const_ptr sock) const
4964{
4965 using std::min;
4966
4967 /* m_snd_cong_ctl is the congestion control module, and its CWND value determines how many bytes can
4968 * be In-flight at any given time. If there are enough free bytes (CWND - In-flight) to send
4969 * data, then we can send. Otherwise we cannot. Easy, except what's "data"? There are two
4970 * reasonable answers. One: a byte or more. Two: min(max-block-size, Send buffer size). The former
4971 * answer is fine but somewhat annoying, because then we have to lock sock here***. The 2nd answer
4972 * clearly works but is potentially a little greedier than necessary (i.e., if the 1st block to
4973 * send is small enough to fit into CWND, but CWND doesn't have max-block-size space).
4974 * However, actually, we pretty much have to choose the 2nd answer regardless, as we don't want to
4975 * fragment max-block-size-sized chunks, if we can help it (in the spirit of the reliability
4976 * guarantee [when running in unreliable mode] made in send() method doc header).
4977 *
4978 * I choose the 2nd answer, because (1) it's easier (no locking of sock); (2) it is used by real
4979 * TCP implementations which keep CWND in multiples of MSS (equivalent of max-block-size); (3)
4980 * it's still safe; and (4) see previous paragraph's end. Regarding safety: it's safe, since
4981 * there can be no deadlock, because even if there's < MBS bytes free, eventually In-flight
4982 * packets will become Acknowledged or Dropped and no longer be In-flight, freeing up CWND space;
4983 * and CWND is guaranteed to always be at least 1 * MBS. Thus eventually can_send() will return
4984 * true.
4985 *
4986 * *** - I am now not sure why I wrote this. Why would we have to lock sock here in that case? */
4987
4988 // We have rcv_wnd also; so pretend previous paragraph has: s/CWND/min(CWND, rcv_wnd)/.
4989
4990 const size_t pipe_taken = sock->m_snd_flying_bytes;
4991 const size_t cong_wnd = sock->m_snd_cong_ctl->congestion_window_bytes();
4992 const size_t& rcv_wnd = sock->m_snd_remote_rcv_wnd; // @todo Any particular reason this has & but not pipe_taken?
4993 // Send no more than the network NOR the other side's Receive buffer can take.
4994 const size_t pipe_total = min(cong_wnd, rcv_wnd);
4995
4996 const bool can
4997 = (pipe_taken < pipe_total) && ((pipe_total - pipe_taken) >= sock->max_block_size());
4998
4999 FLOW_LOG_TRACE("cong_ctl [" << sock << "] info: can_send = [" << can << "]; "
5000 "pipe_taken = [" << sock->bytes_blocks_str(pipe_taken) << "]; "
5001 "cong_wnd = [" << sock->bytes_blocks_str(cong_wnd) << "]; "
5002 "rcv_wnd = [" << sock->bytes_blocks_str(rcv_wnd) << "].");
5003
5004 return can;
5005} // Node::can_send()
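/* Illustrative sketch (standalone example; not part of Flow): the "is there room for one full
 * block in min(CWND, rcv_wnd)?" predicate described above, with hypothetical inputs and name. */
#include <algorithm>
#include <cstddef>

inline bool pipe_has_room(std::size_t in_flight_bytes, std::size_t cong_wnd_bytes,
                          std::size_t remote_rcv_wnd_bytes, std::size_t max_block_size)
{
  // Send no more than the network (CWND) or the other side's Receive buffer (rcv_wnd) can take.
  const std::size_t pipe_total = std::min(cong_wnd_bytes, remote_rcv_wnd_bytes);
  return (in_flight_bytes < pipe_total) && ((pipe_total - in_flight_bytes) >= max_block_size);
}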
5006
5007size_t Node::receive(Peer_socket::Ptr sock,
5008 const Function<size_t ()>& rcv_buf_consume_func,
5009 Error_code* err_code)
5010{
5011 using boost::asio::post;
5012
5013 /* We are in user thread U != W.
5014 * It's important to keep that in mind in this method. In particular, it is absolutely unsafe to
5015 * access m_int_state, which belongs solely to thread W and is never locked. */
5016
5017 // IMPORTANT: The logic here must be consistent with sock_is_readable().
5018
5019 if (!running())
5020 {
5022 return 0;
5023 }
5024 // else
5025
5026 // Pre-condition is that m_mutex is locked already. So EVERYTHING that can be locked, is, including the buffers.
5027
5028 // Pre-condition.
5029 assert(sock->m_state == Peer_socket::State::S_OPEN); // Locked.
5030 assert((sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTED) ||
5031 (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTING) ||
5032 (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING));
5033
5034 /* In the rest of the method we must ensure we handle all the cases (-1a/b/c-, -2-) documented in
5035 * the Peer_socket::receive() documentation header. -3- was already handled by
5036 * Peer_socket::receive() before calling us. */
5037
5038 // Try to dequeue stuff into their buffer.
5039 const bool no_bytes_available = sock->m_rcv_buf.empty();
5040 const size_t bytes_consumed = rcv_buf_consume_func();
5041
5042 if (bytes_consumed != 0)
5043 {
5044 /* Unequivocal: if there was stuff in the Receive buffer and was able to place it into their
5045 * buffer then there is no error. (Even if m_disconnect_cause is not success, we are only
5046 * supposed to report that after the Receive buffer has been emptied.)
5047 *
5048 * This handles case -2-. */
5049 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5050 "has successfully returned [" << bytes_consumed << "] bytes.");
5051 err_code->clear();
5052
5053 /* We have changed (increased) the amount of free space in m_rcv_buf. This has rcv_wnd
5054 * implications. We have to at least check whether we should send a window update to the
5055 * other side. However all such book-keeping must be done in thread W due to the data
5056 * involved; call this->receive_wnd_updated(sock). */
5057 post(m_task_engine, [this, sock]() { receive_wnd_updated(sock); });
5058
5059 if (sock->m_rcv_buf.empty()
5060 && (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING))
5061 {
5062 /* We've emptied the Receive buffer; and we're in the middle of a graceful close. (@todo
5063 * Graceful close not yet implemented.) There are two possibilities. One, m_int_state ==
5064 * S_CLOSED. In this case the graceful close, at the transport layer, is over, and the only
5065 * thing stopping us from entering m_state == S_CLOSED (via close_connection_immediately())
5066 * was that the user hadn't read all of m_rcv_buf. In this case thread W should
5067 * close_connection_immediately(). Two, m_int_state may be after ESTABLISHED but before
5068 * CLOSED, in which case thread W still has to finish up graceful closing anyway.
5069 *
5070 * We are not in thread W and cannot work with m_int_state, so checking it here is not possible.
5071 * Therefore we put this task onto thread W. */
5072 post(m_task_engine,
5073 [this, sock]() { receive_emptied_rcv_buf_while_disconnecting(sock); });
5074 }
5075 return bytes_consumed;
5076 }
5077 // else if (bytes_consumed == 0)
5078
5079 if (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTING)
5080 {
5081 /* This is case -1b-. Since we are CONNECTING, no data could have been received yet (simply
5082 * not at that stage of connection opening), so Receive buffer is empty. */
5083 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5084 "has successfully returned no bytes because still not fully connected.");
5085 err_code->clear();
5086 return 0;
5087 }
5088 // else if (state is CONNECTED or DISCONNECTING)
5089
5090 /* We're CONNECTED or DISCONNECTING but could get no bytes. Let's examine each state.
5091 *
5092 * - CONNECTED: Either they provided a zero-sized target buffer (in which case
5093 * !no_bytes_available), or the Receive buffer is simply empty. Thus this is either -1a- or
5094 * -1c- (no_bytes_available determines which).
5095 *
5096 * - DISCONNECTING: Either:
5097 * - the initial block was too large for the max_data_size they provided in their receive()
5098 * call (in which case !no_bytes_available); or
5099 * - they called close_final() (@todo not yet implemented) and thus the Receive buffer was
5100 * cleared at that time, and all incoming data were ignored after that; thus the Receive
5101 * buffer is empty, but a graceful close is still in progress; or
5102 * - they did not call close_final(), but there is a graceful close in progress, and the
5103 * Receive buffer is simply empty.
5104 * Thus this is either -1a- or -1c-. */
5105
5106 if (!no_bytes_available)
5107 {
5108 // This is case -1c-.
5109 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5110 "has data to return, but the provided buffer size is too small.");
5111 err_code->clear();
5112 return 0;
5113 }
5114 // else if (no_bytes_available)
5115
5116 // This is case -1a-.
5117 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5118 "returning no data because Receive buffer empty.");
5119
5120 err_code->clear();
5121
5122 /* @todo Sigh. There's more. Yes, in some situations we can return 0/success here. In other
5123 * situations, though, we should return 0/<Error_code for graceful close> here. The latter
5124 * case would be in the situations where we know no data is coming, or user has said he doesn't
5125 * care about any more data:
5126 *
5127 * -1- A graceful close was initiated by the OTHER side. (Therefore no data could be coming to
5128 * save into Receive buffer.)
5129 * -2- Only we initiated the graceful close, but it was via close_final(), i.e., user is not
5130 * interested in any incoming data anymore. (Therefore we'll always just ignore any
5131 * incoming DATA and not put it into Receive buffer.)
5132 * -3- Only we initiated the graceful close, and it was via close_start() (i.e., user cares
5133 * about further incoming data); however, the final handshake has reached a state in which
5134 * further data cannot be incoming. (Therefore no data could be coming to save into Receive
5135 * buffer.)
5136 *
5137 * I am not writing code for this logic at this time. The implementations depends on how
5138 * exactly our graceful close works. This entire method, right now, is dead code, since there is
5139 * no graceful close, but I wrote it anyway to provide a skeleton for the future, since I
5140 * already thought about it. However it would be unreasonable to implement the above logic in the
5141 * absence of graceful close in the first place, skeleton or not. Therefore, dead code or not, I
5142 * do the "conservative" thing: return 0/success even in the above situations. Eventually the
5143 * graceful close will complete, at which point we'll return an error anyway, so the user won't be
5144 * left uninformed forever (worst case: the close will time out).
5145 *
5146 * For when we do implement the above logic, some thoughts: Detecting the situation in thread U
5147 * != W may be difficult and may introduce complex synchronization issues. One way
5148 * to do it might be to introduce synchronized bool Peer_socket::m_no_more_rcv_data, which
5149 * starts at false and can become true (but not false again). This member would be set to true,
5150 * by thread W, if and only if one of the above situations is detected by thread W. Then here
5151 * we'd check it, and if it's true, return error; otherwise return success.
5152 *
5153 * IMPORTANT: The logic here must be consistent with sock_is_readable(). */
5154 return 0;
5155} // Node::receive()
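/* Illustrative sketch (standalone example; not part of Flow): the receive()-side pattern above --
 * dequeue into the user's buffer while holding the socket lock, then hand the rcv_wnd bookkeeping
 * off to the worker thread, since only that thread may touch the relevant state. The function
 * and parameter names are hypothetical. */
#include <boost/asio.hpp>
#include <cstddef>
#include <functional>

inline std::size_t consume_and_notify(boost::asio::io_context& worker_ctx,
                                      const std::function<std::size_t ()>& consume_locked,
                                      const std::function<void ()>& wnd_updated_on_worker)
{
  const std::size_t n = consume_locked(); // Caller holds the socket lock, as in Node::receive().
  if (n != 0)
  {
    // Free Receive-buffer space grew; let the worker thread decide whether to advertise it.
    boost::asio::post(worker_ctx, wnd_updated_on_worker);
  }
  return n;
}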
5156
5157bool Node::sock_is_readable(const boost::any& sock_as_any) const
5158{
5159 using boost::any_cast;
5160
5161 const Peer_socket::Const_ptr sock = any_cast<Peer_socket::Ptr>(sock_as_any);
5162
5163 Peer_socket::Lock_guard lock(sock->m_mutex); // Many threads can access/write below state.
5164
5165 /* Our task here is to return true if and only if at this very moment calling sock->receive(),
5166 * assuming sufficient user buffer space, would yield either a return value of > 0 OR a
5167 * non-success *err_code. In other words, receive() would return "something." This is used for
5168 * Event_set machinery.
5169 *
5170 * This should mirror receive()'s algorithm. @todo Should receive() call this, for code reuse?
5171 * Maybe/maybe not. Consider performance when deciding.
5172 *
5173 * - If state is CLOSED, then some sort of error/terminating condition occurred, so receive()
5174 * would return 0 and non-success Error_code == sock->m_disconnect_cause. (Readable.)
5175 * - Otherwise, if Receive buffer can be dequeued, receive() would return > 0. (Readable.)
5176 * - Otherwise, if Receive buffer cannot be dequeued, receive() would return 0 and no error. (Not
5177 * Readable.) Note that Receive buffer is guaranteed to be clear when entering non-Readable
5178 * non-error states (OPEN+CONNECTING, OPEN+DISCONNECTING).
5179 *
5180 * @todo Once we implement graceful close, there will be situations where Receive buffer is empty, state is
5181 * OPEN+DISCONNECTING, m_disconnect_cause = <cause of disconnect>, and we should return true (Readable)
5182 * here (only when we also know that no future Receive traffic possible). See receive(). */
5183
5184 return (sock->m_state == Peer_socket::State::S_CLOSED) || rcv_buf_deqable(sock);
5185} // Node::sock_is_readable()
5186
5187void Node::receive_wnd_updated(Peer_socket::Ptr sock)
5188{
5189 // We are in thread W.
5190
5191 /* rcv_wnd (free Receive buffer space) is sent to other side opportunistically in ACKs. While
5192 * sender is sending data, they will have a good idea of our rcv_wnd as well. Is that (in a
5193 * one-way-traffic situation) sufficient however? If the sender is not sending data, because the
5194 * application on the sender doesn't provide more data to send, then the discussion is moot.
5195 * What if the sender is not sending data, because we have told it rcv_wnd is 0 (meaning our
5196 * Receive buffer is full)? This can and will happen. For example suppose our application layer
5197 * simply stops reading from Receive buffer for a while, resulting in rcv_wnd 0 sent in one of the
5198 * ACKs. Now sender knows rcv_wnd is 0. Now suppose our application reads off the entire Receive
5199 * buffer. rcv_wnd is now 100%, but since sender is not sending (because it thinks rcv_wnd is
5200 * still 0), there will be no ACKs onto which to add rcv_wnd. Thus the traffic completely stops.
5201 *
5202 * Original RFC 793 (as well as RFC 1122) suggests TCP sender should deal with this by "probing"
5203 * with 1-byte (I think) data segments sent regularly (every RTO; our DTO) in order to trigger
5204 * ACKs, which would eventually expose the non-zero rcv_wnd. To me this seems to have the
5205 * disadvantage of complexity and implications on how we packetize data (especially since in
5206 * unreliable mode we're not supposed to break up contiguous blocks of max-block-size bytes).
5207 * Also it is not as responsive as it could be. Consider that the most common scenario in
5208 * high-speed downloads is that the Receive buffer is exceeded only momentarily (due to thread
5209 * contention on receiver or something) but is then quickly emptied (once the thread contention is
5210 * resolved). In case that happens in a fraction of a second, having the probe occur a DTO later
5211 * wastes a long time. Instead the RECEIVER could take initiative and send an empty ACK with a
5212 * rcv_wnd update. When should it do this? A naive answer would be to do it simply EVERY time
5213 * free Receive buffer space increases. However that'd be terrible, as in a typical scenario
5214 * (where lots of bytes arrive, while user reads off lots of bytes due to them becoming available
5215 * to read) it would explode the number of ACKs. Even in the "sender has stopped due to
5216 * rcv_wnd=0" situation, this would result in a ton of ACKs. Plus it would cause sender to start
5217 * recovering with quite small windows which is inefficient. So the less naive way is to send the
5218 * ACK of our volition if free buffer space has increased by some % of its max capacity (like
5219 * 50%).
5220 *
5221 * This would certainly solve aforementioned situation where Receive buffer fills up momentarily
5222 * but then is quickly cleared. A fraction of a second later, the free space will have increased
5223 * by over 50%, an ACK would go to sender, and sender would work with a nice large rcv_wnd.
5224 * However, if the Receiver only reads off 49% of the data and then stops, traffic would remain
5225 * stuck (even though 49% of the buffer is available). This is where the sender-side probing
5226 * would solve it (slowly); though sender-side unsolicited ACKing on a timer would also do. I
5227 * leave that as a @todo; probably important in a widely-used net_flow; but even without it, this should be
5228 * sufficient for the initial intended purpose of net_flow. In that use scenario, we count on the
5229 * receiver code to be well behaved and read from Receive buffer as soon as the computer lets it.
5230 *
5231 * With that settled, there is one more concern. This is intuitively clear but is also mentioned
5232 * in RFC 1122-4.2.2.17. Suppose the receiver-initiated ACK after 50% of buffer is cleared is
5233 * dropped by the network. ACKs are not reliable (there are no ACKs of ACKs), so then we're back
5234 * in no-more-traffic-forever land. To solve this, I implement this scheme: Having sent that ACK,
5235 * start a timer and then send it again periodically, until some long time period (something like
5236 * a minute) expires (just in case) OR we get a new DATA packet from the sender. In the latter
5237 * case we're back in business, as it implies sender got our window update. Note that this
5238 * mechanism is not necessary any longer, once we implement sender-side probing as explained
5239 * above. */
5240
5241 // As always, no need to lock m_state, etc., unless we plan to alter them, since no other thread can alter them.
5242
5243 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
5244 {
5245 /* Yes, they emptied Receive buffer. However, we haven't finished the graceful close.
5246 * Therefore -- even though one more barrier to reaching m_state == S_CLOSED has been removed --
5247 * there's nothing further to do at this time. In fact, in certain situations we might even
5248 * get more data onto the Receive buffer! @todo No graceful close yet. */
5249 FLOW_LOG_INFO('[' << sock << "] Receive buffer space freed, "
5250 "but state is now [" << sock->m_int_state << "]; ignoring.");
5251 return;
5252 }
5253 // else if (m_int_state == S_ESTABLISHED)
5254
5255 if (sock->m_rcv_in_rcv_wnd_recovery)
5256 {
5257 /* We have already sent the unsolicited ACK and are currently in the phase where we're
5258 * periodically sending more, until we get some DATA from sender or a long period of time
5259 * passes. Even if we've freed yet another large chunk of the buffer since the last ACK, do
5260 * not start again... just let it continue. */
5261 FLOW_LOG_TRACE('[' << sock << "] Receive buffer space freed, but "
5262 "we are already in rcv_wnd recovery mode. Nothing to do.");
5263 return;
5264 }
5265 // else
5266
5267 // Grab available Receive buffer space.
5268 const size_t rcv_wnd = sock_rcv_wnd(sock);
5269 // @todo That was a ~copy/paste of Node::async_low_lvl_ack_send(). Add code reuse.
5270
5271 const size_t& last_rcv_wnd = sock->m_rcv_last_sent_rcv_wnd;
5272
5273 if (rcv_wnd <= last_rcv_wnd)
5274 {
5275 /* This seems odd, but one can imagine more data arriving between when we were placed onto W's
5276 * task queue and when we executed. So it's not that odd and not worth INFO or WARNING. */
5277 FLOW_LOG_TRACE('[' << sock << "] Receive buffer space freed, but "
5278 "free space [" << sock->bytes_blocks_str(rcv_wnd) << "] <= prev "
5279 "free space [" << sock->bytes_blocks_str(last_rcv_wnd) << "]. Nothing to do.");
5280 return;
5281 }
5282 // else
5283
5284 const size_t diff = rcv_wnd - last_rcv_wnd;
5285 const unsigned int pct = sock->opt(sock->m_opts.m_st_rcv_buf_max_size_to_advertise_percent);
5286 const size_t max_rcv_buf_size = sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size);
5287 const size_t min_inc = max_rcv_buf_size * pct / 100;
5288
5289 if (diff < min_inc)
5290 {
5291 // Not big enough increase; wait until more space is freed before informing other side.
5292 FLOW_LOG_TRACE('[' << sock << "] Receive buffer space "
5293 "freed is [" << sock->bytes_blocks_str(diff) << "] since last advertisement; "
5294 "< threshold [" << pct << "%] x "
5295 "[" << sock->bytes_blocks_str(max_rcv_buf_size) << "] = "
5296 "[" << sock->bytes_blocks_str(min_inc) << "]. Not advertising rcv_wnd yet.");
5297 return;
5298 }
5299 // else cool. Let's advertise it.
5300
5301 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5302 FLOW_LOG_INFO('[' << sock << "] Receive buffer space "
5303 "freed is [" << sock->bytes_blocks_str(diff) << "] since last advertisement; "
5304 "rcv_wnd = [" << sock->bytes_blocks_str(rcv_wnd) << "]; "
5305 ">= threshold [" << pct << "%] x "
5306 "[" << sock->bytes_blocks_str(max_rcv_buf_size) << "] = "
5307 "[" << sock->bytes_blocks_str(min_inc) << "]. Sending unsolicited rcv_wnd-advertising ACK "
5308 "and entering rcv_wnd recovery.");
5309
5310 // Prevent any further shenanigans (see above), until we exit this mode.
5311 sock->m_rcv_in_rcv_wnd_recovery = true;
5312 // Mark this down, so that we exit this mode eventually.
5313 sock->m_rcv_wnd_recovery_start_time = Fine_clock::now();
5314
5315 // Record we started the mode.
5316 sock->m_rcv_stats.rcv_wnd_recovery_event_start();
5317
5318 async_rcv_wnd_recovery(sock, rcv_wnd);
5319} // Node::receive_wnd_updated()
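/* Illustrative sketch (standalone example; not part of Flow): the "advertise only if free space
 * grew by at least N% of capacity" rule applied above, with hypothetical inputs and name. */
#include <cstddef>

inline bool should_advertise_rcv_wnd(std::size_t rcv_wnd, std::size_t last_advertised_rcv_wnd,
                                     std::size_t max_rcv_buf_size, unsigned int threshold_percent)
{
  if (rcv_wnd <= last_advertised_rcv_wnd)
  {
    return false; // Free space did not grow; nothing to report.
  }
  const std::size_t min_increase = max_rcv_buf_size * threshold_percent / 100;
  return (rcv_wnd - last_advertised_rcv_wnd) >= min_increase;
}
// E.g., with a 64 KiB buffer and a 50% threshold, advertise once free space grows by >= 32 KiB.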
5320
5321void Node::async_rcv_wnd_recovery(Peer_socket::Ptr sock, size_t rcv_wnd)
5322{
5323 using boost::chrono::milliseconds;
5324 using boost::chrono::round;
5325 using boost::weak_ptr;
5326
5327 // We are in thread W.
5328
5329 // As discussed in Node::receive_wnd_updated(), send the ACK and then periodically re-send it until canceled.
5330
5331 // Create an ACK with no packets acknowledged (so just a window update) and send it off.
5332 auto ack = Low_lvl_packet::create_uninit_packet<Ack_packet>(get_logger());
5333 ack->m_rcv_wnd = rcv_wnd;
5334 // Record that it was advertised!
5335 sock->m_rcv_last_sent_rcv_wnd = rcv_wnd;
5336
5338
5339 // Register one ACK packet we will send ASAP (and that it acknowledged no individual packets).
5340 sock->m_rcv_stats.sent_low_lvl_ack_packet(true);
5341
5342 // ACK queued to send soon. Now, as discussed, protect against it being lost by scheduling a timer.
5343
5344 const Fine_duration fire_when_from_now = sock->opt(sock->m_opts.m_dyn_rcv_wnd_recovery_timer_period);
5345
5346 FLOW_LOG_INFO("Setting timer to fire "
5347 "[" << round<milliseconds>(fire_when_from_now) << "] from now.");
5348
5349 /* As usual, when scheduling a thing we can use the much simpler util::schedule_task_*() API; or the
5350 * full-featured boost.asio Timer. We don't need the advanced features; so the only possible reason
5351 * to go with Timer would be the perf considerations (see schedule_task_from_now() doc header for discussion).
5352 * It is emphatically NOT the case that lots of these tasks are scheduled/fired/canceled per unit time;
5353 * e.g., we see it as rare enough to be OK with an INFO log message. Hence no need to reuse a Timer repeatedly,
5354 * so use the simple API. */
5355
5356 sock->m_rcv_wnd_recovery_scheduled_task
5357 = schedule_task_from_now(get_logger(), fire_when_from_now, true, &m_task_engine,
5358 [this, sock_observer = weak_ptr<Peer_socket>(sock)](bool)
5359 {
5360 // We are in thread W.
5361
5362 auto sock = sock_observer.lock();
5363 if (!sock)
5364 {
5365 return; // Possible or not, allow for this possibility for maintainability.
5366 }
5367 // else
5368
5369 const Fine_duration since_recovery_started = Fine_clock::now() - sock->m_rcv_wnd_recovery_start_time;
5370 if (since_recovery_started > sock->opt(sock->m_opts.m_dyn_rcv_wnd_recovery_max_period))
5371 {
5372 // We've kept ACKing for a long time, and still no data. Give up: it's all up to the sender now.
5373
5374 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5375 FLOW_LOG_INFO('[' << sock << "]: still no new DATA arrived since last rcv_wnd advertisement; "
5376 "Time since entering recovery [" << round<milliseconds>(since_recovery_started) << "] expired. "
5377 "Ending rcv_wnd recovery.");
5378 sock->m_rcv_in_rcv_wnd_recovery = false;
5379
5380 // Record we ended in timeout.
5381 sock->m_rcv_stats.rcv_wnd_recovery_event_finish(false);
5382
5383 return;
5384 }
5385 // else
5386
5387 // Still in rcv_wnd recovery. Send another unsolicited ACK (as in receive_wnd_updated()).
5388
5389 // Re-grab available Receive buffer space.
5390 const size_t rcv_wnd = sock_rcv_wnd(sock);
5391
5392 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5393 FLOW_LOG_INFO('[' << sock << "]: still no new DATA arrived since last rcv_wnd advertisement; "
5394 "rcv_wnd = [" << sock->bytes_blocks_str(rcv_wnd) << "]; "
5395 "time since entering recovery [" << round<milliseconds>(since_recovery_started) << "]. "
5396 "Sending unsolicited rcv_wnd-advertising ACK and continuing rcv_wnd recovery.");
5397
5398 async_rcv_wnd_recovery(sock, rcv_wnd);
5399 }); // on-scheduled-task-fired
5400} // Node::async_rcv_wnd_recovery()
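/* Illustrative sketch (standalone example; not part of Flow): the recovery loop above -- re-send
 * an advertisement periodically until new data arrives (caller cancels the timer) or an overall
 * deadline passes. A plain steady_timer plus a weak_ptr means a destroyed session simply stops
 * the loop. All names are hypothetical; the session is assumed to be owned by a shared_ptr
 * (e.g., created via make_shared), as weak_from_this() requires. */
#include <boost/asio.hpp>
#include <chrono>
#include <memory>

struct Recovery_session : std::enable_shared_from_this<Recovery_session>
{
  boost::asio::steady_timer m_timer;
  std::chrono::steady_clock::time_point m_started = std::chrono::steady_clock::now();

  explicit Recovery_session(boost::asio::io_context& io) : m_timer(io) {}

  void send_advertisement() { /* ...send the unsolicited window-update ACK... */ }

  void start(std::chrono::milliseconds period, std::chrono::milliseconds max_total)
  {
    send_advertisement();
    m_timer.expires_after(period);
    m_timer.async_wait([weak_self = weak_from_this(), period, max_total]
                       (const boost::system::error_code& err)
    {
      const auto self = weak_self.lock();
      if (err || !self)
      {
        return; // Canceled (e.g., new DATA arrived) or session gone: stop quietly.
      }
      if (std::chrono::steady_clock::now() - self->m_started > max_total)
      {
        return; // Gave up; it is all up to the sender now.
      }
      self->start(period, max_total); // Re-advertise and re-arm.
    });
  }
};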
5401
5403{
5404 using boost::chrono::milliseconds;
5405 using boost::chrono::round;
5407
5408 // We are in thread W.
5409
5410 // We got some good DATA. If we were sending unsolicited window update ACKs, we can now stop.
5411
5412 if (!sock->m_rcv_in_rcv_wnd_recovery)
5413 {
5414 // We weren't.
5415 return;
5416 }
5417 // else
5418
5419 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5420 FLOW_LOG_INFO('[' << sock << "]: Canceling rcv_wnd recovery; "
5421 "Time since entering recovery "
5422 "[" << round<milliseconds>(Fine_clock::now() - sock->m_rcv_wnd_recovery_start_time) << "].");
5423
5424 sock->m_rcv_in_rcv_wnd_recovery = false;
5425#ifndef NDEBUG
5426 const bool canceled =
5427#endif
5428 scheduled_task_cancel(get_logger(), sock->m_rcv_wnd_recovery_scheduled_task);
5429 assert(canceled);
5430
5431 // Record we ended in success.
5432 sock->m_rcv_stats.rcv_wnd_recovery_event_finish(true);
5433}
5434
5436{
5437 using std::numeric_limits;
5438
5439 // We are in thread W.
5440
5441 if (!sock->opt(sock->m_opts.m_st_rcv_flow_control_on))
5442 {
5443 /* Flow control disabled, so if we always advertise the same huge value, the other side will
5444 * never stop sending due to rcv_wnd. On this side, we won't activate rcv_wnd recovery, because
5445 * the "last advertised" window will always equal the current window. */
5446 return numeric_limits<size_t>::max();
5447 }
5448 // else
5449
5450 // Grab available Receive buffer space. We have to momentarily lock sock due to access to sock->m_rcv_buf.
5451 size_t rcv_buf_size;
5452 {
5453 Peer_socket::Lock_guard lock(sock->m_mutex);
5454 rcv_buf_size = sock->m_rcv_buf.data_size();
5455 }
5456
5457 // Add the reassembly queue cumulative stored data size. Why? See sock_data_to_reassembly_q_unless_overflow().
5458 if (sock->rexmit_on())
5459 {
5460 rcv_buf_size += sock->m_rcv_reassembly_q_data_size; // (At least one reason we must be in thread W.)
5461 }
5462
5463 const size_t max_rcv_buf_size = sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size);
5464
5465 return (max_rcv_buf_size > rcv_buf_size) ? (max_rcv_buf_size - rcv_buf_size) : 0;
5466}
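/* Illustrative sketch (standalone example; not part of Flow): the free-space computation above --
 * advertised window = capacity minus what is already buffered (including, when retransmission is
 * on, data parked in the reassembly queue), never negative. The name is hypothetical. */
#include <cstddef>

inline std::size_t free_receive_window(std::size_t max_rcv_buf_size, std::size_t rcv_buf_size,
                                       std::size_t reassembly_q_size, bool rexmit_on)
{
  const std::size_t used = rcv_buf_size + (rexmit_on ? reassembly_q_size : 0);
  return (max_rcv_buf_size > used) ? (max_rcv_buf_size - used) : 0;
}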
5467
5468void Node::receive_emptied_rcv_buf_while_disconnecting(Peer_socket::Ptr sock)
5469{
5470 // We are in thread W.
5471
5472 /* As always, no need to lock m_state, etc., unless we plan to alter them, since no other thread can alter them.
5473 * ...On the other hand, we are going to be checking m_rcv_buf for emptiness below, and if it's not empty,
5474 * a user thread U != W may be altering it right now by consuming it. So, lock.
5475 *
5476 * Could think about locking later in this function, but this is called so rarely I'd rather not have to
5477 * worry about whether it's OK to do that and just not. */
5478 Peer_socket::Lock_guard lock(sock->m_mutex);
5479
5480 if (sock->m_state == Peer_socket::State::S_CLOSED)
5481 {
5482 /* When we were placed onto thread W, state was S_OPEN+S_DISCONNECTING, but before boost.asio
5483 * could execute us, it executed another handler which already moved us to S_CLOSED for
5484 * whatever reason (there are many valid ones). So just don't do anything, as we no longer
5485 * apply. It's kind of interesting, so log INFO message. */
5486 FLOW_LOG_INFO('[' << sock << "] "
5487 "was completely closed before asynchronous "
5488 "receive_emptied_rcv_buf_while_disconnecting() could proceed.");
5489 return;
5490 }
5491 // else
5492
5493 // Sanity-check (we cannot be called until there's a graceful close underway).
5494 assert((sock->m_state == Peer_socket::State::S_OPEN) &&
5495 (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING));
5496
5497 const Socket_id socket_id = Node::socket_id(sock);
5498
5499 if (sock->m_int_state != Peer_socket::Int_state::S_CLOSED)
5500 {
5501 /* Yes, they emptied Receive buffer. However, we haven't finished the graceful close.
5502 * Therefore -- even though one more barrier to reaching m_state == S_CLOSED has been removed --
5503 * there's nothing further to do at this time. In fact, in certain situations we might even
5504 * get more data onto the Receive buffer! @todo No graceful close yet. */
5505 FLOW_LOG_TRACE('[' << sock << "] "
5506 "is gracefully closing, and Receive buffer is empty, but graceful close itself not yet finished.");
5507 return;
5508 }
5509 // else if (m_int_state == S_CLOSED)
5510
5511 // Ensure Receive buffer is indeed still empty. (Can still get data while gracefully closing.)
5512 if (!sock->m_rcv_buf.empty())
5513 {
5514 /* Some data arrived between the time we were placed on thread W and boost.asio executing us.
5515 * So we can't do anything; user has to receive() the stuff first, which should call us again. */
5516 FLOW_LOG_TRACE('[' << sock << "] "
5517 "is gracefully closing, but Receive buffer has data again.");
5518 return;
5519 }
5520 // else if (m_int_state == S_CLOSED, and m_rcv_buf is empty)
5521
5522 // Yes, the transport layer final handshake is finished. Since Receive buffer now empty, no more barriers remain.
5523 FLOW_LOG_TRACE('[' << sock << "] "
5524 "is gracefully closing, and Receive buffer is now empty. Ready to permanently close.");
5525 close_connection_immediately(socket_id, sock,
5526 Error_code(), /* err_code == success indicates clean close here. */
5527 false);
5528 /* ^-- defer_delta_check == false: for similar reason as when calling send_worker() from
5529 * send_worker_check_state(). */
5530} // Node::receive_emptied_rcv_buf_while_disconnecting()
5531
5532void Node::close_abruptly(Peer_socket::Ptr sock, Error_code* err_code)
5533{
5534 using boost::adopt_lock;
5537
5538 /* We are in user thread U != W.
5539 * It's important to keep that in mind in this method. In particular, it is absolutely unsafe to
5540 * access m_int_state, which belongs solely to thread W and is never locked. */
5541
5542 {
5543 /* WARNING!!! sock->m_mutex is locked, but WE must unlock it before returning! Can't leave that
5544 * to the caller, because we must unlock at a specific point below, right before post()ing
5545 * close_abruptly_worker() onto thread W. Use a Lock_guard that adopts an
5546 * already-locked mutex. */
5547 Peer_socket::Lock_guard lock(sock->m_mutex, adopt_lock);
5548
5549 if (!running())
5550 {
5552 return;
5553 }
5554 // else
5555
5556 // Pre-condition.
5557 assert(sock->m_state == Peer_socket::State::S_OPEN);
5558
5559 /* Put the rest of the work into thread W. For justification, see big comment in listen().
5560 * Addendum regarding performance: close_abruptly() is probably called more frequently than
5561 * listen(), but I doubt the performance impact is serious even so. send() and receive() might be
5562 * a different story. */
5563
5564 // We're done -- must unlock so that thread W can do what it wants to with sock.
5565 } // lock
5566
5567 // Load this onto thread W boost.asio work queue. We don't return until it runs, so [&].
5568 asio_exec_ctx_post(get_logger(), &m_task_engine, Synchronicity::S_ASYNC_AND_AWAIT_CONCURRENT_COMPLETION, [&]()
5569 {
5570 // We are in thread W. Thread U is waiting for us to do our stuff and return.
5571
5572 /* Since we were placed onto thread W, another handler may have been executed before boost.asio
5573 * got to us. Therefore we may already be S_CLOSED. Detect this. */
5574
5575 if (sock->m_state == Peer_socket::State::S_CLOSED) // No need to lock: only W can write to this.
5576 {
5577 // Yep, already closed. sock->m_disconnect_cause is already set to closure reason. Done.
5578 *err_code = sock->m_disconnect_cause;
5579 return;
5580 }
5581 // else
5582
5583 /* Cool, we're not quite closed yet. We could be connecting... or connected... or even in the
5584 * middle of graceful close (@todo that's not yet implemented). Any of those situations allow
5585 * close_abruptly(), just as (indeed because of the fact that) any of those situations allow
5586 * close_connection_immediately() (..., error::...).
5587 *
5588 * Therefore simply do the following. Pre-conditions hold: sock is in m_socks and is S_OPEN
5589 * (because not S_CLOSED); 3rd arg contains failure reason. */
5591 /* ^-- defer_delta_check == false: for similar reason as when calling send_worker() from
5592 * send_worker_check_state(). */
5593
5594 // That set sock->m_disconnect_cause. Closure successful. Done.
5595 err_code->clear(); // Success.
5596 }); // asio_exec_ctx_post()
5597 // If got here, the task has completed in thread W and signaled us to that effect.
5598} // Node::close_abruptly()
5599
5600void Node::close_connection_immediately(const Socket_id& socket_id, Peer_socket::Ptr sock,
5601 const Error_code& err_code, bool defer_delta_check)
5602{
5603 using boost::lexical_cast;
5604 using std::string;
5605
5606 // We are in thread W.
5607
5608 // @todo OK if a graceful close (S_OPEN+S_DISCONNECTING) is already in progress? Below provides for it, but ensure.
5609 assert(sock->m_state == Peer_socket::State::S_OPEN);
5610
5611 if (err_code)
5612 {
5613 FLOW_ERROR_LOG_ERROR(err_code);
5614 FLOW_LOG_INFO("Closing and destroying [" << sock << "] abruptly.");
5615 }
5616 else
5617 {
5618 // m_disconnect_cause has already been set and logged.
5619 FLOW_LOG_INFO("Closing and destroying [" << sock << "] after graceful close.");
5620 }
5621 // Log final state report.
5622 sock_log_detail(sock);
5623
5624 /* Thread safety: we're in thread W, so no need to lock things by default (as most resources can
5625 * also only be accessed from thread W). Exceptions are certain data members in Peer_socket
5626 * sock and Server_socket serv that may have originated it (if it was a passive open). I will
5627 * comment on the locking situation for those data members as they come up in the code. */
5628
5629 // First, set various state in *sock (including emptying Send and Receive buffers and setting m_node = 0).
5630
5631 /* Save the final set of stats for Peer_socket::info(), as the source data will probably get
5632 * purged just below in sock_disconnect_*(). */
5633 sock_load_info_struct(sock, &sock->m_info_on_close);
5634 // We may have to massage it a little more, because some info is set below, by when it's too late.
5635
5636 if (err_code)
5637 {
5638 // sock->m_disconnect_cause has not yet been set; so sock_load_info_struct() did not copy it properly yet. Do so.
5639 sock->m_info_on_close.m_disconnect_cause = err_code;
5640 // Similarly:
5641 sock->m_info_on_close.m_int_state_str = lexical_cast<string>(Peer_socket::Int_state::S_CLOSED);
5642
5643 /* This is an abrupt close. This can be called in any situation once sock is in m_socks. It's
5644 * our responsibility to move directly to transport layer state S_CLOSED and user state
5645 * S_CLOSED. */
5646 sock_set_int_state(sock, Peer_socket::Int_state::S_CLOSED); // Thread W access only; no need to lock.
5647 // Sets S_CLOSED public state (and related data, including m_disconnect_cause). Locked inside.
5648 sock_disconnect_detected(sock, err_code, true);
5649 }
5650 else
5651 {
5652 /* We are in a graceful close and have reached the final stage of it (connection entirely
5653 * closed without having to abruptly close; buffers emptied gracefully by user and/or Node).
5654 * Therefore m_int_state is already S_CLOSED (method pre-condition), so
5655 * we just complete the user-visible state change. */
5656
5657 assert(sock->m_int_state == Peer_socket::Int_state::S_CLOSED); // Thread W access only; no need to lock.
5658 sock_disconnect_completed(sock); // Sets S_CLOSED public state (and related data). Locked inside.
5659 }
5660
5661 // Next, remove sock from our main socket list.
5662
5663#ifndef NDEBUG
5664 const auto erased = 1 ==
5665#endif
5666 m_socks.erase(socket_id);
5667 assert(erased); // S_OPEN => it's in m_socks. Otherwise there's a serious bug somewhere.
5668
5669 // Next, if this potentially is an unaccepted connection, delete it from the corresponding server socket.
5670 if (!sock->m_active_connect)
5671 {
5672 /* What is that Server_socket though? Well, it's in sock->m_originating_serv... but that data
5673 * member can be accessed from a non-W thread, so we'd have to lock it. But the mutex that
5674 * protects it is in *m_originating_serv itself! So it's a chicken/egg problem. However, we
5675 * can find that Server_socket (if it applies to sock) another way: through the port. Its port
5676 * must be the same as local_port. If such a Server_socket exists, cool; and if sock is
5677 * tracked inside it, cool. Otherwise we needn't do anything. */
5678 Port_to_server_map::const_iterator port_to_server_it = m_servs.find(sock->m_local_port);
5679 if (port_to_server_it != m_servs.end()) // Server at same port number exists. Not necessarily our guy though.
5680 {
5681 // If it is our guy, delete us from him.
5682 Server_socket::Ptr serv = port_to_server_it->second;
5683 serv_peer_socket_closed(serv, sock); // Thread-safe (in particular with respect to simultaneous serv->accept()).
5684 }
5685 }
5686
5687 // sock now should not be (directly or indirectly) referenced in any Node data structures.
5688
5689 // Cancel any timers.
5690 cancel_timers(sock);
5691
5692 /* Return the port -- but only if it is an active open. If it's a passive open the port is
5693 * still reserved for the server socket. */
5694 if (sock->m_active_connect)
5695 {
5696 Error_code return_err_code;
5697 m_ports.return_port(sock->m_local_port, &return_err_code);
5698 assert(!return_err_code);
5699 }
5700
5701 /* sock has changed to CLOSED state. Performing sock->receive() or sock->write() would therefore
5702 * certainly return an error. Returning an error from those methods (as opposed to 0 but no
5703 * error) is considered Readable and Writable, respectively (as we want to alert the user to the
5704 * error, so her wait [if any] wakes up and notices the error). Therefore we should soon inform
5705 * anyone waiting on any Event_sets for sock to become Readable or Writable.
5706 *
5707 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
5708 * code. */
5709
5710 // Accumulate the event into the Node store (note: not any Event_set yet).
5711 const bool inserted_rd = m_sock_events[Event_set::Event_type::S_PEER_SOCKET_READABLE].insert(sock).second;
5712 const bool inserted_wr = m_sock_events[Event_set::Event_type::S_PEER_SOCKET_WRITABLE].insert(sock).second;
5713 if (inserted_rd || inserted_wr) // Must always perform both insert()s, hence the use of the 2 variables.
5714 {
5715 // Possibly inform the user for any applicable Event_sets right now.
5716 event_set_all_check_delta(defer_delta_check);
5717 }
5718} // Node::close_connection_immediately()
5719
5720void Node::rst_and_close_connection_immediately(const Socket_id& socket_id, Peer_socket::Ptr sock,
5721 const Error_code& err_code, bool defer_delta_check)
5722{
5723 // We are in thread W.
5724 async_sock_low_lvl_rst_send(sock);
5725 close_connection_immediately(socket_id, sock, err_code, defer_delta_check);
5726}
5727
5728Syn_packet::Ptr Node::create_syn(Peer_socket::Const_ptr sock)
5729{
5730 using util::Blob;
5731
5732 auto syn = Low_lvl_packet::create_uninit_packet<Syn_packet>(get_logger());
5733 // Initial Sequence Number.
5734 syn->m_init_seq_num = sock->m_snd_init_seq_num;
5735 /* Send serialized version of arbitrary user data, which user can deserialize on the other side
5736 * after accepting connection.
5737 * Add const to express we require a copy, not move. */
5738 syn->m_serialized_metadata = static_cast<const Blob&>(sock->m_serialized_metadata);
5739
5740 return syn;
5741}
5742
5743Syn_ack_packet::Ptr Node::create_syn_ack(Peer_socket::Const_ptr sock)
5744{
5745 auto syn_ack = Low_lvl_packet::create_uninit_packet<Syn_ack_packet>(get_logger());
5746 // Initial Sequence Number (the start of our own series).
5747 syn_ack->m_init_seq_num = sock->m_snd_init_seq_num;
5748 // Random security token.
5749 syn_ack->m_packed.m_security_token = sock->m_security_token;
5750 // Advertise initial rcv_wnd.
5751 syn_ack->m_packed.m_rcv_wnd = sock->m_rcv_last_sent_rcv_wnd;
5752
5753 return syn_ack;
5754}
5755
5756void Node::async_low_lvl_syn_ack_ack_send(const Peer_socket::Ptr& sock,
5757 boost::shared_ptr<const Syn_ack_packet>& syn_ack)
5758{
5759 // Make a packet.
5760 auto syn_ack_ack = Low_lvl_packet::create_uninit_packet<Syn_ack_ack_packet>(get_logger());
5761 // No sequence number (not the initial SYN; not data).
5762 // Security token: give it back to them (they will verify).
5763 syn_ack_ack->m_packed.m_security_token = syn_ack->m_packed.m_security_token;
5764 // Initial receive window is probably the entire, ~empty Receive buffer. Save the advertised rcv_wnd as promised.
5765 syn_ack_ack->m_packed.m_rcv_wnd = sock->m_rcv_last_sent_rcv_wnd = sock_rcv_wnd(sock);
5766
5767 // Fill out common fields and asynchronously send packet.
5769}
5770
5771void Node::async_low_lvl_ack_send(Peer_socket::Ptr sock, const Error_code& sys_err_code)
5772{
5773 using boost::chrono::milliseconds;
5774 using boost::chrono::duration_cast;
5775 using std::make_pair;
5776 using std::vector;
5777 using std::numeric_limits;
5778
5779 // We are in thread W.
5780
5781 // Handle the timer-related corner cases (if we were invoked by m_rcv_delayed_ack_timer triggering).
5782
5783 // For brevity and speed:
5784 vector<Peer_socket::Individual_ack::Ptr>& pending_acks = sock->m_rcv_pending_acks;
5785
5786 if (sys_err_code == boost::asio::error::operation_aborted)
5787 {
5788 FLOW_LOG_TRACE("Delayed [ACK] timer [" << sock << "] canceled; "
5789 "pending acknowledgment count [" << pending_acks.size() << "].");
5790 return;
5791 }
5792 // else
5793
5794 FLOW_LOG_TRACE("Delayed [ACK] timer [" << sock << "] triggered, or ACK forced; "
5795 "pending acknowledgment count [" << pending_acks.size() << "].");
5796
5797 if (sys_err_code)
5798 {
5799 FLOW_ERROR_SYS_ERROR_LOG_WARNING(); // Log non-portable error.
5800 // Nothing else to do here. We don't know what this means. So just treat it as if timer was triggered.
5801 }
5802
5803 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
5804 {
5805 /* This is unlikely but legitimate. (Can happen if, by the time the handler that advanced state
5806 * from ESTABLISHED to another state started, this timer also was triggered and thus queued the
5807 * current handler inside m_task_engine.) */
5808 FLOW_LOG_TRACE("Delayed [ACK] timer [" << sock << "] triggered, "
5809 "but socket already in inapplicable state [" << sock->m_int_state << "]. Ignoring.");
5810 return;
5811 }
5812 // else
5813
5814 if (pending_acks.empty())
5815 {
5816 /* This is probably a bug if we're here. However, assert() or connection closure seems a bit
5817 * drastic... carry on. */
5818 FLOW_LOG_WARNING("Delayed [ACK] timer [" << sock << "] triggered, "
5819 "but socket has no pending acknowledgments. This is likely an internal bug. Ignoring.");
5820 return;
5821 }
5822 // else
5823
5824 /* OK, let's do it. Basically just shove all the acknowledgments into an ACK packet. Namely, for
5825 * each one, shove the starting sequence number and the amount of time since we first received it
5826 * (so the other side can subtract that to compute RTT, if it wants).
5827 *
5828 * However we may run out of space and need more ACKs. To keep track of how much space we've
5829 * used, compute an estimate for serializing those two pieces of data and keep adding that for
5830 * each acknowledgment handled. The budget is given by max-block-size; a DATA packet is allowed
5831 * that much payload on top of the normal header stuff, so that should be good enough for us too.
5832 * There's probably some constant overhead on top of that, but it's close enough.
5833 *
5834 * ACK is also used as an opportunistic way to send rcv_wnd to the other side, which informs
5835 * them of how much more data we can take at this time. Naively we should just have rcv_wnd =
5836 * the max buffer size minus the buffer space currently taken, and that is the most accurate
5837 * thing. However RFC 793 ("Window Management Suggestions") and probably other literature
5838 * suggest to (when the available space is increasing) advertise the window in larger steps (so
5839 * withhold the higher rcv_wnd value until it increases even further up to some threshold). For
5840 * now I forego such fanciness. See also the rcv_wnd-related comment in
5841 * Node::receive_wnd_increased() for further reasoning on rcv_wnd (namely surrounding the fact
5842 * that sometimes we must send ACKs with no packets acknowledged to ensure a connection does not
5843 * stall due to a zero rcv_wnd). */
5844
5845 // Grab available Receive buffer space. Save it for later comparison.
5846 const size_t& rcv_wnd = sock->m_rcv_last_sent_rcv_wnd = sock_rcv_wnd(sock);
5847
5848 auto ack = Low_lvl_packet::create_uninit_packet<Ack_packet>(get_logger());
5849 ack->m_rcv_wnd = rcv_wnd; // Advertise receive window. @todo Code reuse?
5850
5851 const size_t max_block_size = sock->max_block_size();
5852 size_t size_est_inc
5854 if (sock->rexmit_on())
5855 {
5856 size_est_inc += sizeof(Low_lvl_packet::rexmit_id_t);
5857 }
5858 assert(size_est_inc <= max_block_size); // At least one has to fit.
5859
5860 const Fine_time_pt time_now = Fine_clock::now();
5861 size_t size_est_so_far = sizeof(Low_lvl_packet::rcv_wnd_t); // How many raw bytes we have, approximately, used.
5862 for (Peer_socket::Individual_ack::Const_ptr ind_ack : pending_acks)
5863 {
5864 if (size_est_so_far + size_est_inc > max_block_size)
5865 {
5866 // Too big. Send off what we have.
5868
5869 // Register one ACK packet we will send ASAP.
5870 sock->m_rcv_stats.sent_low_lvl_ack_packet(false);
5871
5872 // As async_sock_low_lvl_packet_send_paced() says, we cannot reuse ack's pointed-to-object. Make new one.
5873 ack = Low_lvl_packet::create_uninit_packet<Ack_packet>(get_logger());
5874 ack->m_rcv_wnd = rcv_wnd; // Advertise receive window. @todo Code reuse?
5875
5876 size_est_so_far = sizeof(Low_lvl_packet::rcv_wnd_t);
5877 }
5878
5879 // Add the acknowledgment to the current ACK.
5880
5881 // First sequence number in packet.
5882 const Sequence_number& seq_num = ind_ack->m_seq_num;
5883
5884 // ACK delay for this individual acknowledgment. Compute it; then validate it.
5885
5886 /* @todo In low_lvl_io, we perform packet pacing but currently choose to assign a value of
5887 * 0 bytes to an ACK. That is, while we do preserve the order of DATA and ACK packets -- if
5888 * both happen to be in the outgoing stream -- we do not delay the sending of the ACK once it is
5889 * the next packet to be sent out. However, even so, an ACK's sending may be delayed by the
5890 * pacing applied to DATA packets intermixed with it. Therefore the ACK delay measurement we
5891 * take here may be incorrect (too low) in that case. This can cause overestimated RTTs on the
5892 * sender's side. The to-do is to correct the ACK delay value in a given ACK by adding the
5893 * pacing delay (if any) of the ACK to the individual ACK delays within it. Conceptually this
5894 * is similar to the sent_when value being set when choosing to send a DATA packet and then
5895 * corrected in the pacing module later.
5896 *
5897 * This to-do is not important until we in practice start mixing sending and receiving at the
5898 * application layer... but still -- it's worth knowing that there is a design bug here. */
5899
5900 // Shouldn't be negative.
5901 Fine_duration delay = time_now - ind_ack->m_received_when;
5902 if (delay.count() < 0)
5903 {
5904 /* This is pretty crazy and should not happen according to the documented properties of
5905 * Fine_clock. No need to crash or disconnect though, so do our best.... */
5906 FLOW_LOG_WARNING("Delayed [ACK] timer [" << sock << "] triggered; "
5907 "delay for packet [" << seq_num << ", ...) is "
5908 "negative: [" << delay << "]; using zero.");
5909 delay = Fine_duration::zero();
5910 }
5911
5912 /* Convert whatever resolution Fine_clock uses to milliseconds because we want to keep that
5913 * field of the ACK sized according to how the low-level packet handling code prefers it for
5914 * efficiency. Overflow is possible. Use duration_cast (truncation) instead of rounding,
5915 * because in very low-latency situations the extra microseconds rounding up can cause a
5916 * negative RTT calculation on the other side (when this ACK is received). The ACK handling
5917 * code will just clamp the value at zero on the other side, but let's try to avoid it anyway
5918 * on this side.
5919 *
5920 * @todo This comment appears to be outdated, as Ack_delay_time_unit is just Fine_duration.
5921 * Look into this. */
5922 Ack_packet::Ack_delay_time_unit pkt_delay = duration_cast<Ack_packet::Ack_delay_time_unit>(delay);
5923 const Ack_packet::ack_delay_t MAX_DELAY_VALUE = numeric_limits<Ack_packet::ack_delay_t>::max();
5924 if (uint64_t(pkt_delay.count()) > uint64_t(MAX_DELAY_VALUE))
5925 {
5926 /* This is pretty crazy though not 100% impossible if the CPU is really loaded, or some other
5927 * shenanigans. So do our best.... */
5928 FLOW_LOG_WARNING("Delayed [ACK] timer [" << sock << "] triggered; "
5929 "delay for packet [" << seq_num << ", ...) is [" << pkt_delay << "]; overflow; "
5930 "using max value [" << MAX_DELAY_VALUE << "] units.");
5931 // @todo Maybe there's a more sane ceiling value than the absolute maximum?
5932 pkt_delay = Ack_packet::Ack_delay_time_unit(MAX_DELAY_VALUE);
5933 }
5934
5935 // Finally write the individual acknowledgment.
5936 if (sock->rexmit_on())
5937 {
5938 ack->m_rcv_acked_packets_rexmit_on_out.push_back
5940 ind_ack->m_rexmit_id,
5941 Ack_packet::ack_delay_t(pkt_delay.count())));
5942 }
5943 else
5944 {
5945 ack->m_rcv_acked_packets_rexmit_off_out.push_back
5947 Ack_packet::ack_delay_t(pkt_delay.count())));
5948 }
5949 size_est_so_far += size_est_inc;
5950
5951 // Register one packet of unknown size that we've packaged into an ACK and will send ASAP.
5952 sock->m_rcv_stats.sent_individual_ack();
5953 } // for (ind_ack : pending_acks)
5954
5955 // Don't forget the last non-full ACK, if any.
5956 if (size_est_so_far != 0)
5957 {
5959 }
5960
5961 // Register one ACK packet we will send ASAP.
5962 sock->m_rcv_stats.sent_low_lvl_ack_packet(false);
5963
5964 // All serialized to be sent; the timer can start again when a packet must be acknowledged.
5965 pending_acks.clear();
5966
5967 // Register that now there are 0 pending individual acks.
5968 sock->m_rcv_stats.current_pending_to_ack_packets(0);
5969
5970 // Note that all the ACKs are sent off outside this handler and only once UDP is ready.
5971} // Node::async_low_lvl_ack_send()
5972
5973Socket_id Node::socket_id(Peer_socket::Const_ptr sock) // Static.
5974{
5975 // We are in thread W.
5976 return Socket_id{ sock->remote_endpoint(), sock->local_port() };
5977}
5978
5979bool Node::snd_deqable(Peer_socket::Const_ptr sock) const
5980{
5981 // There is stuff to send if there is anything to retransmit or at least new user data.
5982 return !(sock->m_snd_rexmit_q.empty() && sock->m_snd_buf.empty());
5983}
5984
5985bool Node::snd_buf_enqable(Peer_socket::Const_ptr sock) const
5986{
5987 // See doc comment for rationale for keeping this in a function.
5988
5989 /* Since 1 block can be at most max-block-size, if that much space is free, then definitely one
5990 * can enqueue onto m_snd_buf. Note that if less than max-block-size space is free, it would
5991 * still be possible to enqueue a smaller block; yet we still return false. We are intentionally
5992 * conservative, because we are guaranteeing ANY one enqueueing will work. More importantly, this
5993 * guarantees our Socket_buffer scheme (see class doc header) to guarantee constant-time
5994 * dequeueing will work.
5995 *
5996 * We're not overly conservative, either; i.e., no one is likely to complain this policy is too
5997 * stingy. */
5998 return sock->m_snd_buf.data_size() + sock->max_block_size()
5999 <= sock->opt(sock->m_opts.m_st_snd_buf_max_size);
6000}
6001
6002bool Node::rcv_buf_deqable(Peer_socket::Const_ptr sock) const
6003{
6004 // See doc comment for rationale for keeping this in a function.
6005 return !sock->m_rcv_buf.empty();
6006}
6007
6008void Node::sock_set_int_state(Peer_socket::Ptr sock, Peer_socket::Int_state new_state)
6009{
6010 // We are in thread W.
6011
6012 FLOW_LOG_TRACE('[' << sock << "] changing state from [" <<
6013 sock->m_int_state << "] to [" << new_state << "].");
6014 sock->m_int_state = new_state;
6015}
6016
6017void Node::sock_set_state(Peer_socket::Ptr sock, Peer_socket::State state, Peer_socket::Open_sub_state open_sub_state)
6018{
6019 Peer_socket::Lock_guard lock(sock->m_mutex);
6020
6021 // @todo Add TRACE logging.
6022
6023 sock->m_state = state;
6024 if (state == Peer_socket::State::S_OPEN)
6025 {
6026 sock->m_open_sub_state = open_sub_state;
6027 }
6028 else // (state == Peer_socket::State::S_CLOSED)
6029 {
6030 /* Important convention: S_CLOSED means socket is permanently incapable of sending or
6031 * receiving more data. At this point the originating Node removes the socket from its internal
6032 * structures. Therefore, the Node itself may even go away -- while this Peer_socket still
6033 * exists. Since we use shared_ptr when giving our socket objects, that's fine -- but we want to
6034 * avoid returning an invalid Node* in node(). So, when S_CLOSED, sock->m_node = 0. */
6035 sock->m_node = 0;
6036 }
6037}
6038
6039void Node::sock_disconnect_detected(Peer_socket::Ptr sock, const Error_code& disconnect_cause, bool close)
6040{
6041 Peer_socket::Lock_guard lock(sock->m_mutex);
6042
6043 sock->m_disconnect_cause = disconnect_cause;
6044
6045 if (close)
6046 {
6047 // DONE.
6048 sock_set_state(sock, Peer_socket::State::S_CLOSED); // Reentrant mutex => OK.
6049 sock_free_memory(sock);
6050 }
6051 else
6052 {
6053 // This socket is screwed, but let user get any remaining buffer data out.
6054
6055 // Reentrant mutex => OK:
6056 sock_set_state(sock, Peer_socket::State::S_OPEN, Peer_socket::Open_sub_state::S_DISCONNECTING);
6057 }
6058}
6059
6060void Node::sock_disconnect_completed(Peer_socket::Ptr sock)
6061{
6062 Peer_socket::Lock_guard lock(sock->m_mutex);
6063
6064 // Sanity-check pre-conditions. (Basically ensure disconnect_detected(err_code, false) was previously called.)
6065 assert(sock->m_disconnect_cause);
6066 assert((sock->m_state == Peer_socket::State::S_OPEN)
6067 && (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING));
6068
6069 sock_set_state(sock, Peer_socket::State::S_CLOSED); // Reentrant mutex => OK.
6070 sock_free_memory(sock);
6071}
6072
6073void Node::sock_free_memory(Peer_socket::Ptr sock)
6074{
6075 sock->m_rcv_buf.clear();
6076 sock->m_snd_buf.clear();
6077 sock->m_rcv_packets_with_gaps.clear();
6078 sock->m_rcv_reassembly_q_data_size = 0;
6079 sock->m_snd_flying_pkts_by_sent_when.clear();
6080 sock->m_snd_flying_pkts_by_seq_num.clear();
6081 sock->m_snd_rexmit_q.clear();
6082 sock->m_serialized_metadata.make_zero(); // clear() does not deallocate, but this does.
6083 sock->m_rcv_syn_rcvd_data_q.clear();
6084 sock->m_rcv_pending_acks.clear();
6085 sock->m_rcv_acked_packets.clear();
6086 sock->m_snd_pacing_data.m_packet_q.clear();
6087
6088 /* Destroy memory stored in m_snd_cong_ctl which may be non-O(1). This is a little questionable;
6089 * maybe should leave it to destructor? However since we store it as a pointer and are to free
6090 * any "significant" memory, and this may be significant, we may as well just delete it. */
6091 sock->m_snd_cong_ctl.reset();
6092 // Same deal.
6093 sock->m_snd_bandwidth_estimator.reset();
6094}
6095
6096bool Node::sock_set_options(Peer_socket::Ptr sock, const Peer_socket_options& opts, Error_code* err_code)
6097{
6098 // We are in thread U != W.
6099
6100 if (!running())
6101 {
6102    FLOW_ERROR_EMIT_ERROR(error::Code::S_NODE_NOT_RUNNING);
6103 return false;
6104 }
6105 // else
6106
6107 /* We just want to replace m_opts with a copy of opts. First validate opts (including with
6108 * respect to m_opts, and also check for invalid values and such), then copy it over. */
6109
6110 // Log new options values. A bit computationally expensive so just use TRACE for now. @todo Reconsider?
6111 FLOW_LOG_TRACE("For [" << sock << "]:\n\n" << opts);
6112
6113 // Will be writing sock->m_opts if all goes well, so must acquire exclusive ownership of m_opts.
6114 Peer_socket::Options_lock lock(sock->m_opts_mutex);
6115
6116 /* Validate the new option set (including ensuring they're not changing static options' values).
6117 * Note that an explicit pre-condition of this method is that m_opts_mutex is locked if needed,
6118 * hence the above locking statement is not below this call. */
6119 if (!sock_validate_options(opts, &sock->m_opts, err_code))
6120 {
6121 return false;
6122 }
6123 // else
6124
6125 // Boo-ya.
6126 sock->m_opts = opts;
6127 return true;
6128} // Node::sock_set_options()
6129
6130/// @cond
6131/* -^- Doxygen, please ignore the following. (Don't want docs generated for temp macro; this is more maintainable
6132 * than specifying the macro name to omit it, in Doxygen-config EXCLUDE_SYMBOLS.) */
6133
6134/* Normally I try to avoid macro cleverness, but in this case to get a nice printout we need the
6135 * # technique, and also this eliminates quite a bit of repetition. So let's.... */
6136#define VALIDATE_STATIC_OPTION(ARG_opt) \
6137 validate_static_option(opts.ARG_opt, prev_opts->ARG_opt, #ARG_opt, err_code)
6138#define VALIDATE_CHECK(ARG_check) \
6139 validate_option_check(ARG_check, #ARG_check, err_code)
6140
6141// -v- Doxygen, please stop ignoring.
6142/// @endcond
6143
6144bool Node::sock_validate_options(const Peer_socket_options& opts,
6145 const Peer_socket_options* prev_opts,
6146 Error_code* err_code) const
6147{
6148 /* We are to validate the given set of per-socket option values. If prev_opts, then the context
6149 * is that an already-existing socket (with already-set options) is being called with
6150 * set_options(), i.e. user is modifying options for an existing socket. In that case we must
6151 * ensure that no static (unchangeable) option's value would be changed by this.
6152 *
6153 * If not prev_opts, then the per-socket options within the global per-Node Node_options object
6154 * are being changed. Per-socket options in that context are always dynamic, since if they were
6155 * static, there'd be no point in making the per-socket in the first place. So in that case that
6156 * static option check is to be skipped.
6157 *
6158 * Finally, we must check for individual integrity of the specified values (including consistency
6159 * with other option values). */
6160
6161 using boost::chrono::seconds;
6162 using std::numeric_limits;
6163
6164 // We are in thread U != W or in thread W.
6165
6166 if (prev_opts)
6167 {
6168 /* As explained above, they're trying to change an existing socket's option values. Ensure
6169 * all the static options' values are the same in opts and prev_opts. */
6170
6171 // Explicitly documented pre-condition is that *prev_opts is already locked if necessary. So don't lock.
6172
6173 const bool static_ok
6174 = VALIDATE_STATIC_OPTION(m_st_max_block_size) &&
6175 VALIDATE_STATIC_OPTION(m_st_connect_retransmit_period) &&
6176 VALIDATE_STATIC_OPTION(m_st_connect_retransmit_timeout) &&
6177 VALIDATE_STATIC_OPTION(m_st_snd_buf_max_size) &&
6178 VALIDATE_STATIC_OPTION(m_st_rcv_buf_max_size) &&
6179 VALIDATE_STATIC_OPTION(m_st_rcv_flow_control_on) &&
6180 VALIDATE_STATIC_OPTION(m_st_rcv_buf_max_size_slack_percent) &&
6181 VALIDATE_STATIC_OPTION(m_st_rcv_buf_max_size_to_advertise_percent) &&
6182 VALIDATE_STATIC_OPTION(m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent) &&
6183 VALIDATE_STATIC_OPTION(m_st_delayed_ack_timer_period) &&
6184 VALIDATE_STATIC_OPTION(m_st_max_full_blocks_before_ack_send) &&
6185 VALIDATE_STATIC_OPTION(m_st_rexmit_on) &&
6186 VALIDATE_STATIC_OPTION(m_st_max_rexmissions_per_packet) &&
6187 VALIDATE_STATIC_OPTION(m_st_init_drop_timeout) &&
6188 VALIDATE_STATIC_OPTION(m_st_snd_pacing_enabled) &&
6189 VALIDATE_STATIC_OPTION(m_st_snd_bandwidth_est_sample_period_floor) &&
6190 VALIDATE_STATIC_OPTION(m_st_cong_ctl_strategy) &&
6191 VALIDATE_STATIC_OPTION(m_st_cong_ctl_init_cong_wnd_blocks) &&
6192 VALIDATE_STATIC_OPTION(m_st_cong_ctl_max_cong_wnd_blocks) &&
6193 VALIDATE_STATIC_OPTION(m_st_cong_ctl_cong_wnd_on_drop_timeout_blocks) &&
6194 VALIDATE_STATIC_OPTION(m_st_cong_ctl_classic_wnd_decay_percent) &&
6195 VALIDATE_STATIC_OPTION(m_st_drop_packet_exactly_after_drop_timeout) &&
6196 VALIDATE_STATIC_OPTION(m_st_drop_all_on_drop_timeout) &&
6197 VALIDATE_STATIC_OPTION(m_st_out_of_order_ack_restarts_drop_timer);
6198
6199 if (!static_ok)
6200 {
6201 // validate_static_option() has set *err_code.
6202 return false;
6203 }
6204 // else
6205 } // if (prev_opts)
6206
6207 // Now sanity-check the values themselves. @todo Comment and reconsider these?
6208 const bool checks_ok
6209 = VALIDATE_CHECK(opts.m_st_max_block_size >= 512) &&
6210 VALIDATE_CHECK(opts.m_st_connect_retransmit_period.count() > 0) &&
6211 VALIDATE_CHECK(opts.m_st_connect_retransmit_timeout.count() > 0) &&
6212 VALIDATE_CHECK(opts.m_st_snd_buf_max_size >= 4 * opts.m_st_max_block_size) &&
6213 VALIDATE_CHECK(opts.m_st_rcv_buf_max_size >= 4 * opts.m_st_max_block_size) &&
6215 VALIDATE_CHECK(opts.m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent >= 100) &&
6216 VALIDATE_CHECK(opts.m_st_delayed_ack_timer_period <= seconds(1)) &&
6217 VALIDATE_CHECK(util::in_closed_range(Fine_duration::zero(),
6219 Fine_duration(seconds(1)))) &&
6220 VALIDATE_CHECK(opts.m_st_max_full_blocks_before_ack_send >= 1) &&
6221 VALIDATE_CHECK(opts.m_st_max_rexmissions_per_packet >= 1) &&
6222 VALIDATE_CHECK(opts.m_st_max_rexmissions_per_packet <= numeric_limits<Low_lvl_packet::rexmit_id_t>::max()) &&
6223 VALIDATE_CHECK(opts.m_st_init_drop_timeout.count() > 0) &&
6224 VALIDATE_CHECK(opts.m_st_snd_bandwidth_est_sample_period_floor.count() > 0) &&
6226 VALIDATE_CHECK
6228 VALIDATE_CHECK(opts.m_st_cong_ctl_cong_avoidance_increment_blocks < 20) &&
6229 VALIDATE_CHECK(opts.m_st_cong_ctl_classic_wnd_decay_percent <= 100) &&
6230 VALIDATE_CHECK(util::in_closed_range<size_t>(1, opts.m_st_cong_ctl_cong_wnd_on_drop_timeout_blocks, 10)) &&
6231 VALIDATE_CHECK(opts.m_dyn_drop_timeout_ceiling > 4 * opts.m_st_init_drop_timeout) &&
6232 VALIDATE_CHECK(opts.m_dyn_drop_timeout_backoff_factor >= 1) &&
6233 VALIDATE_CHECK(opts.m_dyn_rcv_wnd_recovery_timer_period.count() > 0);
6234
6235 // On error, validate_option_check() has set *err_code.
6236
6237 return checks_ok;
6238
6239#undef VALIDATE_CHECK
6240#undef VALIDATE_STATIC_OPTION
6241} // Node::sock_validate_options()
6242
6243Peer_socket_info Node::sock_info(Peer_socket::Const_ptr sock)
6244{
6247 using boost::adopt_lock;
6248
6249 // We are in thread U != W.
6250
6251 Peer_socket_info stats;
6252 {
6253 /* WARNING!!! sock->m_mutex is locked, but WE must unlock it before returning! Can't leave that
6254 * to the caller, because we must unlock at a specific point below, right before post()ing
6255 * sock_info_worker() onto thread W. Use a Lock_guard that adopts an already-locked mutex. */
6256 Peer_socket::Lock_guard lock(sock->m_mutex, adopt_lock);
6257
6258 if (!running())
6259 {
6260 /* This is kind of a weird case, in that sock's Node having stopped running is a problem, but
6261 * in this case they just want the socket stats. The only reason we're in this method --
6262 * calling sock->info() did not simply return the stats itself -- is that there was a danger
6263 * thread W might change the stats, while we'd be copying them. Well, if !running() there is no
6264 * danger of that. So we can just: */
6265 sock_load_info_struct(sock, &stats);
6266 return stats;
6267 }
6268 // else
6269
6270 /* Okay -- Node is running and may change stats's source info at any time. Therefore, since we
6271 * do not have a mutex for all that source info, we place a task on W and set up a future as a
6272 * way for it to inform us it's done. This has a certain performance penalty, but that's better
6273 * than having to lock each time we need to modify this source data throughout W's operations.
6274 * Moreover we warned about the performance penalty in the doc header for Peer_socket::info(). */
6275
6276 // We're done -- must unlock so that thread W can do what it wants to with sock.
6277 } // lock
6278
6279 // Load this onto thread W boost.asio work queue. We don't return until it's done, so [&] is OK.
6280 asio_exec_ctx_post(get_logger(), &m_task_engine, Synchronicity::S_ASYNC_AND_AWAIT_CONCURRENT_COMPLETION,
6281 [&]() { sock_load_info_struct(sock, &stats); });
6282 // If got here, the task has completed in thread W and signaled us to that effect.
6283
6284 return stats;
6285} // Node::sock_info()
6286
6287void Node::sock_load_info_struct(Peer_socket::Const_ptr sock, Peer_socket_info* stats) const
6288{
6289 using boost::lexical_cast;
6290 using std::string;
6291
6292 // We are in thread W.
6293
6294 stats->m_rcv = sock->m_rcv_stats.stats();
6295 stats->m_snd = sock->m_snd_stats.stats();
6296
6297 // @todo This is more suitable for the non-existent Node_info and Node::load_info_struct(). (It's not per-socket.)
6299
6300 stats->m_int_state_str = lexical_cast<string>(sock->m_int_state);
6301 stats->m_is_active_connect = sock->m_active_connect;
6302 // No need to lock: no thread but W can write to it.
6303 stats->m_disconnect_cause = sock->m_disconnect_cause;
6304
6305 {
6306 // Gotta lock, as Receive and Send buffers can be modified at any time by thread U at least.
6307 Peer_socket::Lock_guard lock(sock->m_mutex);
6308 stats->m_rcv_buf_size = sock->m_rcv_buf.data_size();
6309 stats->m_snd_buf_size = sock->m_snd_buf.data_size();
6310 }
6311
6312 stats->m_rcv_wnd = sock_rcv_wnd(sock);
6313 stats->m_rcv_wnd_last_advertised = sock->m_rcv_last_sent_rcv_wnd;
6314 stats->m_rcv_reassembly_q_data_size = sock->m_rcv_reassembly_q_data_size;
6315 stats->m_rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps.size();
6316 stats->m_rcv_syn_rcvd_data_cumulative_size
6317 = sock->m_rcv_syn_rcvd_data_q.empty() ? 0 : sock->m_rcv_syn_rcvd_data_cumulative_size;
6318 stats->m_rcv_syn_rcvd_data_q_size = sock->m_rcv_syn_rcvd_data_q.size();
6319
6320 stats->m_snd_rcv_wnd = sock->m_snd_remote_rcv_wnd;
6321 stats->m_snd_cong_ctl_in_flight_bytes = sock->m_snd_flying_bytes;
6322 stats->m_snd_cong_ctl_in_flight_count = sock->m_snd_flying_pkts_by_sent_when.size();
6323 stats->m_snd_cong_ctl_wnd_bytes = sock->m_snd_cong_ctl->congestion_window_bytes();
6324 stats->m_snd_cong_ctl_wnd_count_approx = stats->m_snd_cong_ctl_wnd_bytes / sock->max_block_size();
6325 stats->m_snd_smoothed_round_trip_time = sock->m_snd_smoothed_round_trip_time;
6326 stats->m_snd_round_trip_time_variance = sock->m_round_trip_time_variance;
6327 stats->m_snd_drop_timeout = sock->m_snd_drop_timeout;
6328 stats->m_snd_pacing_packet_q_size = sock->m_snd_pacing_data.m_packet_q.size();
6329 stats->m_snd_pacing_bytes_allowed_this_slice = sock->m_snd_pacing_data.m_bytes_allowed_this_slice;
6330 stats->m_snd_pacing_slice_start = sock->m_snd_pacing_data.m_slice_start;
6331 stats->m_snd_pacing_slice_period = sock->m_snd_pacing_data.m_slice_period;
6333 = util::to_mbit_per_sec<Send_bandwidth_estimator::Time_unit>
6334 (sock->m_snd_bandwidth_estimator->bandwidth_bytes_per_time());
6335
6336 stats->m_sock_opts = sock->opt(sock->m_opts); // Lock and copy... probably not the fastest thing ever....
6337 stats->m_node_opts = opt(m_opts); // Ditto.
6338}
6339
6340void Node::sock_log_detail(Peer_socket::Const_ptr sock) const
6341{
6342 // We are in thread W.
6343
6344 /* We are to log details about the given socket. Since the idea is that this would be called on
6345 * the order of at most once or twice a second, we can be as verbose as we think is useful without
6346 * (too much) concern for performance. */
6347
6348 Peer_socket_info stats;
6349 sock_load_info_struct(sock, &stats); // This involves some copying, but, again, we are not too concerned with speed.
6350
6351 FLOW_LOG_INFO("[=== Socket state for [" << sock << "]. ===\n" << stats);
6352
6353 // Log receive and send windows details. Force the logging of the most verbose possible amount of info.
6354 log_snd_window(sock, true);
6355 log_rcv_window(sock, true);
6356 // @todo Should this be inside Peer_socket_info also?
6357
6358 FLOW_LOG_INFO("=== Socket state for [" << sock << "]. ===]");
6359} // Node::sock_log_detail()
6360
6361void Node::advance_seq_num(Sequence_number* seq_num, boost::shared_ptr<const Data_packet> data) // Static.
6362{
6363 /* We just need to increment *seq_num, which points to the start of the data in `data`,
6364 * to a value that points to the data just past the end of the data in `data`. Why is this in a
6365 * separate method? Answer: We may want to change the mapping from sequence number to byte of data. In
6366 * particular the mapping can be one-to-one, as in TCP. Or it can be one sequence number to all bytes in a
6367 * particular packet, which I've seen in certain lesser known custom protocols. This allows us to
6368 * (hopefully) change the code in one place. */
6369
6370 advance_seq_num(seq_num, data->m_data.size());
6371} // Node::advance_seq_num()
6372
6373void Node::advance_seq_num(Sequence_number* seq_num, size_t data_size)
6374{
6375 /* For now go with TCP's convention (one byte to one sequence number, no gaps). While we deal
6376 * with blocks, instead of streams, this may complicate the math a bit and use more sequence
6377 * number space (faster wrapping). However, it would make it easier to adapt the algorithms
6378 * when we move to byte streams; and we currently use a sequence number so large that wrapping
6379 * is impossible. Update: we have moved to streams. */
6380 *seq_num += data_size;
6381}
6382
6383template<typename Packet_map_iter>
6384void Node::get_seq_num_range(const Packet_map_iter& packet_it,
6385 Sequence_number* seq_num_start, Sequence_number* seq_num_end) // Static.
6386{
6387 const Sequence_number& seq_num_start_cref = packet_it->first;
6388 if (seq_num_start)
6389 {
6390 *seq_num_start = seq_num_start_cref;
6391 }
6392 if (seq_num_end)
6393 {
6394 *seq_num_end = seq_num_start_cref;
6395 advance_seq_num(seq_num_end, packet_it->second->m_size);
6396 }
6397}
6398
6399Peer_socket::order_num_t Node::sock_get_new_snd_order_num(Peer_socket::Ptr sock) // Static.
6400{
6401 // Since m_snd_last_order_num starts at 0, this ensures 0 is reserved, as advertised.
6402 return ++sock->m_snd_last_order_num;
6403}
6404
6405Peer_socket* Node::sock_create(const Peer_socket_options& opts)
6406{
6407 // Just make a regular net_flow::Peer_socket.
6408 return sock_create_forward_plus_ctor_args<Peer_socket>(opts);
6409}
6410
6411// Free implementations.
6412
6413std::ostream& operator<<(std::ostream& os, const Peer_socket* sock)
6414{
6415 return
6416 sock
6417 ? (os
6418 << "NetFlow_socket "
6419 << "[" << sock->remote_endpoint() << "]<=>[NetFlow [:" << sock->local_port() << "]] "
6420 "@" << static_cast<const void*>(sock))
6421 : (os << "NetFlow_socket@null");
6422}
6423
6424/// @cond
6425/* -^- Doxygen, please ignore the following. (Don't want docs generated for temp macro; this is more maintainable
6426 * than specifying the macro name to omit it, in Doxygen-config EXCLUDE_SYMBOLS.) */
6427
6428// That's right, I did this. Wanna fight about it?
6429#define STATE_TO_CASE_STATEMENT(ARG_state) \
6430 case Peer_socket::Int_state::S_##ARG_state: \
6431 return os << #ARG_state
6432
6433// -v- Doxygen, please stop ignoring.
6434/// @endcond
6435
6436std::ostream& operator<<(std::ostream& os, Peer_socket::Int_state state)
6437{
6438 switch (state)
6439 {
6440 STATE_TO_CASE_STATEMENT(CLOSED);
6441 STATE_TO_CASE_STATEMENT(SYN_SENT);
6442 STATE_TO_CASE_STATEMENT(SYN_RCVD);
6443 STATE_TO_CASE_STATEMENT(ESTABLISHED);
6444 }
6445 return os;
6446#undef STATE_TO_CASE_STATEMENT
6447}
6448
6449} // namespace flow::net_flow
const Component & get_log_component() const
Returns reference to the stored Component object, particularly as many FLOW_LOG_*() macros expect.
Definition: log.cpp:222
Logger * get_logger() const
Returns the stored Logger pointer, particularly as many FLOW_LOG_*() macros expect.
Definition: log.cpp:217
Interface that the user should implement, passing the implementing Logger into logging classes (Flow'...
Definition: log.hpp:1284
static Congestion_control_strategy * create_strategy(Strategy_choice strategy_choice, log::Logger *logger_ptr, Peer_socket::Const_ptr sock)
Factory method that, given an enum identifying the desired strategy, allocates the appropriate Conges...
Definition: cong_ctl.cpp:101
static Ptr create_drop_timer(log::Logger *logger_ptr, util::Task_engine *node_task_engine, Fine_duration *sock_drop_timeout, Peer_socket::Const_ptr &&sock, const Function< void(const Error_code &err_code)> &timer_failure, const Function< void(bool drop_all_packets)> &timer_fired)
Constructs Drop_timer and returns a ref-counted pointer wrapping it.
Definition: drop_timer.cpp:28
@ S_PEER_SOCKET_WRITABLE
Event type specifying the condition of interest wherein a target Peer_socket sock is such that callin...
@ S_PEER_SOCKET_READABLE
Event type specifying the condition of interest wherein a target Peer_socket sock is such that callin...
An object of this class is a single Flow-protocol networking node, in the sense that: (1) it has a di...
Definition: node.hpp:934
void snd_flying_pkts_updated(Peer_socket::Ptr sock, Peer_socket::Sent_pkt_ordered_by_when_const_iter pkt_begin, const Peer_socket::Sent_pkt_ordered_by_when_const_iter &pkt_end, bool added)
Updates Peer_socket::m_snd_flying_bytes according to an operation (add packets, remove packets) calle...
bool categorize_individual_ack(const Socket_id &socket_id, Peer_socket::Ptr sock, Ack_packet::Individual_ack::Const_ptr ack, bool *dupe_or_late, Peer_socket::Sent_pkt_ordered_by_when_iter *acked_pkt_it)
Helper of perform_accumulated_on_recv_tasks() that categorizes the given accumulated individual ackno...
void handle_data_to_established(const Socket_id &socket_id, Peer_socket::Ptr sock, boost::shared_ptr< Data_packet > packet, bool syn_rcvd_qd_packet)
Handles a just-deserialized, just-demultiplexed, low-level DATA packet delivered to the given peer so...
bool sock_is_writable(const boost::any &sock_as_any) const
Returns true if and only if calling sock->send() with at least some arguments would return either non...
Peer_socket_info sock_info(Peer_socket::Const_ptr sock)
Implementation of sock->info() for socket sock in all cases except when sock->state() == Peer_socket:...
void receive_wnd_updated(Peer_socket::Ptr sock)
Placed by receive() onto W if it has dequeued data from Receive buffer and given it to the user,...
void sock_track_new_data_after_gap_rexmit_off(Peer_socket::Ptr sock, boost::shared_ptr< const Data_packet > packet, size_t data_size, bool *slide, size_t *slide_size)
Helper for handle_data_to_established() that aims to register the given DATA packet as an out-of-orde...
bool sock_data_to_reassembly_q_unless_overflow(Peer_socket::Ptr sock, boost::shared_ptr< Data_packet > packet)
Helper for handle_data_to_established() that aims to register the given DATA packet as an out-of-orde...
static bool ensure_sock_open(Socket_ptr sock, Error_code *err_code)
Helper method that checks whether the given Peer_socket or Server_socket is CLOSED; if so,...
Definition: node.hpp:4099
void send_worker(Peer_socket::Ptr sock, bool defer_delta_check)
Thread W implemention of send(): synchronously or asynchronously send the contents of sock->m_snd_buf...
void handle_accumulated_acks(const Socket_id &socket_id, Peer_socket::Ptr sock)
Helper of perform_accumulated_on_recv_tasks() that handles any incoming acknowledgments and rcv_wnd u...
void async_rcv_wnd_recovery(Peer_socket::Ptr sock, size_t rcv_wnd)
receive_wnd_updated() helper that continues rcv_wnd recovery: that is, sends unsolicited ACK with a r...
void log_accumulated_acks(Peer_socket::Const_ptr sock) const
Helper of handle_accumulated_acks() that logs the about-to-be-handled accumulated individual acknowle...
void sock_free_memory(Peer_socket::Ptr sock)
Helper that clears all non-O(1)-space data structures stored inside sock.
void sock_load_info_struct(Peer_socket::Const_ptr sock, Peer_socket_info *stats) const
Given a Peer_socket, copies all stats info (as available via Peer_socket::info()) from various struct...
void log_snd_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging=false) const
Logs TRACE or DATA messages thats show the detailed state of the sending sequence number space.
void send_worker_check_state(Peer_socket::Ptr sock)
Helper placed by send() onto W to invoke send_worker() but ensures that the socket has not entered so...
size_t m_low_lvl_max_buf_size
OS-reported m_low_lvl_sock UDP receive buffer maximum size, obtained right after we OS-set that setti...
Definition: node.hpp:3729
Non_blocking_func_ret_type sync_op(typename Socket::Ptr sock, const Function< Non_blocking_func_ret_type()> &non_blocking_func, Non_blocking_func_ret_type would_block_ret_val, Event_set::Event_type ev_type, const Fine_time_pt &wait_until, Error_code *err_code)
Implementation of core blocking transfer methods, namely Peer_socket::sync_send(),...
Definition: node.hpp:3935
size_t sock_max_packets_after_unrecvd_packet(Peer_socket::Const_ptr sock) const
Computes and returns the max size for Peer_socket::m_rcv_packets_with_gaps for sock.
Peer_socket::Sent_pkt_ordered_by_when_iter categorize_pkts_as_dropped_on_acks(Peer_socket::Ptr sock, const boost::unordered_set< Peer_socket::order_num_t > &flying_now_acked_pkts)
Helper of perform_accumulated_on_recv_tasks() that determines the range of In-flight packets that sho...
void rcv_get_first_gap_info(Peer_socket::Const_ptr sock, bool *first_gap_exists, Sequence_number *seq_num_after_first_gap)
Helper for handle_data_to_established() that gets simple info about Peer_socket::m_rcv_packets_with_g...
bool snd_deqable(Peer_socket::Const_ptr sock) const
Return true if and only if there are enough data either in Peer_socket::m_snd_rexmit_q of sock (if re...
void cancel_timers(Peer_socket::Ptr sock)
Cancel any timers and scheduled tasks active in the given socket.
void sock_rcv_buf_now_readable(Peer_socket::Ptr sock, bool syn_rcvd_qd_packet)
Helper for handle_data_to_established() that assumes the given's socket Receive buffer is currently r...
void snd_flying_pkts_erase_one(Peer_socket::Ptr sock, Peer_socket::Sent_pkt_ordered_by_when_iter pkt_it)
Erases (for example if considered Acknowledged or Dropped) a packet struct from the "scoreboard" (Pee...
Opt_type opt(const Opt_type &opt_val_ref) const
Obtain a copy of the value of a given option in a thread-safe manner.
Definition: node.hpp:4138
bool sock_validate_options(const Peer_socket_options &opts, const Peer_socket_options *prev_opts, Error_code *err_code) const
Analogous to validate_options() but checks per-socket options instead of per-Node options.
void handle_accumulated_pending_acks(const Socket_id &socket_id, Peer_socket::Ptr sock)
Helper of perform_accumulated_on_recv_tasks() that handles any additional individual outgoing acknowl...
void receive_wnd_recovery_data_received(Peer_socket::Ptr sock)
Pertaining to the async_rcv_wnd_recovery() mechanism, this handles the event that we have received an...
static Peer_socket::order_num_t sock_get_new_snd_order_num(Peer_socket::Ptr sock)
Returns the "order number" to use for Peer_socket::Sent_packet::Sent_when structure corresponding to ...
Peer_socket::Ptr sync_connect_impl(const Remote_endpoint &to, const Fine_duration &max_wait, const boost::asio::const_buffer &serialized_metadata, Error_code *err_code, const Peer_socket_options *opts)
Implementation core of sync_connect*() that gets rid of templated or missing arguments thereof.
size_t max_block_size() const
The maximum number of bytes of user data per received or sent block on connections generated from thi...
Definition: node.cpp:1111
void snd_flying_pkts_push_one(Peer_socket::Ptr sock, const Sequence_number &seq_num, Peer_socket::Sent_packet::Ptr sent_pkt)
Adds a new packet struct (presumably representing packet to be sent shortly) to the "scoreboard" (Pee...
Syn_packet::Ptr create_syn(Peer_socket::Const_ptr sock)
Helper that creates a new SYN packet object to the extent that is suitable for immediately passing to...
void close_abruptly(Peer_socket::Ptr sock, Error_code *err_code)
Implementation of non-blocking sock->close_abruptly() for socket sock in all cases except when sock->...
void async_low_lvl_ack_send(Peer_socket::Ptr sock, const Error_code &sys_err_code=Error_code())
Sends a low-level ACK packet, with all accumulated in Peer_socket::m_rcv_pending_acks of sock individ...
static void get_seq_num_range(const Packet_map_iter &packet_it, Sequence_number *seq_num_start, Sequence_number *seq_num_end)
Given an iterator into a Peer_socket::Sent_pkt_by_sent_when_map or Peer_socket::Recv_pkt_map,...
Peer_socket::Ptr sync_connect_with_metadata(const Remote_endpoint &to, const boost::chrono::duration< Rep, Period > &max_wait, const boost::asio::const_buffer &serialized_metadata, Error_code *err_code=0, const Peer_socket_options *opts=0)
A combination of sync_connect() and connect_with_metadata() (blocking connect, with supplied metadata...
Definition: node.hpp:3914
Syn_ack_packet::Ptr create_syn_ack(Peer_socket::Const_ptr sock)
Like create_syn() but for SYN_ACK.
virtual Peer_socket * sock_create(const Peer_socket_options &opts)
Internal factory used for ALL Peer_socket objects created by this Node (including subclasses).
bool snd_buf_enqable(Peer_socket::Const_ptr sock) const
Return true if and only if there is enough free space in Peer_socket::m_snd_buf of sock to enqueue an...
bool can_send(Peer_socket::Const_ptr sock) const
Answers the perennial question of congestion and flow control: assuming there is a DATA packet to sen...
void sock_slide_rcv_next_seq_num(Peer_socket::Ptr sock, size_t slide_size, bool reassembly_in_progress)
Helper for handle_data_to_established() that aims to register a set of received DATA packet data as i...
void sock_log_detail(Peer_socket::Const_ptr sock) const
Logs a verbose state report for the given socket.
static void advance_seq_num(Sequence_number *seq_num, boost::shared_ptr< const Data_packet > data)
Assuming *seq_num points to the start of data.m_data, increments *seq_num to point to the datum just ...
static Sequence_number snd_past_last_flying_datum_seq_num(Peer_socket::Const_ptr sock)
Obtain the sequence number for the datum just past the last (latest) In-flight (i....
Peer_socket::Ptr connect(const Remote_endpoint &to, Error_code *err_code=0, const Peer_socket_options *opts=0)
Initiates an active connect to the specified remote Flow server.
void event_set_all_check_delta(bool defer_delta_check)
For each WAITING Event_set within the Node: checks for any events that hold, and if any do hold,...
Definition: event_set.cpp:1127
void serv_peer_socket_closed(Server_socket::Ptr serv, Peer_socket::Ptr sock)
Records that a Server_socket-contained (i.e., currently un-established, or established but not yet ac...
bool rcv_buf_deqable(Peer_socket::Const_ptr sock) const
Return true if and only if there are enough data in Peer_socket::m_rcv_buf of sock to give the user s...
void async_acknowledge_packet(Peer_socket::Ptr sock, const Sequence_number &seq_num, unsigned int rexmit_id, size_t data_size)
Causes an acknowledgment of the given received packet to be included in a future Ack_packet sent to t...
Socket_id_to_socket_map m_socks
The peer-to-peer connections this Node is currently tracking.
Definition: node.hpp:3750
Peer_socket::Options_lock Options_lock
Short-hand for lock that acquires exclusive access to an Options_mutex.
Definition: node.hpp:1436
static Socket_id socket_id(Peer_socket::Const_ptr sock)
Constructs the socket pair (connection ID) for the given socket.
void handle_syn_ack_to_syn_sent(const Socket_id &socket_id, Peer_socket::Ptr sock, boost::shared_ptr< const Syn_ack_packet > syn_ack)
Handles a just-deserialized, just-demultiplexed low-level SYN_ACK packet delivered to the given peer ...
size_t send(Peer_socket::Ptr sock, const Function< size_t(size_t max_data_size)> &snd_buf_feed_func, Error_code *err_code)
Implementation of non-blocking sock->send() for socket sock in all cases except when sock->state() ==...
void sock_set_int_state(Peer_socket::Ptr sock, Peer_socket::Int_state new_state)
Sets internal state of given socket to the given state and logs a TRACE message about it.
bool sock_is_readable(const boost::any &sock_as_any) const
Returns true if and only if calling sock->receive() with at least some arguments would return either ...
bool sock_data_to_rcv_buf_unless_overflow(Peer_socket::Ptr sock, boost::shared_ptr< Data_packet > packet)
Helper for handle_data_to_established() that aims to pass the payload of the given DATA packet to the...
bool sock_set_options(Peer_socket::Ptr sock, const Peer_socket_options &opts, Error_code *err_code)
Thread W implementation of sock->set_options().
bool running() const
Returns true if and only if the Node is operating.
Definition: node.cpp:420
Port_to_server_map m_servs
The server sockets this Node is currently tracking.
Definition: node.hpp:3756
Event_set::Ev_type_to_socks_map m_sock_events
All sockets that have been detected to be "ready" (by the Event_set doc header definition) at any poi...
Definition: node.hpp:3788
static const uint8_t S_DEFAULT_CONN_METADATA
Type and value to supply as user-supplied metadata in SYN, if user chooses to use [[a]sync_]connect()...
Definition: node.hpp:1400
void setup_drop_timer(const Socket_id &socket_id, Peer_socket::Ptr sock)
Creates a new Drop Timer and saves it to sock->m_snd_drop_timer.
void handle_ack_to_established(Peer_socket::Ptr sock, boost::shared_ptr< const Ack_packet > ack)
Handles a just-deserialized, just-demultiplexed, low-level ACK packet delivered to the given peer soc...
Peer_socket::Ptr sync_connect(const Remote_endpoint &to, const boost::chrono::duration< Rep, Period > &max_wait, Error_code *err_code=0, const Peer_socket_options *opts=0)
The blocking (synchronous) version of connect().
Definition: node.hpp:3925
void handle_syn_ack_to_established(Peer_socket::Ptr sock, boost::shared_ptr< const Syn_ack_packet > syn_ack)
Handles a just-deserialized, just-demultiplexed, duplicate (equal to already-received SYN_ACK) low-le...
void setup_connection_timers(const Socket_id &socket_id, Peer_socket::Ptr sock, bool initial)
Assuming we've just sent SYN or SYN_ACK, sets up an asynchronous scheduled task to fire within some a...
void log_rcv_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging=false) const
Logs TRACE or DATA messages that show the detailed state of the receiving sequence number space.
size_t sock_rcv_wnd(Peer_socket::Const_ptr sock) const
Computes and returns the currently correct rcv_wnd value; that is the amount of space free in Receive...
void connect_worker(const Remote_endpoint &to, const boost::asio::const_buffer &serialized_metadata, const Peer_socket_options *opts, Peer_socket::Ptr *sock)
Thread W implementation of connect().
bool drop_pkts_on_acks(Peer_socket::Ptr sock, const Peer_socket::Sent_pkt_ordered_by_when_iter &last_dropped_pkt_it, size_t *cong_ctl_dropped_pkts, size_t *cong_ctl_dropped_bytes, size_t *dropped_pkts, size_t *dropped_bytes, std::vector< Peer_socket::order_num_t > *pkts_marked_to_drop)
Helper of perform_accumulated_on_recv_tasks() that acts on the determination made by categorize_pkts_...
static const Peer_socket::Sent_packet::ack_count_t S_MAX_LATER_ACKS_BEFORE_CONSIDERING_DROPPED
For a given unacknowledged sent packet P, the maximum number of times any individual packet with high...
Definition: node.hpp:3644
Error_code sock_categorize_data_to_established(Peer_socket::Ptr sock, boost::shared_ptr< const Data_packet > packet, bool *dupe, bool *slide, size_t *slide_size)
Helper for handle_data_to_established() that categorizes the DATA packet received as either illegal; ...
void async_sock_low_lvl_rst_send(Peer_socket::Ptr sock)
Sends an RST to the other side of the given socket asynchronously when possible.
Definition: low_lvl_io.cpp:988
void sock_set_state(Peer_socket::Ptr sock, Peer_socket::State state, Peer_socket::Open_sub_state open_sub_state=Peer_socket::Open_sub_state::S_CONNECTED)
Sets Peer_socket::m_state and Peer_socket::m_open_sub_state.
void receive_emptied_rcv_buf_while_disconnecting(Peer_socket::Ptr sock)
Placed by receive() onto W during a graceful close, after the Receive buffer had been emptied by the ...
void sock_disconnect_detected(Peer_socket::Ptr sock, const Error_code &disconnect_cause, bool close)
Records that thread W shows underlying connection is broken (graceful termination,...
size_t receive(Peer_socket::Ptr sock, const Function< size_t()> &rcv_buf_consume_func, Error_code *err_code)
Implementation of non-blocking sock->receive() for socket sock in all cases except when sock->state()...
void handle_connection_rexmit_timer_event(const Socket_id &socket_id, Peer_socket::Ptr sock)
Handles the triggering of the retransmit timer wait set up by setup_connection_timers(); it will re-s...
Node_options m_opts
This Node's global set of options.
Definition: node.hpp:3662
void close_connection_immediately(const Socket_id &socket_id, Peer_socket::Ptr sock, const Error_code &err_code, bool defer_delta_check)
A thread W method that handles the transition of the given socket from OPEN (any sub-state) to CLOSED...
void sock_disconnect_completed(Peer_socket::Ptr sock)
While in S_OPEN+S_DISCONNECTING state (i.e., after beginning a graceful close with sock_disconnect_de...
Fine_duration compute_rtt_on_ack(Peer_socket::Sent_packet::Const_ptr flying_pkt, const Fine_time_pt &time_now, Ack_packet::Individual_ack::Const_ptr ack, const Peer_socket::Sent_packet::Sent_when **sent_when) const
Helper of perform_accumulated_on_recv_tasks() that computes the RTT implied by a given individual ack...
void async_low_lvl_syn_ack_ack_send(const Peer_socket::Ptr &sock, boost::shared_ptr< const Syn_ack_packet > &syn_ack)
Helper to create, fully fill out, and asynchronously send via async_sock_low_lvl_packet_send_paced() ...
Peer_socket::Ptr connect_with_metadata(const Remote_endpoint &to, const boost::asio::const_buffer &serialized_metadata, Error_code *err_code=0, const Peer_socket_options *opts=0)
Same as connect() but sends, as part of the connection handshake, the user-supplied metadata,...
void new_round_trip_time_sample(Peer_socket::Ptr sock, Fine_duration round_trip_time)
Handles a just-computed new RTT (round trip time) measurement for an individual packet earlier sent: ...
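For orientation only: the results of this bookkeeping surface in Peer_socket_info as m_snd_smoothed_round_trip_time, m_snd_round_trip_time_variance, and m_snd_drop_timeout (documented further below). The following is not Flow's code, only a hedged sketch of the classic RFC 6298-style smoothing that such a handler typically applies; Flow's actual constants, floors, and ceilings (e.g., m_dyn_drop_timeout_ceiling) may differ.

// Hypothetical helper; not part of Flow.
inline void rfc6298_style_update(flow::Fine_duration rtt_sample,
                                 flow::Fine_duration* srtt,
                                 flow::Fine_duration* rtt_var,
                                 flow::Fine_duration* drop_timeout)
{
  if (srtt->count() == 0)
  {
    *srtt = rtt_sample;        // First sample: SRTT = R.
    *rtt_var = rtt_sample / 2; // RTTVAR = R / 2.
  }
  else
  {
    const flow::Fine_duration abs_delta
      = (*srtt > rtt_sample) ? (*srtt - rtt_sample) : (rtt_sample - *srtt);
    *rtt_var = (*rtt_var * 3) / 4 + abs_delta / 4; // RTTVAR = 3/4*RTTVAR + 1/4*|SRTT - R|.
    *srtt = (*srtt * 7) / 8 + rtt_sample / 8;      // SRTT   = 7/8*SRTT + 1/8*R.
  }
  *drop_timeout = *srtt + 4 * (*rtt_var); // DTO = SRTT + 4*RTTVAR (before any floor/ceiling).
}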
void async_sock_low_lvl_packet_send_paced(const Peer_socket::Ptr &sock, Low_lvl_packet::Ptr &&packet)
Begins the process of asynchronously sending the given low-level packet to the remote Node specified ...
Definition: low_lvl_io.cpp:599
bool ok_to_rexmit_or_close(Peer_socket::Ptr sock, const Peer_socket::Sent_pkt_ordered_by_when_iter &pkt_it, bool defer_delta_check)
Checks whether the given sent packet has been retransmitted the maximum number of allowed times; if s...
util::Task_engine m_task_engine
The main loop engine, functioning in the single-threaded-but-asynchronous callback-based "reactor" st...
Definition: node.hpp:3697
Port_space m_ports
Flow port space for both client and server sockets. All threads may access this.
Definition: node.hpp:3735
void rst_and_close_connection_immediately(const Socket_id &socket_id, Peer_socket::Ptr sock, const Error_code &err_code, bool defer_delta_check)
Asynchronously send RST to the other side of the given socket and close_connection_immediately().
void drop_timer_action(Peer_socket::Ptr sock, bool drop_all_packets)
Handles a Drop_timer (Peer_socket::m_snd_drop_timer) event in ESTABLISHED state by dropping the speci...
A class that keeps a Peer_socket_receive_stats data store and includes methods to conveniently accumulat...
void good_data_accepted_packet(size_t data)
Indicates good_data_packet(), and these data are not dropped (so either delivered into Receive buffer...
void good_data_dropped_reassembly_q_overflow_packet(size_t data)
Indicates good_data_packet(), but these data are dropped due to insufficient Receive reassembly queue...
void presumed_dropped_data(size_t data)
Indicates that one or more unreceived data packets have been considered Dropped due to the number of ...
void good_data_delivered_packet(size_t data)
Indicates good_data_accepted_packet(), and these data are delivered into Receive buffer (either immed...
void late_or_dupe_to_send_ack_packet(size_t data)
Indicates that late_or_dupe_data_packet() and therefore an individual acknowledgment for this packet ...
void total_data_packet(size_t data)
Indicates one DATA packet has been received on socket.
void good_to_send_ack_packet(size_t data)
Indicates that good_data_delivered_packet() and therefore an individual acknowledgment for this packe...
void good_data_packet(size_t data)
Indicates total_data_packet(), and these data are new and acceptable into Receive buffer assuming the...
void error_data_packet(size_t data)
Indicates total_data_packet(), but there is some error about the sequence numbers so that they are no...
void buffer_fed(size_t size)
Indicates the Receive buffer was enqueued with data from network (so its data_size() increased).
void good_data_first_qd_packet(size_t data)
Indicates good_data_accepted_packet(), and these data are, upon receipt, queued for reassembly (not i...
void good_data_dropped_buf_overflow_packet(size_t data)
Indicates good_data_packet(), but these data are dropped due to insufficient Receive buffer space.
void late_or_dupe_data_packet(size_t data)
Indicates total_data_packet(), but the arrived data have either already been received before or (more...
A peer (non-server) socket operating over the Flow network protocol, with optional stream-of-bytes an...
size_t get_connect_metadata(const boost::asio::mutable_buffer &buffer, Error_code *err_code=0) const
Obtains the serialized connect metadata, as supplied by the user during the connection handshake.
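A minimal caller-side sketch (not from Flow's docs) of reading the peer-supplied connect metadata out of an already-open socket; the buffer sizing and header path are assumptions, and per the Flow error convention a non-null Error_code* captures failures instead of throwing.

// Hypothetical usage sketch; `sock` is a Peer_socket::Ptr obtained elsewhere
// (e.g., from an accept on the passive side).  Flow header path assumed,
// e.g. "flow/net_flow/peer_socket.hpp".
#include <vector>
#include <boost/asio/buffer.hpp>

std::vector<unsigned char> read_connect_metadata(const flow::net_flow::Peer_socket::Ptr& sock)
{
  flow::Error_code err;
  // Sizing by max_block_size() is a guess at a generous upper bound; the real
  // limit is whatever triggers error::Code::S_CONN_METADATA_TOO_LARGE.
  std::vector<unsigned char> buf(sock->max_block_size());
  const size_t n = sock->get_connect_metadata(boost::asio::buffer(buf), &err);
  buf.resize(err ? 0 : n); // First n bytes hold the metadata from the connecting side.
  return buf;
}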
size_t max_block_size_multiple(const size_t &opt_val_ref, const unsigned int *inflate_pct_val_ptr=0) const
Returns the smallest multiple of max_block_size() that is >= the given option value,...
bool sync_send_reactor_pattern_impl(const Fine_time_pt &wait_until, Error_code *err_code)
Helper similar to sync_send_impl() but for the nullptr_t versions of sync_send().
std::map< Sequence_number, Sent_pkt_ordered_by_when_iter > Sent_pkt_by_seq_num_map
Short-hand for m_snd_flying_pkts_by_seq_num type; see that data member.
bool sync_receive_reactor_pattern_impl(const Fine_time_pt &wait_until, Error_code *err_code)
Helper similar to sync_receive_impl() but for the nullptr_t versions of sync_receive().
Remote_endpoint m_remote_endpoint
See remote_endpoint(). Should be set before user gets access to *this and not changed afterwards.
size_t sync_receive(const Mutable_buffer_sequence &target, const boost::chrono::duration< Rep, Period > &max_wait, Error_code *err_code=0)
Blocking (synchronous) version of receive().
util::Blob m_serialized_metadata
If !m_active_connect, this contains the serialized metadata that the user supplied on the other side ...
size_t node_sync_send(const Function< size_t(size_t max_data_size)> &snd_buf_feed_func_or_empty, const Fine_time_pt &wait_until, Error_code *err_code)
This is to sync_send() as node_send() is to send().
Error_code m_disconnect_cause
The Error_code causing disconnection (if one has occurred or is occurring) on this socket; otherwise ...
Peer_socket(log::Logger *logger_ptr, util::Task_engine *task_engine, const Peer_socket_options &opts)
Constructs object; initializes most values to well-defined (0, empty, etc.) but not necessarily meani...
Definition: peer_socket.cpp:37
Sequence_number m_rcv_init_seq_num
The Initial Sequence Number (ISN) contained in the original Syn_packet or Syn_ack_packet we received.
const Remote_endpoint & remote_endpoint() const
Intended other side of the connection (regardless of success, failure, or current State).
State
State of a Peer_socket.
@ S_OPEN
Future reads or writes may be possible. A socket in this state may be Writable or Readable.
@ S_CLOSED
Neither future reads nor writes are possible, AND Node has disowned the Peer_socket.
Open_sub_state
The sub-state of a Peer_socket when state is State::S_OPEN.
@ S_CONNECTED
This Peer_socket was created through a passive connect (Node::accept() and the like) or an active con...
@ S_CONNECTING
This Peer_socket was created through an active connect (Node::connect() and the like),...
@ S_DISCONNECTING
This Peer_socket was created through a passive connect (Node::accept() and the like) or an active con...
size_t sync_send(const Const_buffer_sequence &data, const boost::chrono::duration< Rep, Period > &max_wait, Error_code *err_code=0)
Blocking (synchronous) version of send().
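A hedged usage sketch of these blocking overloads (together with sync_receive() above); names and buffer contents are placeholders, and the Flow header path is assumed. With a non-null Error_code*, a timeout surfaces as an error such as error::Code::S_WAIT_USER_TIMEOUT rather than an exception.

// Hypothetical sketch; `sock` is a connected Peer_socket::Ptr obtained elsewhere.
#include <array>
#include <string>
#include <boost/asio/buffer.hpp>
#include <boost/chrono.hpp>

size_t request_reply(const flow::net_flow::Peer_socket::Ptr& sock)
{
  flow::Error_code err;

  const std::string request("hello");
  sock->sync_send(boost::asio::buffer(request), boost::chrono::seconds(2), &err);
  if (err)
  {
    return 0; // Timed out, interrupted, or connection error.
  }

  std::array<char, 4096> reply;
  const size_t n_rcvd
    = sock->sync_receive(boost::asio::buffer(reply), boost::chrono::seconds(2), &err);
  return err ? 0 : n_rcvd; // Bytes actually copied out of the Receive buffer.
}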
~Peer_socket() override
Boring virtual destructor. Note that deletion is to be handled exclusively via shared_ptr,...
Definition: peer_socket.cpp:77
Error_code disconnect_cause() const
The error code that previously caused state() to become State::S_CLOSED, or success code if state is ...
Sequence_number::seq_num_t order_num_t
Short-hand for order number type. 0 is reserved. Caution: Keep in sync with Drop_timer::packet_id_t.
flow_port_t local_port() const
The local Flow-protocol port chosen by the Node (if active or passive open) or user (if passive open)...
flow_port_t m_local_port
See local_port(). Should be set before user gets access to *this and not changed afterwards.
friend class Send_bandwidth_estimator
Stats modules have const access to all socket internals.
bool set_options(const Peer_socket_options &opts, Error_code *err_code=0)
Dynamically replaces the current options set (options()) with the given options set.
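A sketch of the copy-modify-apply pattern these two accessors suggest. The assumption (from the m_st_/m_dyn_ naming of the Peer_socket_options fields listed near the end of this page) is that only dynamic options may be replaced on a live socket; the chosen value is purely illustrative.

// Hypothetical sketch; not from Flow's documentation.
#include <boost/chrono.hpp>

bool raise_drop_timeout_ceiling(const flow::net_flow::Peer_socket::Ptr& sock)
{
  flow::Error_code err;
  flow::net_flow::Peer_socket_options opts = sock->options();  // Copy the current set.
  opts.m_dyn_drop_timeout_ceiling = boost::chrono::seconds(5); // A dynamic (m_dyn_*) knob; value illustrative.
  return sock->set_options(opts, &err) && !err;                // false => the new set was rejected.
}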
size_t node_send(const Function< size_t(size_t max_data_size)> &snd_buf_feed_func, Error_code *err_code)
Non-template helper for template send() that forwards the send() logic to Node::send().
bool rexmit_on() const
Whether retransmission is enabled on this connection.
size_t node_sync_receive(const Function< size_t()> &rcv_buf_consume_func_or_empty, const Fine_time_pt &wait_until, Error_code *err_code)
This is to sync_receive() as node_receive() is to receive().
util::Lock_guard< Mutex > Lock_guard
Short-hand for RAII lock guard of Mutex.
Int_state
The state of the socket (and the connection from this end's point of view) for the internal state mac...
@ S_ESTABLISHED
Public state is OPEN+CONNECTED; in our opinion the connection is established.
@ S_SYN_SENT
Public state is OPEN+CONNECTING; user requested active connect; we sent SYN and are awaiting response...
@ S_CLOSED
Closed (dead or new) socket.
@ S_SYN_RCVD
Public state is OPEN+CONNECTING; other side requested passive connect via SYN; we sent SYN_ACK and ar...
util::Lock_guard< Options_mutex > Options_lock
Short-hand for lock that acquires exclusive access to an Options_mutex.
void close_abruptly(Error_code *err_code=0)
Acts as if fatal error error::Code::S_USER_CLOSED_ABRUPTLY has been discovered on the connection.
size_t max_block_size() const
The maximum number of bytes of user data per received or sent packet on this connection.
Node * node() const
Node that produced this Peer_socket.
Definition: peer_socket.cpp:95
Peer_socket_info info() const
Returns a structure containing the most up-to-date stats about this connection.
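A short sketch of sampling a few of the Peer_socket_info fields documented toward the end of this page; purely illustrative.

// Hypothetical sketch; `sock` is a Peer_socket::Ptr obtained elsewhere.
#include <iostream>

void print_some_stats(const flow::net_flow::Peer_socket::Ptr& sock)
{
  const flow::net_flow::Peer_socket_info stats = sock->info();
  std::cout << "state="    << stats.m_int_state_str
            << " rcv-buf=" << stats.m_rcv_buf_size
            << " snd-buf=" << stats.m_snd_buf_size
            << " rcv-wnd=" << stats.m_rcv_wnd << '\n';
}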
Recvd_pkt_map::iterator Recvd_pkt_iter
Short-hand for m_rcv_packets_with_gaps iterator type.
Mutex m_mutex
This object's mutex.
Sent_pkt_by_sent_when_map::iterator Sent_pkt_ordered_by_when_iter
Short-hand for m_snd_flying_pkts_by_sent_when iterator type.
Sent_pkt_by_seq_num_map::const_iterator Sent_pkt_ordered_by_seq_const_iter
Short-hand for m_snd_flying_pkts_by_seq_num const iterator type.
Peer_socket_info m_info_on_close
This is the final set of stats collected at the time the socket was moved to S_CLOSED m_state.
bool ensure_open(Error_code *err_code) const
Helper that is equivalent to Node::ensure_sock_open(this, err_code).
Sent_pkt_by_sent_when_map::const_iterator Sent_pkt_ordered_by_when_const_iter
Short-hand for m_snd_flying_pkts_by_sent_when const iterator type.
Opt_type opt(const Opt_type &opt_val_ref) const
Analogous to Node::opt() but for per-socket options.
std::string bytes_blocks_str(size_t bytes) const
Helper that, given a byte count, returns a string with that byte count and the number of max_block_si...
Peer_socket_options m_opts
This socket's per-socket set of options.
Peer_socket_options options() const
Copies this socket's option set and returns that copy.
Options_mutex m_opts_mutex
The mutex protecting m_opts.
std::map< Sequence_number, boost::shared_ptr< Received_packet > > Recvd_pkt_map
Short-hand for m_rcv_packets_with_gaps type; see that data member.
Recvd_pkt_map::const_iterator Recvd_pkt_const_iter
Short-hand for m_rcv_packets_with_gaps const iterator type.
Open_sub_state m_open_sub_state
See state().
size_t node_receive(const Function< size_t()> &rcv_buf_consume_func, Error_code *err_code)
Non-template helper for template receive() that forwards the receive() logic to Node::receive().
State state(Open_sub_state *open_sub_state=0) const
Current State of the socket.
Definition: peer_socket.cpp:85
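A sketch of checking the public state machine (State and Open_sub_state, documented above) from user code.

// Hypothetical sketch; not from Flow's documentation.
bool connection_usable(const flow::net_flow::Peer_socket::Ptr& sock)
{
  using flow::net_flow::Peer_socket;

  auto sub_state = Peer_socket::Open_sub_state::S_DISCONNECTING; // Placeholder; overwritten if S_OPEN.
  const Peer_socket::State st = sock->state(&sub_state);
  if (st == Peer_socket::State::S_CLOSED)
  {
    const flow::Error_code why = sock->disconnect_cause(); // Why the socket reached S_CLOSED.
    (void)why;
    return false;
  }
  return sub_state == Peer_socket::Open_sub_state::S_CONNECTED; // vs. S_CONNECTING / S_DISCONNECTING.
}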
void return_port(flow_port_t port, Error_code *err_code)
Return a previously reserved port (of any type).
Definition: port_space.cpp:175
An internal net_flow sequence number identifying a piece of data.
Definition: seq_num.hpp:126
void set_metadata(char num_line_id=0, const Sequence_number &zero_point=Sequence_number(), seq_num_delta_t multiple_size=0)
Updates the full set of metadata (used at least for convenient convention-based logging but not actua...
Definition: seq_num.cpp:268
uint64_t seq_num_t
Raw sequence number type.
Definition: seq_num.hpp:138
Internal net_flow class that implements a socket buffer, as used by Peer_socket for Send and Receive ...
void consume_buf_move(util::Blob *target_buf, size_t max_data_size)
Consumes (removes from the front of the internal byte buffer and returns them to the caller) a byte s...
size_t data_size() const
The total number of bytes of application-layer data stored in this object.
Properties of various container types.
Definition: traits.hpp:43
typename Value_list::const_reverse_iterator Const_reverse_iterator
Type for reverse iterator pointing into an immutable structure of this type.
typename Value_list::reverse_iterator Reverse_iterator
Type for reverse iterator pointing into a mutable structure of this type.
std::pair< Iterator, bool > insert(Value const &key_and_mapped)
Attempts to insert the given key/mapped-value pair into the map.
static Ptr ptr_cast(const From_ptr &ptr_to_cast)
Provides syntactic-sugary way to perform a static_pointer_cast<> from a compatible smart pointer type...
boost::shared_ptr< Peer_socket > Ptr
Short-hand for ref-counted pointer to mutable values of type Target_type::element_type (a-la T*).
Const_target_ptr Const_ptr
Short-hand for ref-counted pointer to immutable values of type Target_type::element_type (a-la T cons...
Similar to ostringstream but allows fast read-only access directly into the std::string being written...
#define FLOW_ERROR_SYS_ERROR_LOG_WARNING()
Logs a warning about the (often errno-based or from a library) error code in sys_err_code.
Definition: error.hpp:269
#define FLOW_ERROR_LOG_ERROR(ARG_val)
Logs a warning about the given error code using FLOW_LOG_WARNING().
Definition: error.hpp:233
#define FLOW_ERROR_EXEC_AND_THROW_ON_ERROR(ARG_ret_type, ARG_function_name,...)
Narrow-use macro that implements the error code/exception semantics expected of most public-facing Fl...
Definition: error.hpp:363
#define FLOW_ERROR_EMIT_ERROR(ARG_val)
Sets *err_code to ARG_val and logs a warning about the error using FLOW_LOG_WARNING().
Definition: error.hpp:202
#define FLOW_ERROR_EMIT_ERROR_LOG_INFO(ARG_val)
Identical to FLOW_ERROR_EMIT_ERROR(), but the message logged has flow::log::Sev::S_INFO severity inst...
Definition: error.hpp:218
#define FLOW_LOG_DATA(ARG_stream_fragment)
Logs a DATA message into flow::log::Logger *get_logger() with flow::log::Component get_log_component(...
Definition: log.hpp:242
#define FLOW_LOG_INFO(ARG_stream_fragment)
Logs an INFO message into flow::log::Logger *get_logger() with flow::log::Component get_log_component...
Definition: log.hpp:197
#define FLOW_LOG_WITHOUT_CHECKING(ARG_sev, ARG_stream_fragment)
Identical to FLOW_LOG_WITH_CHECKING() but foregoes the filter (Logger::should_log()) check.
Definition: log.hpp:532
#define FLOW_LOG_WARNING(ARG_stream_fragment)
Logs a WARNING message into flow::log::Logger *get_logger() with flow::log::Component get_log_compone...
Definition: log.hpp:152
#define FLOW_LOG_WITH_CHECKING(ARG_sev, ARG_stream_fragment)
Logs a message of the specified severity into flow::log::Logger *get_logger() with flow::log::Compone...
Definition: log.hpp:489
#define FLOW_LOG_TRACE_WITHOUT_CHECKING(ARG_stream_fragment)
Logs a TRACE message into flow::log::Logger *get_logger() with flow::log::Component get_log_component...
Definition: log.hpp:354
#define FLOW_LOG_DATA_WITHOUT_CHECKING(ARG_stream_fragment)
Logs a DATA message into flow::log::Logger *get_logger() with flow::log::Component get_log_component(...
Definition: log.hpp:372
#define FLOW_LOG_TRACE(ARG_stream_fragment)
Logs a TRACE message into flow::log::Logger *get_logger() with flow::log::Component get_log_component...
Definition: log.hpp:227
Synchronicity
Enumeration indicating the manner in which asio_exec_ctx_post(), and various boost....
Definition: async_fwd.hpp:223
void asio_exec_ctx_post(log::Logger *logger_ptr, Execution_context *exec_ctx, Synchronicity synchronicity, Task &&task)
An extension of boost.asio's post() and dispatch() free function templates, this free function templa...
Definition: util.hpp:31
bool exec_void_and_throw_on_error(const Func &func, Error_code *err_code, util::String_view context)
Equivalent of exec_and_throw_on_error() for operations with void return type.
Definition: error.hpp:168
@ S_DATA
Message satisfies Sev::S_TRACE description AND contains variable-length structure (like packet,...
@ S_TRACE
Message indicates any condition that may occur with great frequency (thus verbose if logged).
@ S_INFO
Message indicates a not-"bad" condition that is not frequent enough to be of severity Sev::S_TRACE.
@ S_CONN_TIMEOUT
Other side did not complete connection handshake within the allowed time; perhaps no one is listening...
@ S_USER_CLOSED_ABRUPTLY
User code on this side abruptly closed connection; other side may be informed of this.
@ S_CONN_RESET_TOO_MANY_REXMITS
Connection reset because a packet has been retransmitted too many times.
@ S_SEQ_NUM_IMPLIES_CONNECTION_COLLISION
Other side has sent packet with sequence number that implies a port collision between two connections...
@ S_SEQ_NUM_ARITHMETIC_FAILURE
Other side has sent packets with inconsistent sequence numbers.
@ S_CONN_METADATA_TOO_LARGE
During connection user supplied metadata that is too large.
@ S_CANNOT_CONNECT_TO_IP_ANY
Cannot ask to connect to "any" IP address. Use specific IP address.
@ S_WAIT_USER_TIMEOUT
A blocking (sync_) or background-blocking (async_) operation timed out versus user-supplied time limi...
@ S_WAIT_INTERRUPTED
A blocking (sync_) or background-blocking (async_) operation was interrupted, such as by a signal.
@ S_EVENT_SET_CLOSED
Attempted operation on an event set, when that event set was closed.
@ S_INTERNAL_ERROR_PORT_COLLISION
Internal error: Ephemeral port double reservation allowed.
@ S_NODE_NOT_RUNNING
Node not running.
Flow module containing the API and implementation of the Flow network protocol, a TCP-inspired stream...
Definition: node.cpp:25
uint16_t flow_port_t
Logical Flow port type (analogous to a UDP/TCP port in spirit but in no way relevant to UDP/TCP).
const flow_port_t S_PORT_ANY
Special Flow port value used to indicate "invalid port" or "please pick a random available ephemeral ...
Definition: port_space.cpp:33
std::ostream & operator<<(std::ostream &os, const Congestion_control_selector::Strategy_choice &strategy_choice)
Serializes a Peer_socket_options::Congestion_control_strategy_choice enum to a standard ostream – the...
Definition: cong_ctl.cpp:146
bool key_exists(const Container &container, const typename Container::key_type &key)
Returns true if and only if the given key is present at least once in the given associative container...
Definition: util.hpp:301
Auto_cleanup setup_auto_cleanup(const Cleanup_func &func)
Provides a way to execute arbitrary (cleanup) code at the exit of the current block.
Definition: util.hpp:307
std::string buffers_dump_string(const Const_buffer_sequence &data, const std::string &indentation, size_t bytes_per_line)
Identical to buffers_to_ostream() but returns an std::string instead of writing to a given ostream.
Definition: util.hpp:506
bool subtract_with_floor(Minuend *minuend, const Subtrahend &subtrahend, const Minuend &floor)
Performs *minuend -= subtrahend, subject to a floor of floor.
Definition: util.hpp:324
Integer ceil_div(Integer dividend, Integer divisor)
Returns the result of the given non-negative integer divided by a positive integer,...
Definition: util.hpp:258
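A sketch of the two small arithmetic helpers above; the flow::util namespace is assumed from the util.hpp definitions, and the return-value convention of subtract_with_floor() is not relied upon.

// Hypothetical sketch; values are illustrative.
#include <cstddef>

void arithmetic_helpers_demo()
{
  // How many 1024-byte blocks cover 10,000 bytes?  => 10.
  const std::size_t blocks = flow::util::ceil_div(std::size_t(10000), std::size_t(1024));

  // Shrink an in-flight byte count by more than its value without underflowing.
  std::size_t flying_bytes = 500;
  flow::util::subtract_with_floor(&flying_bytes, std::size_t(800), std::size_t(0));
  // flying_bytes == 0 here.

  (void)blocks;
}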
bool in_open_open_range(T const &min_val, T const &val, T const &max_val)
Returns true if and only if the given value is within the given range, given as a (low,...
Definition: util.hpp:295
Scheduled_task_handle schedule_task_from_now(log::Logger *logger_ptr, const Fine_duration &from_now, bool single_threaded, Task_engine *task_engine, Scheduled_task_handler &&task_body_moved)
Schedule the given function to execute in a certain amount of time: A handy wrapper around Timer (asi...
Definition: sched_task.hpp:34
bool scheduled_task_fired(log::Logger *logger_ptr, Scheduled_task_const_handle task)
Returns whether a previously scheduled (by schedule_task_from_now() or similar) task has already fire...
Definition: sched_task.cpp:238
bool in_open_closed_range(T const &min_val, T const &val, T const &max_val)
Returns true if and only if the given value is within the given range, given as a (low,...
Definition: util.hpp:279
void ostream_op_to_string(std::string *target_str, T const &... ostream_args)
Writes to the specified string, as if the given arguments were each passed, via << in sequence,...
Definition: util.hpp:367
Fine_duration scheduled_task_fires_from_now_or_canceled(log::Logger *logger_ptr, Scheduled_task_const_handle task)
Returns how long remains until a previously scheduled (by schedule_task_from_now() or similar) task f...
Definition: sched_task.cpp:200
boost::shared_ptr< Scheduled_task_handle_state > Scheduled_task_handle
Black-box type that represents a handle to a scheduled task as scheduled by schedule_task_at() or sch...
bool in_closed_range(T const &min_val, T const &val, T const &max_val)
Returns true if and only if the given value is within the given range, inclusive.
Definition: util.hpp:271
boost::shared_ptr< void > Auto_cleanup
Helper type for setup_auto_cleanup().
Definition: util_fwd.hpp:205
boost::asio::io_context Task_engine
Short-hand for boost.asio event service, the central class of boost.asio.
Definition: util_fwd.hpp:135
bool scheduled_task_cancel(log::Logger *logger_ptr, Scheduled_task_handle task)
Attempts to prevent the execution of a previously scheduled (by schedule_task_from_now() or similar) ...
Definition: sched_task.cpp:26
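A sketch tying together Task_engine, schedule_task_from_now(), and scheduled_task_cancel() from the entries above. The flow::util namespace, the acceptability of a null Logger*, and the handler's exact signature are assumptions; a generic lambda sidesteps the last of these.

// Hypothetical sketch; not from Flow's documentation.  Flow headers assumed
// (e.g., "flow/util/sched_task.hpp").
#include <boost/chrono.hpp>

void schedule_then_cancel()
{
  flow::util::Task_engine task_engine; // boost::asio::io_context, per the alias above.

  const flow::util::Scheduled_task_handle task
    = flow::util::schedule_task_from_now(nullptr,                          // Logger*: null => no logging (assumed OK).
                                         boost::chrono::milliseconds(250), // Fire ~250 ms from now.
                                         true,                             // single_threaded (assumption).
                                         &task_engine,
                                         [](auto&&...) { /* task body */ });

  // Changed our mind before running the engine: try to prevent the firing.
  flow::util::scheduled_task_cancel(nullptr, task);

  task_engine.run(); // Nothing user-visible happens if the cancel succeeded.
}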
Blob_with_log_context<> Blob
A concrete Blob_with_log_context that compile-time-disables Basic_blob::share() and the sharing API d...
Definition: blob_fwd.hpp:60
boost::system::error_code Error_code
Short-hand for a boost.system error code (which basically encapsulates an integer/enum error code and...
Definition: common.hpp:508
Flow_log_component
The flow::log::Component payload enumeration comprising various log components used by Flow's own int...
Definition: common.hpp:638
Fine_clock::duration Fine_duration
A high-res time duration as computed from two Fine_time_pts.
Definition: common.hpp:416
Fine_clock::time_point Fine_time_pt
A high-res time point as returned by Fine_clock::now() and suitable for precise time math in general.
Definition: common.hpp:413
unsigned char uint8_t
Byte. Best way to represent a byte of binary data. This is 8 bits on all modern systems.
Definition: common.hpp:391
Specifies the outgoing (pre-serialization) acknowledgment of a single received Data_packet,...
Equivalent of Individual_ack_rexmit_off but for sockets with retransmission enabled.
Specifies the incoming (post-deserialization) acknowledgment of a single received Data_packet.
boost::shared_ptr< const Individual_ack > Const_ptr
Short-hand for ref-counted pointer to immutable objects of this class.
uint64_t ack_delay_t
Type used to store the ACK delay for a given individual acknowledged packet.
Fine_duration Ack_delay_time_unit
Ack_delay_time_unit(1) is the duration corresponding to the ack_delay_t value 1; and proportionally f...
uint32_t rcv_wnd_t
Type used to store the size of m_rcv_wnd member in a couple of different packet types.
uint8_t rexmit_id_t
Type used to store the retransmission count in DATA and ACK packets.
The data nugget uniquely identifying a peer-to-peer connection from a remote endpoint to a port in th...
Definition: node.hpp:3862
Metadata describing the data sent in the acknowledgment of an individual received packet.
boost::shared_ptr< const Individual_ack > Const_ptr
Short-hand for ref-counted pointer to immutable objects of this class.
boost::shared_ptr< Individual_ack > Ptr
Short-hand for ref-counted pointer to mutable objects of this class.
Metadata (and data, if retransmission is on) for a packet that has been received (and,...
const size_t m_size
Number of bytes in the Data_packet::m_data field of that packet.
Received_packet(log::Logger *logger_ptr, size_t size, util::Blob *src_data)
Constructs object by storing size of data and, if so instructed, the data themselves.
util::Blob m_data
Byte sequence equal to that of Data_packet::m_data of the packet.
Data store to keep timing related info when a packet is sent out.
const order_num_t m_order_num
Order number of the packet.
size_t m_sent_cwnd_bytes
The congestion window size (in bytes) that is used when the packet is sent out.
Fine_time_pt m_sent_time
The timestamp when the packet is sent out.
Metadata (and data, if retransmission is on) for a packet that has been sent one (if retransmission i...
Sent_packet(bool rexmit_on, boost::shared_ptr< Data_packet > packet, const Sent_when &sent_when)
Constructs object with the given values and m_acks_after_me at zero.
std::vector< Sent_when > m_sent_when
Time stamps, order numbers, and other info at the times when the different attempts (including origin...
const size_t m_size
Number of bytes in the Data_packet::m_data field of the sent packet.
const boost::shared_ptr< Data_packet > m_packet
If retransmission is on, this is the DATA packet itself that was sent; otherwise null.
uint16_t ack_count_t
Type used for m_acks_after_me.
ack_count_t m_acks_after_me
The number of times any packet with m_sent_when.back().m_order_num > this->m_sent_when....
A data store that keeps stats about a Peer_socket connection.
Definition: info.hpp:456
Peer_socket_send_stats m_snd
Stats for outgoing direction of traffic. As opposed to the other m_snd_* members, this typically accu...
Definition: info.hpp:511
Node_options m_node_opts
Per-node options currently set on the socket's Node.
Definition: info.hpp:651
size_t m_low_lvl_max_buf_size
The UDP receive buffer maximum size, as reported by an appropriate call to the appropriate getsockopt...
Definition: info.hpp:526
size_t m_rcv_buf_size
The number of bytes in the internal Receive buffer.
Definition: info.hpp:549
size_t m_rcv_wnd_last_advertised
The last rcv_wnd (receive window) size sent to sender (not necessarily received; packets can be lost)...
Definition: info.hpp:555
Fine_duration m_snd_pacing_slice_period
In pacing, the duration of the current pacing time slice.
Definition: info.hpp:629
size_t m_rcv_reassembly_q_data_size
If rexmit_on is false then 0; otherwise the total DATA payload in the reassembly queue of the socket.
Definition: info.hpp:558
size_t m_snd_pacing_bytes_allowed_this_slice
This many bytes worth of DATA packets may still be sent, at this time, within the time slice defined ...
Definition: info.hpp:635
Peer_socket_options m_sock_opts
Per-socket options currently set on the socket.
Definition: info.hpp:644
size_t m_snd_buf_size
The number of bytes in the internal Send buffer.
Definition: info.hpp:590
size_t m_rcv_syn_rcvd_data_cumulative_size
Total size of DATA payload queued while waiting for SYN_ACK_ACK in SYN_RCVD state.
Definition: info.hpp:573
size_t m_rcv_syn_rcvd_data_q_size
Number of DATA packets queued while waiting for SYN_ACK_ACK in SYN_RCVD state.
Definition: info.hpp:576
std::string m_int_state_str
The internal state of the socket, rendered into string (e.g., "SYN_RECEIVED" or "ESTABLISHED").
Definition: info.hpp:533
Fine_time_pt m_snd_pacing_slice_start
In pacing, the time point marking the beginning of the current pacing time slice.
Definition: info.hpp:626
size_t m_snd_cong_ctl_in_flight_count
In congestion control, the current sent data packets that have been neither acknowledged nor consider...
Definition: info.hpp:611
size_t m_snd_cong_ctl_in_flight_bytes
In congestion control, the current sent data bytes that have been neither acknowledged nor considered...
Definition: info.hpp:608
double m_snd_est_bandwidth_mbit_per_sec
Estimate of the currently available (to this connection) outgoing bandwidth, in megabits per second.
Definition: info.hpp:641
size_t m_rcv_wnd
Receive window size = max Receive buffer space minus space taken. Infinity if flow control disabled.
Definition: info.hpp:552
size_t m_rcv_packets_with_gaps
Number of DATA packets tracked in the structure tracking all valid received packets such that at least one pac...
Definition: info.hpp:570
size_t m_snd_cong_ctl_wnd_bytes
In congestion control, the current congestion window (number of outgoing data bytes allowed In-flight...
Definition: info.hpp:599
Fine_duration m_snd_smoothed_round_trip_time
Estimated current round trip time of packets, computed as a smooth value over the past individual RTT...
Definition: info.hpp:614
Error_code m_disconnect_cause
If the socket is closing or closed, this is the reason for the closure; otherwise the default-constru...
Definition: info.hpp:539
size_t m_snd_cong_ctl_wnd_count_approx
In congestion control, the approximate equivalent of m_snd_cong_ctl_in_flight_bytes as a full packet ...
Definition: info.hpp:602
size_t m_snd_rcv_wnd
The receive window (rcv_wnd a/k/a free Receive buffer space) value of the peer socket on the other si...
Definition: info.hpp:596
bool m_is_active_connect
true if this is the "client" socket (connect()ed); false otherwise (accept()ed).
Definition: info.hpp:536
size_t m_snd_pacing_packet_q_size
In pacing, number of packets currently queued to be sent out by the pacing module.
Definition: info.hpp:623
Fine_duration m_snd_round_trip_time_variance
RTTVAR used for m_snd_smoothed_round_trip_time calculation; it is the current RTT variance.
Definition: info.hpp:617
Peer_socket_receive_stats m_rcv
Stats for incoming direction of traffic. As opposed to the other m_rcv_* members, this typically accu...
Definition: info.hpp:508
Fine_duration m_snd_drop_timeout
Drop Timeout: how long a given packet must remain unacknowledged to be considered dropped due to Drop...
Definition: info.hpp:620
A set of low-level options affecting a single Peer_socket.
Definition: options.hpp:36
Fine_duration m_st_init_drop_timeout
Once socket enters ESTABLISHED state, this is the value for Peer_socket::m_snd_drop_timeout until the...
Definition: options.hpp:226
unsigned int m_st_max_rexmissions_per_packet
If retransmission is enabled and a given packet is retransmitted this many times and has to be retran...
Definition: options.hpp:220
size_t m_st_rcv_buf_max_size
Maximum number of bytes that the Receive buffer can hold.
Definition: options.hpp:141
size_t m_st_cong_ctl_max_cong_wnd_blocks
The constant that determines the CWND limit in Congestion_control_classic_data::congestion_window_at_...
Definition: options.hpp:296
Fine_duration m_st_snd_bandwidth_est_sample_period_floor
When estimating the available send bandwidth, each sample must be compiled over at least this long of...
Definition: options.hpp:267
unsigned int m_st_cong_ctl_cong_avoidance_increment_blocks
The multiple of max-block-size by which to increment CWND in congestion avoidance mode after receivin...
Definition: options.hpp:306
size_t m_st_cong_ctl_cong_wnd_on_drop_timeout_blocks
On Drop Timeout, set congestion window to this value times max-block-size.
Definition: options.hpp:299
size_t m_st_cong_ctl_init_cong_wnd_blocks
The initial size of the congestion window, given in units of max-block-size-sized blocks.
Definition: options.hpp:277
bool m_st_rexmit_on
Whether to enable reliability via retransmission.
Definition: options.hpp:214
size_t m_st_snd_buf_max_size
Maximum number of bytes that the Send buffer can hold.
Definition: options.hpp:134
Fine_duration m_st_connect_retransmit_period
How often to resend SYN or SYN_ACK while SYN_ACK or SYN_ACK_ACK, respectively, has not been received.
Definition: options.hpp:121
Fine_duration m_dyn_rcv_wnd_recovery_timer_period
When the mode triggered by rcv-buf-max-size-to-advertise-percent being exceeded is in effect,...
Definition: options.hpp:333
Fine_duration m_st_connect_retransmit_timeout
How long from the first SYN or SYN_ACK to allow for connection handshake before aborting connection.
Definition: options.hpp:124
size_t m_st_max_full_blocks_before_ack_send
If there are at least this many TIMES max-block-size bytes' worth of individual acknowledgments to be...
Definition: options.hpp:198
Fine_duration m_st_delayed_ack_timer_period
The maximum amount of time to delay sending ACK with individual packet's acknowledgment since receivi...
Definition: options.hpp:191
unsigned int m_dyn_drop_timeout_backoff_factor
Whenever the Drop Timer fires, upon the requisite Dropping of packet(s), the DTO (Drop Timeout) is se...
Definition: options.hpp:325
size_t m_st_max_block_size
The size of block that we will strive to (and will, assuming at least that many bytes are available i...
Definition: options.hpp:114
unsigned int m_st_cong_ctl_classic_wnd_decay_percent
In classic congestion control, RFC 5681 specifies the window should be halved on loss; this option al...
Definition: options.hpp:314
unsigned int m_st_rcv_buf_max_size_to_advertise_percent
% of rcv-buf-max-size that has to be freed, since the last receive window advertisement,...
Definition: options.hpp:171
unsigned int m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent
The limit on the size of Peer_socket::m_rcv_packets_with_gaps, expressed as what percentage the maxim...
Definition: options.hpp:183
Fine_duration m_dyn_drop_timeout_ceiling
Ceiling to impose on the Drop Timeout.
Definition: options.hpp:317
Represents the remote endpoint of a Flow-protocol connection; identifies the UDP endpoint of the remo...
Definition: endpoint.hpp:93
util::Udp_endpoint m_udp_endpoint
UDP address (IP address/UDP port) where the Node identified by this endpoint bound its low-level UDP ...
Definition: endpoint.hpp:97
#define FLOW_UTIL_WHERE_AM_I_STR()
Same as FLOW_UTIL_WHERE_AM_I() but evaluates to an std::string.
Definition: util_fwd.hpp:971