peer_socket.cpp
1/* Flow
2 * Copyright 2023 Akamai Technologies, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the
5 * "License"); you may not use this file except in
6 * compliance with the License. You may obtain a copy
7 * of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in
12 * writing, software distributed under the License is
13 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
14 * CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing
16 * permissions and limitations under the License. */
17
18/// @file
25#include "flow/async/util.hpp"
26#include <boost/algorithm/string.hpp>
27#include <boost/tuple/tuple.hpp>
28#include <utility>
29
30namespace flow::net_flow
31{
32
33// Implementations.
34
35// Peer_socket implementations.
36
38 util::Task_engine* task_engine,
39 const Peer_socket_options& opts) :
40 Log_context(logger_ptr, Flow_log_component::S_NET_FLOW),
41 m_opts(opts),
42 m_active_connect(false), // Meaningless; set explicitly.
43 m_state(State::S_CLOSED), // Incorrect; set explicitly.
44 m_open_sub_state(Open_sub_state::S_DISCONNECTING), // Incorrect; set explicitly.
45 m_node(0), // Incorrect; set explicitly.
46 m_rcv_buf(logger_ptr, 0), // Receive buffer mode: block size irrelevant (see Socket_buffer doc header).
47 // Send buffer mode: pack data into block-sized chunks for dequeueing speed. See Socket_buffer doc header.
48 m_snd_buf(logger_ptr, max_block_size()),
49 m_serialized_metadata(logger_ptr),
50 m_local_port(S_PORT_ANY), // Incorrect; set explicitly.
51 m_int_state(Int_state::S_CLOSED), // Incorrect; set explicitly.
52 m_rcv_syn_rcvd_data_cumulative_size(0), // Meaningless unless queue has elements but might as well initialize.
53 m_rcv_reassembly_q_data_size(0),
54 m_rcv_pending_acks_size_at_recv_handler_start(0),
55 m_snd_pending_rcv_wnd(0), // Meaningless originally but might as well initialize.
56 m_rcv_last_sent_rcv_wnd(0),
57 m_rcv_in_rcv_wnd_recovery(false),
58 m_rcv_delayed_ack_timer(*task_engine),
59 m_snd_flying_bytes(0),
60 m_snd_last_order_num(0),
61 m_snd_rexmit_q_size(0),
62 m_snd_remote_rcv_wnd(0),
63 m_snd_smoothed_round_trip_time(0),
64 m_round_trip_time_variance(0),
65 m_snd_drop_timeout(0),
66 m_snd_pacing_data(task_engine),
67 m_security_token(0), // Incorrect; set explicitly.
68 m_init_rexmit_count(0)
69{
70 // Only print pointer value, because most members are garbage at this point.
71 FLOW_LOG_TRACE("Peer_socket [" << static_cast<void*>(this) << "] created.");
72
73 // Log initial option values. Arguable if this should be INFO or TRACE. @todo Reconsider?
74 FLOW_LOG_TRACE("\n\n" << options());
75}
76
78{
79 /* Note that m_snd_cong_ctl, m_snd_bandwidth_estimator (etc.) and others store no Ptr(this),
80 * so this dtor will indeed execute (no circular shared_ptr problem). */
81
82 FLOW_LOG_TRACE("Peer_socket [" << this << "] destroyed.");
83}
84
86{
87 Lock_guard lock(m_mutex); // State is liable to change at any time.
88 if (open_sub_state && (m_state == State::S_OPEN))
89 {
90 *open_sub_state = m_open_sub_state;
91 }
92 return m_state;
93}
94
96{
97 Lock_guard lock(m_mutex); // m_node can simultaneously change to 0 if state changes to S_CLOSED.
98 return m_node;
99}
100
102{
103 Lock_guard lock(m_mutex);
104 return m_disconnect_cause;
105}
106
107bool Peer_socket::sync_send(const boost::asio::null_buffers& tag, Error_code* err_code)
108{
109 return sync_send(tag, Fine_duration::max(), err_code);
110}
111
113{
114 // Similar to sync_send_impl(), so keeping comments light. Reminder: Goal is to wait until *this is Writable.
115
116 namespace bind_ns = util::bind_ns;
117 using bind_ns::bind;
118
120 bind_ns::cref(wait_until), _1);
121
122 Lock_guard lock(m_mutex);
123
124 const Function<size_t (size_t)> empty_snd_buf_feed_func;
125 assert(empty_snd_buf_feed_func.empty());
126
127 lock.release();
128
129 // Intentionally pass empty function obj to indicate "reactor pattern" mode.
130 node_sync_send(empty_snd_buf_feed_func, wait_until, err_code);
131 return !*err_code; // Socket is Writable if and only if !*err_code (i.e., no timeout or other error while waiting).
132}
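
The overload just above is the "reactor pattern" form of sync_send(): passing boost::asio::null_buffers means "wait until the socket is Writable, but transfer no data," leaving the actual write to the caller. A minimal, hypothetical usage sketch follows; it assumes a connected Peer_socket::Ptr named sock and that the send() template accepts a boost.asio const-buffer sequence, so treat the exact calls as illustrative rather than authoritative.

// Hypothetical wait-then-write usage of the reactor-pattern overload (illustrative only).
Error_code err;
if (sock->sync_send(boost::asio::null_buffers(), &err)) // Block until *sock is Writable; nothing is copied here.
{
  const std::string payload("hello");
  sock->send(boost::asio::buffer(payload), &err);       // Caller then performs the actual non-blocking write.
}
if (err)
{
  // Timed out, connection closed, etc.
}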
133
134size_t Peer_socket::node_send(const Function<size_t (size_t max_data_size)>& snd_buf_feed_func,
135 Error_code* err_code)
136{
137 // Everything is locked. (See send() template.)
138
139 const Ptr sock = shared_from_this();
140 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
141 {
142 return 0;
143 }
144 // else m_node is valid.
145
146 return m_node->send(sock, snd_buf_feed_func, err_code);
147}
148
149size_t Peer_socket::node_sync_send(const Function<size_t (size_t max_data_size)>& snd_buf_feed_func_or_empty,
150 const Fine_time_pt& wait_until,
151 Error_code* err_code)
152{
153 using boost::adopt_lock;
154
155 // Everything is locked. (See sync_send() template.)
156 Lock_guard lock(m_mutex, adopt_lock); // Adopt already-locked mutex.
157
158 const Ptr sock = shared_from_this();
159 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
160 {
161 return 0;
162 }
163 // else m_node is valid.
164
165 /* Because all Node::sync_*() implementations would follow the same pattern (create Event_set,
166 * add Readable/Writable/Acceptable event, wait, try non-blocking op, if that fails try again with
167 * wait_until ever closer, etc.), for major code reuse we use the sync_op() function template and plug in
168 * the various Peer_socket/send-specific pieces as arguments.
169 *
170 * Performance cost: The only part about this that's not as fast as copy/pasting sync_op() N times, once
171 * for each type of socket/op, is the need to lambda the proper send() call into a function object.
172 * This amounts to storing and copying the arguments and the function pointer, which should not be
173 * too bad and is worth the code reuse IMO. */
174
175 lock.release(); // Again, release lock (mutex is still locked!).
176
177 /* Operating on Peer_sockets, returning size_t; Event_set socket set type is Peer_sockets.
178 * Object is sock; non-blocking operation is m_node->send(...) -- or N/A in "reactor pattern" mode.
179 * size_t(0) is the "would-block" return value for this operation. S_PEER_SOCKET_WRITABLE
180 * is the type of event to watch for here. */
181 return m_node
182 ->sync_op<Peer_socket, size_t>
183 (sock,
184 snd_buf_feed_func_or_empty.empty()
185 ? Function<size_t ()>() // Reactor pattern mode.
186 : Function<size_t ()>([this, sock, snd_buf_feed_func_or_empty, err_code]() -> size_t
187 { return m_node->send(sock, snd_buf_feed_func_or_empty, err_code); }),
189 wait_until, err_code);
190} // Peer_socket::node_sync_send()
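
node_sync_send() and node_sync_receive() differ only in the event type waited on and the non-blocking operation retried; everything else is factored into Node::sync_op(). The following is a generic skeleton of that wait-then-retry shape; it mirrors sync_op() only loosely (Flow's real version also handles Event_set plumbing, the "reactor pattern" mode, and error-code details), so take the names and signature as illustrative.

// Generic wait-then-retry skeleton; names and signature are illustrative, not Flow's sync_op().
#include <chrono>
#include <functional>

template<typename Result>
Result sync_op_sketch(const std::function<Result ()>& non_blocking_op, // E.g., a send() or receive() attempt.
                      const std::function<bool (std::chrono::steady_clock::time_point)>& wait_until_ready,
                      Result would_block_value,                        // E.g., size_t(0).
                      std::chrono::steady_clock::time_point deadline)
{
  Result result = non_blocking_op();        // Try immediately, without waiting.
  while (result == would_block_value)
  {
    if (!wait_until_ready(deadline))        // Wait for Writable/Readable, up to the deadline.
    {
      return would_block_value;             // Timed out; in the real code *err_code would say so.
    }
    result = non_blocking_op();             // Readiness was signaled; retry the non-blocking operation.
  }
  return result;
}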
191
192bool Peer_socket::sync_receive(const boost::asio::null_buffers& tag, Error_code* err_code)
193{
194 return sync_receive(tag, Fine_duration::max(), err_code);
195}
196
198{
199 // Similar to sync_receive_impl(), so keeping comments light. Reminder: Goal is to wait until *this is Readable.
200
201 namespace bind_ns = util::bind_ns;
202 using bind_ns::bind;
203
205 bind_ns::cref(wait_until), _1);
206
207 Lock_guard lock(m_mutex);
208
209 const Function<size_t ()> empty_rcv_buf_consume_func;
210 assert(empty_rcv_buf_consume_func.empty());
211
212 lock.release();
213
214 // Intentionally pass empty function obj to indicate "reactor pattern" mode.
215 node_sync_receive(empty_rcv_buf_consume_func, wait_until, err_code);
216 return !*err_code; // Socket is Readable if and only if !*err_code (i.e., no timeout or other error while waiting).
217}
218
219size_t Peer_socket::node_receive(const Function<size_t ()>& rcv_buf_consume_func,
220 Error_code* err_code)
221{
222 // Everything is locked. (See receive() template.)
223
224 const Ptr sock = shared_from_this();
225 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
226 {
227 return 0;
228 }
229 // else m_node is valid.
230
231 return m_node->receive(sock, rcv_buf_consume_func, err_code);
232}
233
234size_t Peer_socket::node_sync_receive(const Function<size_t ()>& rcv_buf_consume_func_or_empty,
235 const Fine_time_pt& wait_until,
236 Error_code* err_code)
237{
238 using boost::adopt_lock;
239
240 // Everything is locked. (See sync_send() template.)
241 Lock_guard lock(m_mutex, adopt_lock); // Adopt already-locked mutex.
242
243 const Ptr sock = shared_from_this();
244 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
245 {
246 return 0;
247 }
248 // else m_node is valid.
249
250 lock.release(); // Again, release lock (mutex is still locked!).
251
252 // See comment in Peer_socket::node_sync_send().
253
254 /* Operating on Peer_sockets, returning size_t; Event_set socket set type is Peer_sockets.
255 * Object is sock; non-blocking operation is m_node->receive(...) -- or N/A in "reactor pattern" mode.
256 * size_t(0) is the "would-block" return value for this operation. S_PEER_SOCKET_READABLE
257 * is the type of event to watch for here. */
258 return m_node
259 ->sync_op<Peer_socket, size_t>
260 (sock,
261 rcv_buf_consume_func_or_empty.empty()
262 ? Function<size_t ()>() // Reactor pattern mode.
263 : Function<size_t ()>([this, sock, rcv_buf_consume_func_or_empty, err_code]() -> size_t
264 { return m_node->receive(sock, rcv_buf_consume_func_or_empty, err_code); }),
266 wait_until, err_code);
267} // Peer_socket::node_sync_receive()
268
270{
272 ([this](Error_code* actual_err_code) { close_abruptly(actual_err_code); },
273 err_code, FLOW_UTIL_WHERE_AM_I_STR()))
274 {
275 return;
276 }
277 // else
278
279 // We are in user thread U != W.
280
281 Lock_guard lock(m_mutex); // Lock m_node/m_state; also it's a pre-condition for Node::close_abruptly().
282
283 const Ptr sock = shared_from_this();
284 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
285 {
286 // *err_code will be set to original close reason (m_disconnect_cause) in this case, as advertised.
287 return;
288 }
289 // else m_node is valid.
290
291 // Forward to Node, as is the general pattern for Peer_socket method implementations.
292 lock.release(); // Let go of the mutex (mutex is still LOCKED).
293 m_node->close_abruptly(sock, err_code);
294 // No m_mutex.unlock(): Node::close_abruptly() MUST take care of it.
295} // Peer_socket::close_abruptly()
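
The lock.release() calls in this file rely on one idiom worth spelling out: releasing a guard abandons its ownership without unlocking, so the mutex stays locked across the call into Node, which then becomes responsible for unlocking it. A minimal sketch with standard types (not Flow's Lock_guard/mutex aliases):

// Minimal sketch of the release-but-stay-locked hand-off used above.
#include <mutex>

std::mutex m;

void callee_must_unlock() // Analogous to Node::close_abruptly(): must unlock m before returning.
{
  // ... work that requires m to be held ...
  m.unlock();
}

void caller() // Analogous to Peer_socket::close_abruptly(); runs in one thread throughout.
{
  std::unique_lock<std::mutex> lock(m); // Locks m.
  // ... checks that require the lock (e.g., ensure the socket is open) ...
  lock.release();        // Give up ownership WITHOUT unlocking; m remains locked.
  callee_must_unlock();  // Responsibility for unlocking has been handed off.
}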
296
298{
299 namespace bind_ns = util::bind_ns;
300 FLOW_ERROR_EXEC_AND_THROW_ON_ERROR(bool, Peer_socket::set_options, bind_ns::cref(opts), _1);
301 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
302
303 // We are in thread U != W.
304
305 Lock_guard lock(m_mutex); // Lock m_node at least.
306
307 const Ptr sock = shared_from_this();
308 if (!Node::ensure_sock_open(sock, err_code)) // Ensure it's open, so that we can access m_node.
309 {
310 return false;
311 }
312 // else m_node is valid.
313
314 // As is typical elsewhere, pass the rest of the logic to a Node method.
315 return m_node->sock_set_options(sock, opts, err_code);
316} // Peer_socket::set_options()
317
319{
320 return opt(m_opts);
321}
322
324{
325 // We are in user thread U != W.
326
327 /* There are two cases. If the socket is open (not S_CLOSED), then an m_node owns it and may
328 * change the stats we want to copy in its thread W at any time. In this case we must copy it in
329 * thread W (which we do using a future and io_service::post(), as in listen() and other places in
330 * Node). If the socket is closed (S_CLOSED), then no m_node owns it, so there is no thread W
331 * applicable to this socket anymore, and we can just copy the data in thread U != W. */
332
333 Lock_guard lock(m_mutex); // Lock m_node; also it's a pre-condition for Node::sock_info().
334
335 const Const_ptr sock = shared_from_this();
336
337 // See which case it is.
338 Error_code dummy;
339 if (!Node::ensure_sock_open(sock, &dummy))
340 {
341 // Socket is closed. Done and done. Return the final stats cached at S_CLOSED time.
342 return m_info_on_close;
343 }
344 // else m_node is valid.
345
346 // Forward to Node, as is the general pattern for Peer_socket method implementations.
347 lock.release(); // Let go of the mutex (mutex is still LOCKED).
348 return m_node->sock_info(sock);
349 // No m_mutex.unlock(): Node::sock_info() MUST take care of it.
350} // Peer_socket::info()
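
The comment above describes the open-socket case: the stats must be copied in thread W, which is done by posting a task to W and blocking on a future. A generic sketch of that pattern (plain boost.asio plus std::promise/std::future, not Node's actual sock_info() plumbing):

// Generic "run it in the worker thread and wait for the result" sketch; illustrative only.
#include <boost/asio.hpp>
#include <future>

template<typename Result, typename Func>
Result run_in_worker_and_wait(boost::asio::io_context& worker_ctx, Func&& func)
{
  std::promise<Result> result_promise;
  auto result_future = result_promise.get_future();
  boost::asio::post(worker_ctx, [&]()
  {
    result_promise.set_value(func());  // Runs in thread W (the thread running worker_ctx); may touch W-owned state.
  });
  return result_future.get();          // Calling thread (U != W) blocks until W has produced the result.
}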
351
353{
355}
356
357size_t Peer_socket::max_block_size_multiple(const size_t& opt_val_ref,
358 const unsigned int* inflate_pct_val_ptr) const
359{
360 // Similar to opt() but specialized for this purpose. Lock once to get both values.
362
364 const unsigned int inflate_pct = inflate_pct_val_ptr ? (*inflate_pct_val_ptr) : 0;
365
366 /* We want N's nearest multiple M of B such that M >= N. M = ceil(N/B) * B (no actual floating-point math involved).
367 *
368 * Oh, and N is opt_val_ref inflated by K%, or opt_val_ref * (100 + K)%. */
369 return util::ceil_div(opt_val_ref * (100 + inflate_pct) / 100, max_block_size)
370 * max_block_size;
371}
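
The arithmetic above is integer-only: M = ceil(N/B) * B, with util::ceil_div supplying the ceiling division. A standalone illustration of the same computation (the ceil_div below is a stand-in for util::ceil_div, and the 1024-byte block size is just an example value):

// Standalone illustration of rounding an inflated option value up to a multiple of the block size.
#include <cassert>
#include <cstddef>

inline std::size_t ceil_div(std::size_t n, std::size_t b) { return (n + b - 1) / b; }

inline std::size_t nearest_multiple_at_least(std::size_t n, std::size_t b) { return ceil_div(n, b) * b; }

int main()
{
  // N = 3000, inflated by 10% -> 3300; nearest multiple of B = 1024 that is >= 3300 is 4096.
  const std::size_t inflated = 3000 * (100 + 10) / 100; // 3300 (integer math only).
  assert(nearest_multiple_at_least(inflated, 1024) == 4096);
  return 0;
}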
372
374{
375 return opt(m_opts.m_st_rexmit_on);
376}
377
379{
380 // Can't change; no locking needed. Safe info even if S_CLOSED.
381 return m_remote_endpoint;
382}
383
385{
386 // Can't change; no locking needed. Safe (if outdated) info even if S_CLOSED.
387 return m_local_port;
388}
389
390size_t Peer_socket::get_connect_metadata(const boost::asio::mutable_buffer& buffer,
391 Error_code* err_code) const
392{
393 namespace bind_ns = util::bind_ns;
394 using std::memcpy;
395
397 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
398
399 // We are in user thread U != W.
400
401 Lock_guard lock(m_mutex); // Lock m_serialized_metadata (it can be changed in sock_free_memory()).
402
403 if (!ensure_open(err_code)) // Ensure it's open; otherwise m_serialized_metadata has been cleared.
404 {
405 return 0;
406 }
407 // else m_serialized_metadata is valid.
408
409 err_code->clear();
410 const size_t size = std::min(m_serialized_metadata.size(), buffer.size());
411 if (size != 0)
412 {
413 memcpy(buffer.data(), m_serialized_metadata.const_data(), size);
414 }
415
416 return size;
417} // Peer_socket::get_connect_metadata()
418
420{
421 return Node::ensure_sock_open(shared_from_this(), err_code);
422}
423
424std::string Peer_socket::bytes_blocks_str(size_t bytes) const
425{
427 using std::flush;
428
429 const auto block = max_block_size();
430 String_ostream os;
431 os.os() << bytes << '~' << (bytes / block);
432 if ((bytes % block) != 0)
433 {
434 os.os() << '+';
435 }
436 os.os() << flush;
437 return os.str();
438}
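
As a concrete illustration of the format produced above: if max_block_size() were 1024, then bytes_blocks_str(2048) would yield "2048~2" (an exact two blocks), while bytes_blocks_str(2500) would yield "2500~2+" (two full blocks plus a partial one, flagged by the trailing '+').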
439
441 boost::shared_ptr<Data_packet> packet,
442 const Sent_when& sent_when) :
443 m_size(packet->m_data.size()),
444 m_sent_when({ sent_when }),
445 m_acks_after_me(0),
446 m_packet(rexmit_on ? packet : boost::shared_ptr<Data_packet>()) // Store packet only if we may have to rexmit later.
447{
448 // Nothing.
449}
450
452 m_size(size),
453 m_data(logger_ptr)
454{
455 if (src_data)
456 {
457 // Retransmission is on: save *src_data for later reassembly.
458 assert(m_size == size); // As promised in docs....
459
460 m_data = std::move(*src_data); // O(1) operation -- *src_data is probably cleared.
461 }
462}
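
The std::move() above is the key to the constructor's O(1) cost: the payload's storage is transferred, not copied, leaving the source container empty-ish. A minimal illustration with std::vector standing in for util::Blob (whose exact moved-from state is its own documented contract):

// Minimal illustration of the O(1) payload hand-off the constructor relies on.
#include <cstdint>
#include <utility>
#include <vector>

int main()
{
  std::vector<std::uint8_t> src(4096, 0xff);         // Incoming packet payload.
  std::vector<std::uint8_t> stored = std::move(src); // Ownership transfer; no per-byte copy.
  // `stored` now owns the 4096 bytes; `src` is valid but unspecified (typically empty).
  return static_cast<int>(stored.size()) - 4096;     // 0
}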
463
464// Node implementations (dealing with individual Peer_sockets).
465
466// Static initializations.
467
468// Per RFC 5681 (Reno Fast Recovery; used in other congestion control specifications as well to detect drops).
470const uint8_t Node::S_DEFAULT_CONN_METADATA = 0; // Keep in sync with get_connect_metadata() doc header.
471
472// Implementations.
473
475 Peer_socket::Ptr sock,
476 boost::shared_ptr<const Syn_ack_packet> syn_ack)
477{
478 // We are in thread W.
479
480 /* We'd sent SYN and just got SYN_ACK. Assuming their SYN is valid, our side of connection can
481 * move to ESTABLISHED state. We can also complete the other side's connection by sending
482 * SYN_ACK_ACK. */
483
484 FLOW_LOG_INFO("NetFlow worker thread continuing active-connect of [" << sock << "]. "
485 "Received [" << syn_ack->m_type_ostream_manip << "] with "
486 "ISN [" << syn_ack->m_init_seq_num << "]; "
487 "security token [" << syn_ack->m_packed.m_security_token << "].");
488
489 // Send SYN_ACK_ACK to finish the handshake.
490
491 if (!async_low_lvl_syn_ack_ack_send_or_close_immediately(sock, syn_ack))
492 {
493 return;
494 }
495 /* send will happen asynchronously, and the registered completion handler will execute in this
496 * thread when done (NO SOONER than this method finishes executing). */
497
498 // No more errors.
499
500 // Handle the logical SYN part of their SYN_ACK.
501
502 // Save the start of the sequence number series based on their initial sequence number.
503 sock->m_rcv_init_seq_num = syn_ack->m_init_seq_num;
504 sock->m_rcv_next_seq_num = sock->m_rcv_init_seq_num + 1;
505
506 // Move ourselves to connected state.
507
508 // Public state.
510 // Internal state. SYN_SENT -> ESTABLISHED.
511 sock_set_int_state(sock, Peer_socket::Int_state::S_ESTABLISHED);
512
513 // Got the acknowledgment to SYN, so cancel retransmits and the timeout for that SYN.
514 cancel_timers(sock);
515
516 // Setup the Drop Timeout engine (m_snd_drop_timer).
517 setup_drop_timer(socket_id, sock);
518
519 // Record initial rcv_wnd; it should be the entire size of the other side's Receive buffer.
520 sock->m_snd_remote_rcv_wnd = syn_ack->m_packed.m_rcv_wnd;
521
522 /* Since sock is now connected and has an empty Send buffer, it is certainly now Writable.
523 * Therefore we should soon inform anyone waiting on any Event_sets for sock to become Writable.
524 *
525 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
526 * code. */
527
528 // Accumulate the event into the Node store (note: not any Event_set yet).
529 if (m_sock_events[Event_set::Event_type::S_PEER_SOCKET_WRITABLE].insert(sock).second)
530 {
531 // Possibly inform the user for any applicable Event_sets right now.
532 event_set_all_check_delta(true);
533 /* ^-- defer_delta_check == true: because the only way to get to this method is from
534 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
535 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
536 }
537} // Node::handle_syn_ack_to_syn_sent()
538
540 boost::shared_ptr<const Syn_ack_packet> syn_ack)
541{
542 // We are in thread W.
543
544 /* We're ESTABLISHED but got a duplicate (valid) SYN_ACK again. For reasons explained in
545 * handle_incoming() at the call to the current method, we simply give them a SYN_ACK_ACK again
546 * and continue like nothing happened. */
547
548 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
549 "In [" << Peer_socket::Int_state::S_ESTABLISHED << "] state "
550 "received duplicate [" << syn_ack->m_type_ostream_manip << "] with "
551 "ISN [" << syn_ack->m_init_seq_num << "]; "
552 "security token [" << syn_ack->m_packed.m_security_token << "]. "
553 "Could be from packet loss.");
554
555 // Everything has already been validated.
556
557 async_low_lvl_syn_ack_ack_send_or_close_immediately(sock, syn_ack);
558} // Node::handle_syn_ack_to_established()
559
561 Peer_socket::Ptr sock,
562 boost::shared_ptr<Data_packet> packet,
563 bool syn_rcvd_qd_packet)
564{
565 /* This is a complex method that does many things. Therefore readability is hard to accomplish, as the logic
566 * makes sense when writing it, but the big picture is hard to see when reading it. The necessary heavy commenting
567 * further increases the size and therefore (along that dimension) decreases readability. For these reasons,
568 * many logically distinct parts were placed into helper methods -- not to increase code reuse but to help
569 * the aforementioned consideration. */
570
571 // We are in thread W.
572
573 /* Connection is open, and we got data from other side. Note: For maintainability, this method features
574 * (and should continue to feature) mainly high-level flow control and method calls, as opposed to tons of lower-level
575 * detail (this should be factored out into methods being called).
576 *
577 * Summary of below (assuming no misbehavior by other side; also ignoring that every action is categorized
578 * in sock->m_rcv_stats for statistical purposes):
579 *
580 * - Determine `dupe` (is packet a duplicate of previously received packet?) by checking against
581 * sock->m_rcv_{next_seq_num|packets_with_gaps}. If so:
582 * - (Op AAA) Acknowledge packet (ACK to other side).
583 * - Return (do not close connection).
584 * - Determine `slide` (are packet's data the next expected [first -- by seq. # -- not-yet-received] data?)
585 * by checking against sock->m_rcv_{next_seq_num|packets_with_gaps}.
586 * - If retransmission is off:
587 * - (Op ###) Pass packet's data to Receive buffer sock->m_rcv_buf!
588 * - Except if that would overflow sock->m_rcv_buf, then return (do not close connection).
589 * - (Op %%%) Inform the event subsystem that Receive buffer is readable!
590 * - (Op AAA)
591 * - If (!slide):
592 * - Save packet info (except packet->m_data itself!) in sock->m_rcv_packets_with_gaps.
593 * - But if that overflows sock->m_rcv_packets_with_gaps, then also pretend
594 * gap before start of sock->m_rcv_packets_with_gaps has all been filled: set `slide = true;`.
595 * (This will cause below to pop sock->m_rcv_packets_with_gaps to not overflow.)
596 * - If `slide`:
597 * - (Op ***) Update sock->m_rcv_{next_seq_num|packets_with_gaps} (increment the former,
598 * possibly pop-front contiguous packets from the other).
599 * - Else, if retransmission is on:
600 * - If `slide`:
601 * - (Op ###)
602 * - (Op ***)
603 * - Plus, for each packet popped from sock->m_rcv_packets_with_gaps, in increasing seq. # order:
604 * Pass packet's data to Receive buffer sock->m_rcv_buf!
605 * - (Op %%%)
606 * - Else if (!slide):
607 * - Save packet info (including packet->m_data itself!) in sock->m_rcv_packets_with_gaps (reassembly queue).
608 * - But if that WOULD overflow sock->m_rcv_packets_with_gaps, then don't;
609 * and return (do not close connection).
610 * - (Op AAA) */
611
612 /* Set up some short-hand references to commonly used sock members. This should also help
613 * performance a little by skipping the shared_ptr dereference. (Should be safe since sock
614 * cannot get ref-counted down to zero in this method, unless there is an error, at which point
615 * we return anyway.) Just remember these are not simply local variables -- nor const references -- but refer
616 * to on-the-heap stuff! */
617 const bool rexmit_on = sock->rexmit_on();
618 const Sequence_number& seq_num = packet->m_seq_num;
619
620 auto& data = packet->m_data; // NOT const, since we may well be _moving_ this into Receive buffer, etc.
621 assert(!data.empty()); // This should have been verified immediately in handle_incoming().
622 // Save this before we possibly destroy `data`'s contents below when _moving_ into Receive buffer, etc.
623 const size_t data_size = data.size();
624
625 // Register one packet with N bytes of data (not necessarily acceptable data).
626 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
627 rcv_stats.total_data_packet(data_size);
628
629 // Before potential changes, log.
630
631 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
632 "Received [" << packet->m_type_ostream_manip << "] with "
633 "sequence number [" << seq_num << "]; data size [" << data_size << "].");
634 // Very verbose and CPU-intensive!
635 FLOW_LOG_DATA("Data [" << util::buffers_dump_string(data.const_buffer(), "", size_t(-1)) << "].");
636 // In below TRACE logging we will omit most of the above details, since they'll be already logged.
637
638 log_rcv_window(sock); // Especially log this state.
639
640 /* Compute `dupe` and `slide[_size]`, bits of info that are key to how the incoming packet fits into the rcv window.
641 * Also, regardless of anything else we need to register N bytes worth of data in DATA packets via
642 * one rcv_stats.<...>_data_packet(data_size); we can determine the <...> based on dupe, slide, or lack thereof. */
643
644 /* True will means it's a duplicate packet -- ACK but don't give to the user again.
645 * False will mean it's a new packet -- ACK and save to a buffer for eventual consumption (unless overflow). */
646 bool dupe;
647 // Will mean this packet is the first (by seq. #) unreceived packet we want. Only applies if !dupe.
648 bool slide;
649 /* ^-- @todo Eliminate this; use slide_size == 0 to mean !slide? Less state is a good thing.
650 * Also, slide_size can be assumed to be data_size, except in one case below -- *never* via
651 * sock_categorize_data_to_established(); both of these improvements will lead to cleaner code. */
652 size_t slide_size; // If (slide), this will be how much to increment m_rcv_next_seq_num.
653
654 const Error_code cat_result = sock_categorize_data_to_established(sock, packet, &dupe, &slide, &slide_size);
655 if (cat_result)
656 {
657 // Register one packet with N bytes of data (not acceptable due to error).
658 rcv_stats.error_data_packet(data_size);
659
660 /* Close connection in our structures (inform user if necessary as well). Pre-conditions
661 * assumed by call: sock in m_socks and sock->state() == S_OPEN (yes, since m_int_state ==
662 * S_ESTABLISHED); 3rd arg contains the reason for the close (yes). This will empty the Send
663 * and Receive buffers. That is OK, because this is the abrupt type of close (error). */
664 rst_and_close_connection_immediately(socket_id, sock, cat_result, true);
665 // ^-- defer_delta_check == true: for similar reason as in handle_syn_ack_ack_to_syn_rcvd().
666 return;
667 }
668 // else
669
670 // If we got here, no error so far; `dupe` and `slide` are both set properly.
671
672 if (dupe)
673 {
674 /* It's a duplicate received packet. We should still acknowledge every valid packet, even if
675 * duplicate, since at least it helps the other side measure congestion. Is it "lying," since
676 * we're throwing this dupe away? No, because we DID receive it earlier; and in fact that
677 * earlier packet's ACK packet may have itself gotten lost by the network. (Example: A sends P
678 * to B; A receives and responds with ACK of P; that's lost; A receives dupe of P and responds
679 * with ACK; B receives that ACK. Good.) Anyway if the other side doesn't like it, it can just
680 * ignore it.
681 *
682 * It is also important to ack a duplicate packet, if retransmission is enabled. For example,
683 * sender may send packet X, and we'll ack it; but the ACK may be lost. Then the sender will
684 * retransmit X thinking X was lost; if we don't ACK the retransmitted one, the sender will
685 * retransmit again, until it runs out of retransmissions and closes connection... all because
686 * of one lousy lost ACK. */
687
688 // Plenty of TRACE logging about duplicate packets above; and here is probably too verbose for an INFO; => no log.
689
690 // Register one packet with N bytes of data (not acceptable into Receive buffer but probably legal, just late).
691 rcv_stats.late_or_dupe_data_packet(data_size);
692
693 // Register one individual acknowledgment of N bytes of data (will go out but acks late DATA).
694 rcv_stats.late_or_dupe_to_send_ack_packet(data_size);
695
696 // ACK will happen asynchronously (not in this handler, and at best once UDP net-stack considers itself writable).
697 async_acknowledge_packet(sock, seq_num, packet->m_rexmit_id, data_size); // rcv_stats kept inside.
698 return;
699 }
700 // else if (!dupe), i.e. data to be saved in Receive buffer or reassembly queue (unless overflow).
701
702 // Register one packet with N bytes of data (legal and acceptable into Receive buffer).
703 rcv_stats.good_data_packet(data.size());
704
705 /* Behavior is different at this point depending on whether retransmission is enabled or
706 * disabled. Many of the building blocks are the same and have been factored out into helpers. */
707
708 if (!rexmit_on)
709 {
710 /* No retransmission, so things are fairly simple. Firstly any new received data go
711 * straight to Receive buffer (out of order or not). */
712
713 if (!sock_data_to_rcv_buf_unless_overflow(sock, packet))
714 {
715 /* Not so fast. There's no space in the Receive buffer, so there's no choice except to drop the
716 * packet despite all of the above. Note that this means the packet was not "received" (and
717 * we can't slide the window forward either).
718 *
719 * Should we RST/close? Absolutely not. The sender did nothing wrong (except maybe they suck
720 * at detecting congestion caused by our user not reading the Receive buffer fast enough and
721 * thus letting it fill up, or maybe they just suck at congestion control). Our user is not
722 * consuming the Receive buffer in time. We drop packet and let chips fall where they may
723 * (reliability measures will handle it).
724 *
725 * Should we still acknowledge it? No. Dropping a packet at this late stage is still
726 * dropping a packet and indicates congestion of the network, of sorts; if we ACK it, the
727 * other side will assume the packet is being delivered and won't slow down its packet
728 * onslaught. So nothing else to do. */
729 return;
730 }
731
732 /* DO NOT use `data` from this point forward -- it was just emptied by sock_data_to_rcv_buf_unless_overflow()!
733 * data_size is fine. */
734
735 /* Since sock now has a non-empty Receive buffer, it is certainly now Readable. Handle implications
736 * on relevant waiting Event_sets. */
737 sock_rcv_buf_now_readable(sock, syn_rcvd_qd_packet);
738
739 // Successfully wrote to Receive buffer. Can certainly acknowledge it at this point.
740
741 // Register one individual acknowledgment of N bytes of data (will go out and acks new, acceptable DATA).
742 rcv_stats.good_to_send_ack_packet(data_size);
743
744 // ACK will happen asynchronously (not in this handler, and at best once UDP net-stack considers itself writable).
745 async_acknowledge_packet(sock, seq_num, 0, data_size); // rcv_stats kept inside.
746
747 if (!slide)
748 {
749 /* !slide means new packet didn't resolve the first unreceived gap; hence by definition
750 * sock->m_rcv_packets_with_gaps must be updated. Due to certain overflow mechanisms, this may also
751 * cause the removal of part of the first gap, ironically! So pass in &slide, etc.
752 *
753 * Pass in data_size, since data.size() would run on an emptied `data` as noted above and be useless. */
754 sock_track_new_data_after_gap_rexmit_off(sock, packet, data_size, &slide, &slide_size);
755
756 // `slide` may now be true or not.
757 }
758
759 // `slide` may now be true or not.
760
761 /* Finally, update the window, since we've received a new packet. Maintain large invariant described in doc headers
762 * for Peer_socket::m_rcv_packets_with_gaps and related members. */
763
764 if (slide)
765 {
766 sock_slide_rcv_next_seq_num(sock, slide_size, false);
767 }
768 } // if (!rexmit_on)
769 else // if (rexmit_on)
770 {
771 /* Retransmission is on, so we have to deal with the reassembly queue. Namely if this packet
772 * fills the gap between stuff already given to Receive buffer and the first packet in the
773 * reassembly queue, then we should feed-to-user not just the new packet but also all contiguous packets
774 * at the front of the queue into Receive buffer. If it does not fill it, then we have to add
775 * it to reassembly queue in the proper spot. */
776
777 if (slide)
778 {
779 // New packet filled at least part of the first gap. So we should feed it to Receive buffer.
780
781 if (!sock_data_to_rcv_buf_unless_overflow(sock, packet))
782 {
783 /* Not so fast. If there's no space in the Receive buffer, there's no choice except to drop the
784 * packet despite all of the above. All comments from same spot in the no-retransmission
785 * code above apply (not repeating here). */
786 return;
787 }
788 // else
789
790 /* DO NOT use `data` from this point forward -- it was just emptied by sock_data_to_rcv_buf_unless_overflow().
791 * data_size is fine. */
792
793 /* Now update the receive window structure. Maintain invariants described in doc headers
794 * for m_rcv_packets_with_gaps and related members. Additionally, since retransmission is
795 * on, if the new packet bridged gap to the first packet(s) in the reassembly queue, then
796 * add their data to Receive buffer also (the `true` argument triggers this). */
797
798 sock_slide_rcv_next_seq_num(sock, slide_size, true);
799
800 /* Since sock now has a non-empty Receive buffer, it is certainly now Readable. Handle implications
801 * on relevant waiting Event_sets. */
802 sock_rcv_buf_now_readable(sock, syn_rcvd_qd_packet);
803 } // if (slide)
804 else if (!sock_data_to_reassembly_q_unless_overflow(sock, packet)) // && (!slide)
805 {
806 /* Out-of-order packet. Couldn't feed to Receive buffer, so fed to reassembly queue (in sock_data_to_reass...()).
807 * However, if we're here, then that indicated we overflowed reassembly queue and decided to drop the packet
808 * instead. Bail out; which essentially just means don't acknowledge it, as that would occur just below. */
809 return;
810 }
811
812 // Either fed to Receive buffer or reassembly queue. Can certainly acknowledge it at this point.
813
814 // Register one individual acknowledgment of N bytes of data (will go out and acks new, acceptable DATA).
815 rcv_stats.good_to_send_ack_packet(data_size);
816
817 // ACK will happen asynchronously (not in this handler, and at best once UDP net-stack considers itself writable).
818 async_acknowledge_packet(sock, seq_num, packet->m_rexmit_id, data_size); // More rcv_stats kept inside.
819 } // else if (rexmit_on)
820
821 // After changes, log.
822 log_rcv_window(sock);
823} // Node::handle_data_to_established()
824
826 boost::shared_ptr<const Data_packet> packet,
827 bool* dupe, bool* slide, size_t* slide_size)
828{
829 assert(dupe && slide && slide_size);
830
831 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
832 * flow in that caller first.
833 *
834 * Note: not dealing with rcv_stats, as it's less code (assuming 1 call to us anyway) to do it based on our result. */
835
836 // See comment in same spot in handle_data_to_established().
837 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
838 const Sequence_number& rcv_next_seq_num = sock->m_rcv_next_seq_num;
839 const Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
840
841 const auto& data = packet->m_data;
842 const Sequence_number& seq_num = packet->m_seq_num;
843
844 // Get the sequence number just past the last datum in this packet.
845 Sequence_number seq_num_end = seq_num;
846 advance_seq_num(&seq_num_end, data.size());
847
848 // If false, all received packets are followed by all unreceived ones. Otherwise there's at least 1 gap.
849 bool first_gap_exists;
850 // If true, then this is the sequence number of the first datum right after that first gap.
851 Sequence_number seq_num_after_first_gap;
852 rcv_get_first_gap_info(sock, &first_gap_exists, &seq_num_after_first_gap);
853
854 // Validate the 1st sequence number in DATA against the ISN.
855
856 if (seq_num <= sock->m_rcv_init_seq_num)
857 {
858 /* Sequence number precedes or equals the original SYN's sequence number. Either the other side
859 * is an a-hole, or somehow a socket_id was reused from a recent connection, which we do try to
860 * avoid like the plague. Therefore, send them an RST and abort connection. If they send more
861 * data packets to this port (which is quite possible; many could already be on the way),
862 * they'll get more RSTs still. */
863
864 // Interesting/rare enough to log a WARNING.
865 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
866 "Received [" << packet->m_type_ostream_manip << "] with "
867 "sequence number [" << seq_num << "]; data size [" << data.size() << "]; "
868 "sequence number precedes "
869 "ISN [" << sock->m_rcv_init_seq_num << "].");
870
871 return error::Code::S_SEQ_NUM_IMPLIES_CONNECTION_COLLISION; // Bad behavior from other side is fatal.
872 }
873 // else if (seq_num > sock->m_rcv_init_seq_num)
874
875 if (seq_num < rcv_next_seq_num)
876 {
877 /* The packet claims to begin BEFORE the first gap (i.e., unreceived packet). This may be a
878 * valid duplicate packet. First, though, ensure it's not a "straddling" packet, i.e., that its
879 * last datum's sequence number is not past rcv_next_seq_num. If it is, that would imply one
880 * sequence number's datum is in two packets that are not duplicates of each other which is illegal. */
881
882 if (seq_num_end > rcv_next_seq_num)
883 {
884 // Interesting/rare enough to log a WARNING.
885 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
886 "Received [" << packet->m_type_ostream_manip << "] with "
887 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
888 "data size [" << data.size() << "]; "
889 "straddle first unreceived "
890 "sequence number [" << rcv_next_seq_num << "].");
891
892 // Yep, it straddles the boundary. Other side is behaving badly. RST/close as above.
893 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
894 }
895 // else ([seq_num, end seq_num] is before the first unreceived packet sequence, a/k/a gap)
896
897 FLOW_LOG_TRACE("Duplicate packet before first unreceived sequence number [" << rcv_next_seq_num << "].");
898
899 *dupe = true;
900 *slide = false;
901 return Error_code();
902 } // if (seq_num < rcv_next_seq_num)
903 // else if (seq_num >= rcv_next_seq_num)
904
905 /* Packet claims to be in what TCP would call the receive window (somewhere at or after the
906 * first gap). Pin down in what part of that space it is, in order of increasing seq. #s. */
907
908 // First see if it's right at the start of the first gap.
909
910 if (seq_num == rcv_next_seq_num)
911 {
912 /* Great. It's at the start of the first gap, so we should be able to advance the window
913 * (increment rcv_next_seq_num). First check that it doesn't straddle the next received packet
914 * after the gap, if any. (Again, if it does that means one sequence number is inside 2
915 * packets that aren't dupes of each other, which is illegal.) */
916 if (first_gap_exists && (seq_num_end > seq_num_after_first_gap))
917 {
918 // Interesting/rare enough to log a WARNING.
919 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
920 "Received [" << packet->m_type_ostream_manip << "] with "
921 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
922 "data size [" << data.size() << "]; "
923 "supposed gap-filling data "
924 "straddle the boundary of packet [" << seq_num_after_first_gap << ", ...).");
925
926 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
927 }
928 // else legal -- can slide window to the right and save to Receive buffer.
929
930 FLOW_LOG_TRACE("Packet filled first [" << data.size() << "] unreceived sequence numbers "
931 "starting with [" << rcv_next_seq_num << "].");
932
933 *dupe = false;
934 *slide = true;
935 *slide_size = size_t(seq_num_end - seq_num);
936 assert(*slide_size == data.size());
937 return Error_code();
938 }
939
940 // else if:
941 assert(seq_num > rcv_next_seq_num);
942
943 *slide = false; // This much is certain, as we're not filling the first gap from the front.
944
945 /* Packet doesn't fill that first gap. It's somewhere after the start of the first gap. Now
946 * there are 3 possibilities:
947 *
948 * -1- It's illegal: it straddles the boundary of one of the packets in m_rcv_packets_with_gaps,
949 * meaning some sequence number is inside 2 non-identical packets. RST/close as above.
950 *
951 * -2- It is a duplicate (same starting sequence number and length) of one of the packets
952 * past the first gap (i.e., of the packets in rcv_packets_with_gaps). Thus dupe =
953 * true (we should ACK but not save to Receive buffer).
954 *
955 * -3- It fits into one of the gaps; i.e. its sequence number range is either entirely
956 * before that of rcv_packets_with_gaps; entirely after it; or entirely before the
957 * first sequence number of an element of rcv_packets_with_gaps AND entirely after the
958 * last sequence number of the preceding element of rcv_packets_with_gaps. Thus we
959 * should ACK and save to Receive buffer.
960 *
961 * Determine which one it is.
962 *
963 * @todo Below technique is fun and all, but I now suspect the following might be simpler:
964 * 1, is seq_num in rcv_packets_with_gaps already? If so but different length, error; if so but
965 * same length, *dupe is true. Otherwise: 2, insert a thing representing `packet` into rcv_packets_with_gaps
966 * as if for real; call inserted thing P. 3, check for straddling against right edge of prior(P), if any;
967 * if so, error. 4, check for straddling against left edge of next(P), if any; if so, error.
968 * 5, *dupe is false. The problem? It requires insertion, when this is supposed to not modify `packet` but only
969 * categorize it. Can of course remove it at the end, but that's cheesy. Can also modify our contract
970 * accordingly, but that reduces separation of concerns in caller's algorithm. Also, possibly the resulting
971 * algorithm might be easier to grok but not much shorter, if at all, anyway. Finally, could leave the
972 * straddling detection to later parts of the algorithm (again, changing our contract to be weaker though).
973 * In any case, not a top concern; and in terms of performance I doubt it would differ much from below. */
974
975 /* Find where we are compared to the various received packets past the first gap.
976 * This gets the first packet whose first sequence number is >= seq_num. There are 3 possibilities:
977 * it is equal to seq_num, it is past seq_num, or there is no such packet.
978 *
979 * Note that the lookup is O(log n) amortized, and then the subsequent checking is O(1).
980 * This is one of the reasons to use a sorted map by seq. #. */
981 const Peer_socket::Recvd_pkt_const_iter next_packet = rcv_packets_with_gaps.lower_bound(seq_num);
982
983 if (next_packet == rcv_packets_with_gaps.end())
984 {
985 /* There is no packet after ours, and there is no packet equal to ours. Thus we'll just
986 * insert our packet at the end. Check, however, that there is no straddling (-1- above).
987 * What packet's boundary can we straddle? At least the last one (assuming there's a gap). Its
988 * last number may be >= seq_num. (Its first is guaranteed to be < seq_num based on the
989 * above check.) If we don't straddle that boundary, we can't straddle any other packet's boundary,
990 * since all other packets precede the last one, so just check the last one (if exists). */
991 if (first_gap_exists)
992 {
993 const Peer_socket::Recvd_pkt_const_iter last_packet = prior(rcv_packets_with_gaps.end());
994 Sequence_number seq_num_last_end;
995 get_seq_num_range(last_packet, 0, &seq_num_last_end);
996
997 if (seq_num_last_end > seq_num) // (Corner case check: == means it contiguously precedes `packet`; no straddle.)
998 {
999 // Yep, packet straddles boundary of last_packet.
1000
1001 // Interesting/rare enough to log a WARNING.
1002 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1003 "Received [" << packet->m_type_ostream_manip << "] with "
1004 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1005 "data size [" << data.size() << "]; "
1006 "supposed middle gap-filling packet data "
1007 "straddle the boundary of last packet [..., " << seq_num_last_end << ").");
1008
1009 // Register one packet with N bytes of data (not acceptable due to error).
1010 rcv_stats.error_data_packet(data.size());
1011 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
1012 }
1013 // else OK, we're a new packet that happens to be the newest (by sequence number).
1014
1015 FLOW_LOG_TRACE("New packet is newest packet after unreceived gap; "
1016 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1017 "first unreceived packet [" << rcv_next_seq_num << "].");
1018 }
1019 else // if (!first_gap_exists)
1020 {
1021 // OK, we're a new packet that happens to be the packet that forms the first gap by being after that gap.
1022
1023 FLOW_LOG_TRACE("New packet forms gap; sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1024 "first unreceived packet [" << rcv_next_seq_num << "].");
1025 }
1026
1027 *dupe = false;
1028 return Error_code();
1029 } // if (next_packet does not exist)
1030 // else if (next_packet exists at the same or later sequence number as seq_num)
1031
1032 // Get the [range) of sequence numbers in the packet that starts at or after seq_num.
1033 Sequence_number seq_num_next_start, seq_num_next_end;
1034 get_seq_num_range(next_packet, &seq_num_next_start, &seq_num_next_end);
1035
1036 if (seq_num_next_start == seq_num)
1037 {
1038 /* Our first datum has same sequence number as next_packet. Thus it's a duplicate.
1039 * Check, however, that their last sequence numbers are also identical. Otherwise, again,
1040 * one datum is in two different packets, which is illegal. */
1041 if (seq_num_next_end != seq_num_end)
1042 {
1043 // Yep, not a valid duplicate.
1044
1045 // Interesting/rare enough to log a WARNING.
1046 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1047 "Received [" << packet->m_type_ostream_manip << "] with "
1048 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1049 "data size [" << data.size() << "]; "
1050 "do not match supposed "
1051 "duplicate packet [" << seq_num << ", " << seq_num_next_end << ").");
1052
1053 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
1054 }
1055 // else
1056
1057 /* @todo With rexmit_on we can also/instead compare `data` against actual data payload in next_packet -- not just
1058 * the sequence numbers. With !rexmit_on, there's no need to store the payloads, as they're always fed directly
1059 * to user upon receipt, even out of order. */
1060
1061 FLOW_LOG_TRACE("Duplicate packet after unreceived data; "
1062 "sequence numbers [" << seq_num << ", " << seq_num_end << ").");
1063
1064 *dupe = true;
1065 return Error_code();
1066 } // if (seq_num_next_start == seq_num)
1067 // else if:
1068 assert(seq_num_next_start > seq_num); // lower_bound() is not horrifically broken.
1069
1070 // We've eliminated all dupe possibilities above. It's either error or not, at this point.
1071 *dupe = false;
1072
1073 /* Since next_packet starts after `packet`, the best outcome is that packet is entirely
1074 * before next_packet and entirely after prev_packet, where prev_packet == prior(next_packet) (if
1075 * such a thing exists). So we must check that we don't straddle
1076 * either next_packet's starting boundary or prev_packet's ending boundary. All other
1077 * preceding boundaries are straddled if and only if the prev_packet end is, and all
1078 * succeeding boundaries iff next_packet start is. */
1079
1080 if (seq_num_end > seq_num_next_start) // Corner case check: == means `packet` contiguously precedes next_packet.
1081 {
1082 // Straddle one or more succeding packets. RST/close as above.
1083
1084 // Interesting/rare enough to log a WARNING.
1085 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1086 "Received [" << packet->m_type_ostream_manip << "] with "
1087 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1088 "data size [" << data.size() << "]; "
1089 "supposed middle gap-filling packet data "
1090 "straddle the left boundary of packet "
1091 "[" << seq_num_next_start << ", " << seq_num_next_end << ").");
1092
1093 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
1094 }
1095 // else succeeding packets OK. Check preceding packets.
1096
1097 if (next_packet == rcv_packets_with_gaps.begin())
1098 {
1099 FLOW_LOG_TRACE("New packet partially fills first gap without sliding window; "
1100 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1101 "first unreceived packet [" << rcv_next_seq_num << "].");
1102 return Error_code(); // There are none. We're good.
1103 }
1104
1105 const Peer_socket::Recvd_pkt_const_iter prev_packet = prior(next_packet);
1106 Sequence_number seq_num_prev_start, seq_num_prev_end;
1107 get_seq_num_range(prev_packet, &seq_num_prev_start, &seq_num_prev_end);
1108
1109 if (seq_num_prev_end > seq_num) // Corner case check: == means prev_packet contiguously precedes `packet`.
1110 {
1111 // Straddling one or more preceding packets. RST/close as above.
1112
1113 // Interesting/rare enough to log a WARNING.
1114 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1115 "Received [" << packet->m_type_ostream_manip << "] with "
1116 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1117 "data size [" << data.size() << "]; "
1118 "supposed middle gap-filling packet data "
1119 "straddle the right boundary of packet "
1120 "[" << seq_num_prev_start << ", " << seq_num_prev_end << ").");
1121
1122 return error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE; // Bad behavior is fatal to connection, as above.
1123 }
1124 // else preceding packets OK.
1125
1126 FLOW_LOG_TRACE("New packet fills some middle gap; "
1127 "sequence numbers [" << seq_num << ", " << seq_num_end << "); "
1128 "first unreceived packet [" << rcv_next_seq_num << "].");
1129
1130 return Error_code();
1131} // Node::sock_categorize_data_to_established()
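
The core trick in the function above is keeping received-but-gapped packets in a map sorted by first sequence number, so one lower_bound() plus a look at the immediate neighbors decides whether a new [seq, seq_end) range is a duplicate, an illegal overlap, or a legitimate gap-filler. A self-contained sketch of just that map logic (types simplified; not Flow's Recvd_pkt_map):

// Self-contained sketch of sorted-map categorization via lower_bound(); illustrative types only.
#include <cstdint>
#include <iterator>
#include <map>

enum class Category { DUPLICATE, OVERLAP_ERROR, FITS_GAP };

// first sequence number -> one-past-last sequence number of an already-received packet.
using Gap_map = std::map<std::uint64_t, std::uint64_t>;

Category categorize(const Gap_map& pkts, std::uint64_t seq, std::uint64_t seq_end)
{
  const auto next = pkts.lower_bound(seq);            // First stored packet starting at or after seq: O(log n).
  if ((next != pkts.end()) && (next->first == seq))
  {
    // Same start: exact duplicate if the lengths match; otherwise one datum sits in two different packets.
    return (next->second == seq_end) ? Category::DUPLICATE : Category::OVERLAP_ERROR;
  }
  if ((next != pkts.end()) && (seq_end > next->first))
  {
    return Category::OVERLAP_ERROR;                   // Straddles the left edge of the following packet.
  }
  if (next != pkts.begin())
  {
    const auto prev = std::prev(next);
    if (prev->second > seq)
    {
      return Category::OVERLAP_ERROR;                 // Straddles the right edge of the preceding packet.
    }
  }
  return Category::FITS_GAP;                          // Entirely inside an unreceived gap: ACK and store.
}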
1132
1134 boost::shared_ptr<Data_packet> packet)
1135{
1136 using util::Blob;
1137
1138 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1139 * flow in that caller first. */
1140
1141 // See comment in same spot in handle_data_to_established().
1142 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1143 Blob& data = packet->m_data; // NOT const due to Socket_buffer::feed*(). See below.
1144 // Save this before we possibly destroy data's contents below (for performance).
1145 const size_t data_size = data.size();
1146
1147 size_t buf_size;
1148 {
1149 // Receive Buffer can be consumed by user threads (not W) at the same time. Must lock.
1150 Peer_socket::Lock_guard lock(sock->m_mutex);
1151
1152 /* First we must check if block will fit into sock->m_rcv_buf. Why not just use feed_buf_move()'s
1153 * max_data_size argument? Because that would allow to partially enqueue the block, if there's
1154 * space for some but not all of the block. Since we can't partially ACK a packet, we have to
1155 * drop the whole thing in that case.
1156 *
1157 * Round up to a multiple of max-block-size to ensure we never fragment a max-block-size-sized
1158 * chunk of data when they're using unreliable mode! Also apply the slack % to account for
1159 * the fact that rcv_wnd sent to the other side may lag behind reality (the key is to NOT
1160 * apply the slack % when sending rcv_wnd, so that it is more conservative). */
1161 if ((sock->m_rcv_buf.data_size() + data_size)
1162 > sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size,
1163 &sock->m_opts.m_st_rcv_buf_max_size_slack_percent))
1164 {
1165 // Receive buffer overflow.
1166
1167 // Register one packet of N bytes of acceptable data that we unfortunately have to drop due to buffer overflow.
1168 rcv_stats.good_data_dropped_buf_overflow_packet(data_size);
1169
1170 // Not an error but interesting. Might be too verbose for INFO but what the hell.
1171 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
1172 "Received [" << packet->m_type_ostream_manip << "] with "
1173 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << "); "
1174 "data size [" << data_size << "]; "
1175 "dropping because Receive buffer full.");
1176 return false;
1177 }
1178 // else can successfully write to Receive buffer (enough space for entire block).
1179
1180 /* Let's make data available to user! This is a constant-time operation that MOVES
1181 * packet.data's contents into m_rcv_buf (via swap). That's why packet is Ptr and not
1182 * Const_ptr. Note that after that we no longer work with packet -- it's a goner; data.empty()
1183 * is true.
1184 *
1185 * No need to provide max buffer size -- we already checked that's not an issue above. */
1186
1187#ifndef NDEBUG
1188 const size_t written =
1189#endif
1190 sock->m_rcv_buf.feed_buf_move(&data, std::numeric_limits<size_t>::max());
1191 // `data` is now empty.
1192 assert(written == data_size);
1193
1194 buf_size = sock->m_rcv_buf.data_size();
1195 } // lock(sock->m_mutex)
1196
1197 // Register one packet of N bytes of acceptable data that we accepted -- did not drop.
1198 rcv_stats.good_data_accepted_packet(data_size);
1199 // Register one packet of N bytes of acceptable data that we delivered to user.
1200 rcv_stats.good_data_delivered_packet(data_size);
1201 // Register that the Receive buffer grew.
1202 rcv_stats.buffer_fed(buf_size);
1203
1204 // They've sent reasonable data -- so handle the implications on rcv_wnd recovery (if any).
1205 receive_wnd_recovery_data_received(sock);
1206
1207 return true;
1208} // Node::sock_data_to_rcv_buf_unless_overflow()
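
The helper above is deliberately all-or-nothing: either the entire payload fits under the (slack-inflated) cap and is moved into the buffer, or the packet is dropped and left unacknowledged, since a packet cannot be partially ACKed. A generic sketch of that shape, with a stand-in container rather than Flow's Socket_buffer:

// Generic all-or-nothing enqueue sketch; Simple_rcv_buf is a stand-in, not Socket_buffer.
#include <cstddef>
#include <cstdint>
#include <deque>
#include <utility>
#include <vector>

class Simple_rcv_buf
{
public:
  explicit Simple_rcv_buf(std::size_t max_size) : m_max_size(max_size) {}

  // Moves *data in whole, or rejects it in whole; never a partial enqueue.
  bool feed_whole_or_reject(std::vector<std::uint8_t>* data)
  {
    if ((m_data_size + data->size()) > m_max_size)
    {
      return false;                        // Would overflow: drop the packet; caller must not ACK it.
    }
    m_data_size += data->size();
    m_blocks.push_back(std::move(*data));  // O(1) move; *data is left empty.
    return true;
  }

  std::size_t data_size() const { return m_data_size; }

private:
  std::deque<std::vector<std::uint8_t>> m_blocks;
  std::size_t m_data_size = 0;
  std::size_t m_max_size;
};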
1209
1210void Node::sock_rcv_buf_now_readable(Peer_socket::Ptr sock, bool syn_rcvd_qd_packet)
1211{
1212 /* We are told sock now has a non-empty Receive buffer and is thus Readable. Therefore we
1213 * should soon inform anyone waiting on any Event_sets for sock to become Readable.
1214 *
1215 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
1216 * code.
1217 *
1218 * Also: why do this outside the locked block that likely preceded this to actually write to the
1219 * Receive buffer? Avoid possibility of deadlock, since there
1220 * are two mutexes at play: sock->m_mutex (locked in the likely Receive buffer
1221 * update and in event_set_all_check_delta()) and Event_set::m_mutex (which is locked in
1222 * event_set_all_check_delta()). Different mutexes should always be locked in the same order,
1223 * and other threads lock in the sock->m_mutex/event_set->m_mutex order.
1224 *
1225 * Finally: if this packet was not received in ESTABLISHED but rather in SYN_RCVD and saved
1226 * until ESTABLISHED, then we skip this (syn_rcvd_qd_packet).
1227 * Why? Answer: in this case the socket has not yet been
1228 * given to the user (they need to call accept() or equivalent). Therefore, they could not have
1229 * added it to an Event_set and thus are not interested in Readable status on it. (For
1230 * background on this queueing, see handle_data_to_syn_rcvd().) */
1231
1232 // Accumulate the event into the Node store (note: not any Event_set yet) (if received during ESTABLISHED).
1233 if ((!syn_rcvd_qd_packet) &&
1234 m_sock_events[Event_set::Event_type::S_PEER_SOCKET_READABLE].insert(sock).second)
1235 {
1236 // Possibly inform the user for any applicable Event_sets right now.
1237 event_set_all_check_delta(true);
1238 /* ^-- defer_delta_check == true: because the only way to get to this method is from
1239 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
1240 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
1241 }
1242} // Node::sock_rcv_buf_now_readable()
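
The deadlock note above boils down to the standard rule: when sock->m_mutex and Event_set::m_mutex may be held together, every thread must acquire them in one agreed order (or, as here, the code is arranged so they are never held at the same time). A minimal illustration of the ordering rule with plain standard mutexes:

// Minimal illustration of consistent lock ordering; stand-ins for the socket and Event_set mutexes.
#include <mutex>

std::mutex sock_mutex;       // Stand-in for Peer_socket::m_mutex.
std::mutex event_set_mutex;  // Stand-in for Event_set::m_mutex.

void thread_a_path()
{
  std::lock_guard<std::mutex> lk1(sock_mutex);       // Always the socket mutex first...
  std::lock_guard<std::mutex> lk2(event_set_mutex);  // ...then the event-set mutex.
  // ... update socket state and event registrations ...
}

void thread_b_path()
{
  std::lock_guard<std::mutex> lk1(sock_mutex);       // Same order on every path => no lock-order inversion,
  std::lock_guard<std::mutex> lk2(event_set_mutex);  // hence no deadlock between these two mutexes.
  // ...
}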
1243
1245 boost::shared_ptr<const Data_packet> packet,
1246 size_t data_size,
1247 bool* slide, size_t* slide_size)
1248{
1249 using std::make_pair;
1250
1251 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1252 * flow in that caller first. */
1253
1254 *slide = false;
1255 *slide_size = 0;
1256
1257 // See comment in same spot in handle_data_to_established().
1258 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1259 Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1260 const Sequence_number& seq_num = packet->m_seq_num;
1261
1262 /* Since we may increase rcv_packets_with_gaps size below, we may exceed the limit as described
1263 * in m_rcv_packets_with_gaps doc header. (The limit is due to memory concerns.) Let's compute
1264 * that limit. */
1265 const size_t max_packets_after_unrecvd_packet = sock_max_packets_after_unrecvd_packet(sock);
1266
1267 /* A pre-condition is: The received packet is NOT the first (earliest) unreceived packet we're waiting
1268 * for; in other words it is not the packet at the start of the first gap. So we should save
1269 * the packet into rcv_packets_with_gaps. (This will elsewhere help us, at least, detect if this
1270 * packet comes in again [duplicate]. See sock_categorize_data_to_established().) */
1271#ifndef NDEBUG
1272 const auto insert_result =
1273#endif
1274 rcv_packets_with_gaps.insert
1275 (make_pair(seq_num,
1277 // m_rcv_reassembly_q_data_size untouched because !rexmit_on.
1278 assert(!sock->rexmit_on());
1279 assert(insert_result.second); // If was already there, there's some serious bug in above logic.
1280 // No other part of the invariant is violated, so that's it.
1281
1282 bool first_gap_exists;
1283 // The sequence number of the first datum right after the first unreceived gap.
1284 Sequence_number seq_num_after_first_gap;
1285
1286 rcv_get_first_gap_info(sock, &first_gap_exists, &seq_num_after_first_gap);
1287 assert(first_gap_exists);
1288
1289 /* We would be done here, except we need to protect against rcv_packets_with_gaps growing too
1290 * large. This is explained in detail in the m_rcv_packets_with_gaps doc comment. Long story
1291 * short: if we exceed a certain length in this structure, pretend we have "received" the entire
1292 * first gap, which will allow us to slide the window forward and eliminate all the contiguous
1293 * received packets following this gap, of which there will be at least one
1294 * (rcv_packets_with_gaps.begin()), bringing the structure's size back to the limit. */
1295
1296 if (rcv_packets_with_gaps.size() == max_packets_after_unrecvd_packet + 1)
1297 {
1298 // Use these output knobs to reduce rcv_packets_with_gaps.size() after all to avoid overflow.
1299 *slide = true;
1300 *slide_size = size_t(seq_num_after_first_gap - sock->m_rcv_next_seq_num);
1301
1302 // Register unknown # of packets with N bytes of data, which we are assuming are dropped.
1303 rcv_stats.presumed_dropped_data(data_size);
1304
1305 // Not an error but interesting. Might be too verbose for INFO but what the hell.
1306 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
1307 "Received [" << packet->m_type_ostream_manip << "] with "
1308 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << "); "
1309 "exceeded max gapped packet list size [" << max_packets_after_unrecvd_packet << "]; "
1310 "assuming Dropped; "
1311 "will fake receiving all [" << slide_size << "] sequence numbers in the first unreceived gap.");
1312 }
1313 else
1314 {
1315 // Our logic shouldn't be allowing the max to be exceeded by more than 1 at any time; we "wrist-slap" it above at 1.
1316 assert(rcv_packets_with_gaps.size() <= max_packets_after_unrecvd_packet);
1317 }
1318} // Node::sock_track_new_data_after_gap_rexmit_off()
1319
1320bool Node::sock_data_to_reassembly_q_unless_overflow(Peer_socket::Ptr sock,
1321 boost::shared_ptr<Data_packet> packet)
1322{
1323 using std::make_pair;
1324
1325 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1326 * flow in that caller first. */
1327
1328 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1329 Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1330 const Sequence_number& seq_num = packet->m_seq_num;
1331
1332 auto& data = packet->m_data; // NOT const due to the move into Received_packet; see below.
1333 // Save this before we possibly destroy data's contents below (for performance).
1334 const size_t data_size = data.size();
1335
1336 /* Since we will increase rcv_packets_with_gaps size below, we may exceed the limit as
1337 * described in m_rcv_packets_with_gaps doc header. (The limit is due to memory concerns.)
1338 * Let's compute that limit. */
1339 size_t max_packets_after_unrecvd_packet = sock_max_packets_after_unrecvd_packet(sock);
1340
1341 /* Update: Actually, that limit is (as noted in the doc header for Peer_socket::m_rcv_packets_with_gaps, whose
1342 * growth we are constraining here) more of a formality, as in practice things like sender's CWND or
1343 * sender's following our rcv-wnd guidance should keep the size of this retransmission queue much lower than
1344 * the limit that was just computed. However! There IS a retransmission-enabled-exclusive limit we should
1345 * apply here, and it may at times be applied in practice, unlike what we just computed. Namely, consider
1346 * that if we receive N in-order, fully populated (up to max-block-size) DATA packets, and NxMBS exceeds
1347 * max-on-Receive-buffer, then indeed we will drop the overflowing portion and not put into Receive buffer;
1348 * but if we don't receive 1 in-order packet, get the next (N - 1) packets, and then finally get the one
1349 * missing DATA packet, then they will all be delivered to Receive buffer without a problem. (The next in-order
1350 * packet would indeed hit overflow, unless user dequeues some. This only highlights the oddness.)
1351 * Why? Because the above-computed limit is far higher than the equivalent max-on-Receive-buffer configuration
1352 * (typically), so the reassembly queue would be loaded up with stuff without hitting any limit, and the
1353 * code that dequeues from reassembly queue into Receive buffer does not follow any overflow logic (nor can it,
1354 * really, since by that point those DATA packets have long since been ACKed, and we do not renege ACKs).
1355 * Long story short, that is not good, and we should simply apply the max-on-Receive-buffer to not just
1356 * the Receive buffer but to this reassembly queue PLUS the Receive buffer.
1357 *
1358 * Caution! This policy means the rcv-wnd advertisements to the other side must follow this policy too.
1359 *
1360 * OK, make the computation as described. First compute the max-on-Receive-buffer, same as when actually computing
1361 * that when enqueueing that structure. Then subtract how much of it we've used in actual Receive buffer.
1362 * What remains is what's allowed for rcv_packets_with_gaps:
1363 *
1364 * Rbufdata + Rqdata <= Rbufmax <==> Rqdata <= Rbufmax - Rbufdata = S.
1365 * S_blocks = floor(S / max-block-size).
1366 * Ensure Rqcurdata_blocks + 1 <= S_blocks.
1367 *
1368 * This is about right but actually slightly oversimplified, because that limit assumes the data are packed
1369 * in max-block-sized packets except possibly the last one. In reality the existing payload of the reassembly queue
1370 * may not be stored so efficiently (who knows how stuff got packetized or supplied by user or both?). To compute
1371 * this quite carefully (maybe overkill, but I feel deterministically understood to be correct = a good thing), we
1372 * model it as the queue already storing what it's storing; and we must allow a certain number of packets
1373 * on top of that and no more; and the question is whether that's enough for the incoming 1 DATA packet.
1374 * So then, we want this:
1375 *
1376 * Ensure Rqcurdata_blocks + 1 <= Rqcurdata_blocks + Sleft_blocks.
1377 * Sleft_blocks = # additional packets allowed by policy = floor(Sleft / max-block-size).
1378 * Sleft = max(Rbufmax - Rqcurdata - Rbufdata, 0).
1379 *
1380 * So we're doctoring it: we know Rqcurdata_blocks = rcv_packets_with_gaps.size() are already used; so we will
1381 * allow some # of packets beyond that, and the question is what is that # according to our policy? Well, it's just
1382 * the configured limit minus the used Receive buffer in bytes and minus the sum of rcv_packets_with_gaps's bytes.
1383 * Since we're using bytes there, that's the maximum possible accuracy, without any inefficiency being assumed to
1384 * not exist. Note that we have Rqcurdata* being subtracted from Rqcurdata* on one side, and that may seem like
1385 * those should cancel each other out to zero, but no -- that was the case in the simpler model above, but the more
1386 * realistic one means those are (slightly, potentially) different. */
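/* A quick numeric illustration of the policy above (all values invented for the example, and the
 * slack-percent adjustment ignored): say Rbufmax = 65536 bytes, Rbufdata = 40960 bytes currently in
 * the Receive buffer, Rqcurdata = 10240 bytes currently in the reassembly queue, and
 * max-block-size = 1024. Then Sleft = max(65536 - 10240 - 40960, 0) = 14336 bytes;
 * Sleft_blocks = floor(14336 / 1024) = 14; so the comparison below becomes
 * rcv_packets_with_gaps.size() + 1 <= rcv_packets_with_gaps.size() + 14 -- i.e., the incoming packet
 * is admitted as long as at least one block's worth of headroom remains under the
 * Receive-buffer-derived budget. */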
1387 size_t max_packets_in_reassembly_q
1388 = sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size,
1389 &sock->m_opts.m_st_rcv_buf_max_size_slack_percent);
1390 // We have to momentarily lock sock due to access to sock->m_rcv_buf.
1391 size_t rcv_buf_size;
1392 {
1393 Peer_socket::Lock_guard lock(sock->m_mutex);
1394 rcv_buf_size = sock->m_rcv_buf.data_size(); // This access requires locking.
1395 }
1396 util::subtract_with_floor(&max_packets_in_reassembly_q, rcv_buf_size) && // [sic]
1397 util::subtract_with_floor(&max_packets_in_reassembly_q, sock->m_rcv_reassembly_q_data_size);
1398 // Convert from bytes to max-block-sizes. Note this is the floor of the division (so it is strict).
1399 max_packets_in_reassembly_q /= sock->max_block_size();
1400 /* Okay, we have Sleft in blocks now; add this for direct comparison to the left side, which will be .size() + 1,
1401 * where the 1 is the incoming packet `packet`. Full-circle, this is `Rqcurdata_blocks + Sleft_blocks` from
1402 * the above big comment. */
1403 max_packets_in_reassembly_q += rcv_packets_with_gaps.size();
1404
1405 // The final limit is the lower of the two limits; realistically we expect max_packets_in_reassembly_q to "win."
1406 if (max_packets_in_reassembly_q < max_packets_after_unrecvd_packet)
1407 {
1408 max_packets_after_unrecvd_packet = max_packets_in_reassembly_q;
1409 }
1410 else
1411 {
1412 // Not an error but pretty weird configuration (but too verbose for INFO, if it really does occur).
1413 FLOW_LOG_TRACE("Unexpected Receive buffer limits: safety net [" << max_packets_after_unrecvd_packet << "] <= "
1414 "real limit [" << max_packets_in_reassembly_q << "], but the opposite is typical. "
1415 "See details just below."); // See next log message.
1416 }
1417
1418 if (rcv_packets_with_gaps.size() + 1 > max_packets_after_unrecvd_packet)
1419 {
1420 /* Overflow. Drop this new packet instead of queueing it. Note that this is different
1421 * from the handling of the same situation in the no-retransmit case. In that case, this
1422 * situation is probably more common under loss, since once a packet is considered Dropped by sender, it is NEVER
1423 * re-sent; thus Receiver eventually also considers it Dropped and (instead of dropping
1424 * the new packet, which would be a disastrous policy) simply pretends the gap has been
1425 * filled, thus consolidating the front of rcv_packets_with_gaps. */
1426
1427 // Register one packet of N bytes of acceptable data that we unfortunately have to drop due to overflow.
1429
1430 // This is an error, though not our fault.
1431 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
1432 "Received [" << packet->m_type_ostream_manip << "] with "
1433 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << "); "
1434 "exceeded max gapped packet list size [" << max_packets_after_unrecvd_packet << "]; "
1435 "dropping packet.");
1436 return false;
1437 }
1438 // else we can insert into reassembly queue (priority queue by seq. #) rcv_packets_with_gaps.
1439
1440 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
1441 "Enqueueing [" << packet->m_type_ostream_manip << "] payload onto reassembly queue with "
1442 "sequence numbers [" << packet->m_seq_num << ", " << (packet->m_seq_num + data_size) << ") "
1443 "of size [" << data_size << "]; "
1444 "successfully fit into max gapped packet list size [" << max_packets_after_unrecvd_packet << "]; "
1445 "could have fit [" << (max_packets_after_unrecvd_packet - rcv_packets_with_gaps.size()) << "] more.");
1446
1447 // This decimates `data` but is constant time, much like the buffer enqueueing done elsewhere.
1448#ifndef NDEBUG
1449 const auto insert_result =
1450#endif
1451 rcv_packets_with_gaps.insert
1452 (make_pair(seq_num, // Decimation occurs in here: ------------------v, hence the `&`: -------------v.
1454 sock->m_rcv_reassembly_q_data_size += data_size;
1455 assert(insert_result.second); // If was already there, there's some serious bug in above logic.
1456 // No other part of the invariant is violated, so that's it.
1457
1458 // DO NOT use `data` from this point forward -- it was just emptied by moving into the new Received_packet.
1459
1460 // Register one packet of N bytes of acceptable data that we accepted -- did not drop.
1461 rcv_stats.good_data_accepted_packet(data_size);
1462 // Register one packet of N bytes of acceptable data that we queued for reassembly -- not yet in Receive buffer.
1463 rcv_stats.good_data_first_qd_packet(data_size);
1464
1465 // They've sent reasonable data -- so handle the implications on rcv_wnd recovery (if any).
1466 receive_wnd_recovery_data_received(sock);
1467
1468 return true;
1469} // Node::sock_data_to_reassembly_q_unless_overflow()
1470
1471void Node::sock_slide_rcv_next_seq_num(Peer_socket::Ptr sock, size_t slide_size, bool reassembly_in_progress)
1472{
1473 /* Note this is a helper to handle_data_to_established() to make it more manageable. See comments and
1474 * flow in that caller first. */
1475
1476 // See comment in same spot in handle_data_to_established().
1477 Peer_socket_receive_stats_accumulator& rcv_stats = sock->m_rcv_stats;
1478 Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1479 Sequence_number& rcv_next_seq_num = sock->m_rcv_next_seq_num;
1480
1481 /* OK, caller determined that the front of the gap between rcv_next_seq_num and
1482 * seq_num_after_first_gap has been received. Indeed mark this fact by sliding the former to a higher value,
1483 * indicating sliding right of the left edge of the receive window, in TCP terminology. */
1484 rcv_next_seq_num += slide_size; // Use op+= over advance_seq_num(): slide_size is of Sequence_numbers, not bytes.
1485
1486 FLOW_LOG_TRACE("First unreceived packet pointer moved from "
1487 "[" << (rcv_next_seq_num - slide_size) << "] to "
1488 "[" << rcv_next_seq_num << "].");
1489
1490 /* Now update the receive window structure. Maintain invariant described in doc headers for
1491 * m_rcv_packets_with_gaps and related members. Additionally, IF retransmission-related
1492 * reassembly is in progress (presumably, because retransmission is enabled), and if the new packet bridged
1493 * gap to the first seq.-#-contiguous packet(s) in the reassembly queue, then add their data to Receive buffer
1494 * also. */
1495
1496 // Start of range to delete.
1497 const Peer_socket::Recvd_pkt_iter start_contig_it = rcv_packets_with_gaps.begin();
1498 // End of range to delete (just past last element to delete).
1499 Peer_socket::Recvd_pkt_iter end_contig_it;
1500 size_t total_written = 0;
1501
1502 // The following loop is O(n) worst case.
1503 for (end_contig_it = start_contig_it;
1504 /* Search until the infinite gap is found; or the first finite gap is found.
1505 * Note invariant at entry to each loop iteration: rcv_next_seq_num is seq. # just past last received
1506 * packet's data (so for contiguousness, it must equal the 1st seq. # in next packet). */
1507 (end_contig_it != rcv_packets_with_gaps.end()) && (end_contig_it->first == rcv_next_seq_num);
1508 ++end_contig_it)
1509 {
1510 Peer_socket::Received_packet& rcvd_packet = *end_contig_it->second;
1511
1512 if (reassembly_in_progress)
1513 {
1514 /* Receive Buffer can be consumed by user threads (not W) at the same time. Must lock.
1515 * @todo Probably possible to make the critical section smaller.
1516 *
1517 * Conversely, maybe it's better to lock around the entire while () loop, for potentially less
1518 * locking/unlocking while another thread is reading from buffer, which intuitively "feels" churn-y.
1519 * Arguments against: the loop may have 0 iterations, meaning the locking was a waste; also, locking
1520 * once per packet is no worse in aggregate than if we'd received these packets in order without
1521 * needing reassembly -- and that's the much more typical state of affairs; so it's not like we're
1522 * adding some unusually excessive amount of locking/unlocking by locking once per packet during
1523 * reassembly. */
1524 size_t written;
1525 size_t buf_size;
1526 {
1527 Peer_socket::Lock_guard lock(sock->m_mutex);
1528
1529 /* Reassemble! This is constant-time. Note we don't check for overflow here, but that's because we
1530 * checked for it cleverly in first enqueueing this in rcv_packets_with_gaps
1531 * (see sock_data_to_reassembly_q_unless_overflow()). */
1532 written = sock->m_rcv_buf.feed_buf_move(&rcvd_packet.m_data, std::numeric_limits<size_t>::max());
1533 // rcvd_packet.m_data is now empty.
1534 buf_size = sock->m_rcv_buf.data_size();
1535 }
1536 total_written += written;
1537
1538 // Similarly to when receiving a first-gap-filling (or just in-order, if there is no gap) DATA packet:
1539 rcv_stats.good_data_delivered_packet(written);
1540 rcv_stats.buffer_fed(buf_size);
1541
1542 assert(written != 0);
1543 }
1544
1545 advance_seq_num(&rcv_next_seq_num, rcvd_packet.m_size);
1546
1547 FLOW_LOG_TRACE("First unreceived packet pointer moved again to "
1548 "[" << rcv_next_seq_num << "]; packet subsumed by this move.");
1549 } // for (keep encountering contiguous packets)
1550
1551 // The following, according to STL requirements, is O(k + log n), where k is # erased; thus O(n) worst case.
1552 rcv_packets_with_gaps.erase(start_contig_it, end_contig_it); // Does nothing if end_contig_it == start_contig_it.
1553 sock->m_rcv_reassembly_q_data_size -= total_written;
1554} // Node::sock_slide_rcv_next_seq_num()
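/* A small hypothetical walk-through of the loop above (sequence numbers invented for illustration):
 * suppose rcv_next_seq_num has just slid to 100, and rcv_packets_with_gaps holds entries starting at
 * 100 (50 bytes), 150 (50 bytes), and 250 (50 bytes). The first two are contiguous with the window's
 * left edge, so each iteration feeds one of them into the Receive buffer (when reassembling) and
 * advances rcv_next_seq_num to 150, then 200; the entry at 250 does not equal rcv_next_seq_num = 200,
 * so the loop stops, leaving [200, 250) as the new first gap. The erase() then removes the two
 * consumed entries in one O(k + log n) call. */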
1555
1557{
1558 /* The limit itself is not an option but rather computed from other options to be
1559 * more dynamic. Let N be the desired max ratio of rcv_packets_with_gaps.size() * max-block-size
1560 * to the max Receive buffer size, expressed in percent. Then the max
1561 * rcv_packets_with_gaps.size() value is N% * <max Receive buffer size> / max-block-size / 100%.
1562 * N is the option m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent. */
1563 return uint64_t(sock->opt(sock->m_opts.m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent)) *
1564 uint64_t(sock->opt(sock->m_opts.m_st_rcv_buf_max_size)) /
1565 uint64_t(sock->max_block_size()) /
1566 100;
1567}
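/* For instance (purely illustrative option values): with
 * m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent = 200, m_st_rcv_buf_max_size = 65536, and
 * max_block_size() = 1024, the cap works out to 200 * 65536 / 1024 / 100 = 128 packets stored past
 * the first unreceived packet. */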
1568
1570 bool* first_gap_exists, Sequence_number* seq_num_after_first_gap)
1571{
1572 // If false, all received packets are followed by all unreceived ones. Otherwise there's at least 1 gap.
1573 *first_gap_exists = !sock->m_rcv_packets_with_gaps.empty();
1574 // If true, then this is the sequence number of the first datum right after that first gap.
1575 if (*first_gap_exists)
1576 {
1577 *seq_num_after_first_gap = sock->m_rcv_packets_with_gaps.begin()->first;
1578 }
1579}
1580
1581void Node::async_acknowledge_packet(Peer_socket::Ptr sock, const Sequence_number& seq_num, unsigned int rexmit_id,
1582 size_t data_size)
1583{
1584 // We are in thread W.
1585
1586 // Plenty of info logged in caller, so don't re-log.
1587 FLOW_LOG_TRACE("Accumulating for acknowledgment.");
1588
1589 // Register one packet with N bytes of data (not necessarily acceptable data).
1590 sock->m_rcv_stats.total_to_send_ack_packet(data_size);
1591
1592 const size_t acks_pending_before_this = sock->m_rcv_pending_acks.size();
1593
1594 /* Just the starting sequence number is sufficient to identify a single packet. The time point saved
1595 * here is subtracted from time_now() at ACK send time, to compute the artificial delay introduced
1596 * by ACK delaying (explained just below). This helps the other side calculate a more accurate RTT by
1597 * subtracting the ACK delay from its RTT measurement. */
1598 sock->m_rcv_pending_acks.push_back
1600 (new Peer_socket::Individual_ack{ seq_num, rexmit_id, Fine_clock::now(), data_size }));
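/* In effect the receiver-side delay recorded here lets the sender later compute, roughly:
 *   RTT sample = (time ACK arrives at sender) - (Sent_when of the acked DATA) - (ACK delay reported in the ACK).
 * That is a sketch of the idea only; the sender-side arithmetic lives in compute_rtt_on_ack(). */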
1601
1602 /* m_rcv_pending_acks now stores at least one packet to acknowledge. We can acknowledge it
1603 * immediately (modulo UDP layer availability of course). However, suppose there is a fast stream
1604 * of packets coming in, such that several DATA packets were read in within one
1605 * low_lvl_recv_and_handle() call. Then each DATA packet will result in one ACK packet.
1606 * This introduces a ton of overhead, as the header is quite large given that the payload is just
1607 * a Sequence_number. Instead we would want to pack all the DATA packets' acknowledgments into
1608 * one ACK packet (unless it overflows, in which case create more ACK packets as needed). So we
1609 * only accumulate the individual acknowledgments here; we will possibly send the actual ACK(s) in
1610 * perform_accumulated_on_recv_tasks(), which runs at the end of low_lvl_recv_and_handle() (or its
1611 * bro, the async part of async_wait_latency_then_handle_incoming()).
1612 *
1613 * Caveat: The above is rock-solid if the different DATA packets being acked were contiguous to
1614 * each other chronologically. What if there is another type of packet between some two of these
1615 * DATAs? Well, it depends on what it is. Ignoring the misbehaving/duplicate/whatever packets
1616 * (SYN, for example) -- which will just be discarded basically -- let's consider the
1617 * possibilities. If the packet is ACK, then it is irrelevant; NetFlow (like TCP) is full-duplex
1618 * (actually more so, since there's no DATA+ACK piggy-backing), therefore the micro-ordering of
1619 * traffic in opposite directions is irrelevant. If the packet is RST, then that means the socket
1620 * will get closed (no longer ESTABLISHED) before we get a chance to send any of the individual
1621 * acknowledgments. However, that is more or less OK; if the other side sent RST, then they won't
1622 * accept any ACKs we may send them anyway. The only other possibility has to do with graceful close,
1623 * but that is not yet implemented.
1624 * @todo Revisit this when graceful close is implemented. (Preliminary idea: force immediate ACK
1625 * handling when FIN/etc. detected? Or something.) */
1626
1627 if (m_socks_with_accumulated_pending_acks.insert(sock).second)
1628 {
1629 /* First acknowledgment to be accumulated in this handler (low_lvl_recv_and_handle() or
1630 * async part of async_wait_latency_then_handle_incoming()). So mark down whether at that time there were
1631 * already timer-delayed acknowledgments pending (and how many). See
1632 * sock_perform_accumulated_on_recv_tasks() for details on delayed ACKs. */
1633 sock->m_rcv_pending_acks_size_at_recv_handler_start = acks_pending_before_this;
1634 }
1635 // else already had registered pending acknowledgment in this handler.
1636} // Node::async_acknowledge_packet()
1637
1639{
1640 using boost::chrono::milliseconds;
1641 using boost::chrono::microseconds;
1642 using boost::chrono::duration_cast;
1643 using boost::chrono::round;
1644 using std::vector;
1645
1646 // We are in thread W.
1647
1648 // For background see Node::perform_accumulated_on_recv_tasks().
1649
1650 // For brevity and speed:
1651 vector<Peer_socket::Individual_ack::Ptr>& pending_acks = sock->m_rcv_pending_acks;
1652
1653 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
1654 {
1655 // For example, we got DATA and then RST on the same socket almost simultaneously.
1656 FLOW_LOG_TRACE("Was about to perform accumulated acknowledgment tasks on [" << sock << "] but skipping because "
1657 "state is now [" << sock->m_int_state << "].");
1658 return;
1659 }
1660
1661 // Check explicit pre-condition.
1662 assert(!pending_acks.empty());
1663
1664 /* Deal with any accumulated acknowledgments. Naively, we'd simply call async_low_lvl_ack_send()
1665 * here, which would take pending_acks and bundle them up into as few as possible ACK
1666 * packets and send them off.
1667 *
1668 * However, we potentially instead use delayed ACKing as in typical TCP implementations (based on
1669 * various standard RFCs). The idea is that a few DATA packets have come in around the same time,
1670 * but not close enough to be handled in one receive handler. So upon detecting the first DATA
1671 * packet in the steady state, start a timer; until it fires accumulate more packets in
1672 * pending_acks; and when it fires finally assemble and flush (send) the ACK(s). Something else may trigger
1673 * the flushing of the ACK(s) ahead of this timer or even immediately.
1674 *
1675 * These are situations where we must short-circuit the timer and send the ACK(s)
1676 * immediately:
1677 *
1678 * 1. From TCP (RFC 5681-4.2), which says that an ACK should be generated for at
1679 * least every second full-sized (data size = MSS) incoming data segment. The reasoning is
1680 * two-fold: ACKing less often than that causes bursty data sending by the peer that receives the
1681 * ACKs; and it slows down slow start in Reno (and other) congestion control. The latter is not
1682 * really a problem for us (since ACKs are not cumulative but selective and handled as such by our
1683 * congestion control logic); but the former is definitely an easily demonstrable issue.
1687 *
1688 * 2. Also from TCP (RFC 5681-3.2), which says that an ACK should be
1689 * immediately generated upon detecting an out-of-order data segment. This is to inform
1690 * congestion control of any loss event as soon as possible (Fast Recovery algorithm).
1691 *
1692 * Note that TCP RFCs don't account for the implementation detail that several packets can be
1693 * received "simultaneously" (in one handler in our case), nor for selective ACKs (in this
1694 * context), so when they say we must send an ACK for every 2 incoming segments at least, we do
1695 * not take this literally. Instead, we just say that if (here, after a full receive handler has
1696 * run) there are at least 2 full blocks' worth of pending acknowledgments (there could be many
1697 * more in theory) and/or there's an out-of-order DATA packet, then we send immediate ACK(s), thus
1698 * following the spirit of the rules in the RFC. The spirit of the rule is to short-circuit the
1699 * timer the moment at least 2 full packets can be acknowledged.
1700 *
1701 * We detect both of these situations below and act accordingly. Otherwise, we start the delayed ACK
1702 * timer, if necessary. Oh, and there's a mode to disable delayed ACKs.
1703 *
1704 * @todo We may also force immediate ACKing during graceful shutdown. Revisit when graceful
1705 * shutdown is implemented.... */
1706
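/* To make rule 1 above concrete (max-block-size value invented for the example): with
 * max_block_size() = 1024 and m_st_max_full_blocks_before_ack_send = 2 (the default noted below),
 * the `limit` computed below is 2048 bytes; so once the acknowledgments accumulated in this handler
 * cover at least two full-sized DATA packets' worth of bytes, the delayed-ACK timer is bypassed, and
 * the ACK(s) are sent immediately. */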
1707 const Fine_duration delayed_ack_timer_period = sock->opt(sock->m_opts.m_st_delayed_ack_timer_period);
1708
1709 bool force_ack = delayed_ack_timer_period == Fine_duration::zero(); // Delayed ACKs disabled.
1710
1711 if (force_ack)
1712 {
1714 ("Delayed [ACK] feature disabled on [" << sock << "]; forcing immediate [ACK]. "
1715 "Receive window state: [" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1716 "| " << sock->m_rcv_packets_with_gaps.size() << ":{...}.");
1717 }
1718 else if (!sock->m_rcv_packets_with_gaps.empty())
1719 {
1720 /* Scan to see if there was an out-of-order DATA packet. That is to say, have we received a
1721 * DATA packet -- i.e., have we queued a pending acknowledgment in this receive handler -- that
1722 * follows at least one unreceived packet in the sequence number space.
1723 *
1724 * There is a gap in the received sequence number space, so this is potentially possible. Scan
1725 * only the DATA packets (acknowledgments) accumulated in THIS handler (since previous ones
1726 * have already been checked, and unreceived gaps can't just appear out of nowhere later). If
1727 * any is past the first gap, it qualifies. (The reverse is true. If it's past any gap, it's
1728 * past the first gap.) */
1729 Peer_socket::Individual_ack::Const_ptr ack;
1730 for (size_t ack_idx = sock->m_rcv_pending_acks_size_at_recv_handler_start;
1731 ack_idx != pending_acks.size(); ++ack_idx)
1732 {
1733 ack = pending_acks[ack_idx];
1734 if (ack->m_seq_num > sock->m_rcv_next_seq_num)
1735 {
1736 force_ack = true;
1737 break;
1738 }
1739 }
1740
1741 if (force_ack)
1742 {
1744 ("On [" << sock << "] "
1745 "received out-of-order packet [" << ack->m_seq_num << ", size " << ack->m_data_size << ", "
1746 "rexmit " << ack->m_rexmit_id << "]; "
1747 "forcing immediate [ACK]. "
1748 "Receive window state: [" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1749 "| " << sock->m_rcv_packets_with_gaps.size() << ":{...}.");
1750 }
1751 }
1752 if (!force_ack)
1753 {
1754 // No out-of-order stuff. See if there are at least N * max-block-size bytes pending to be acknowledged.
1755
1756 const size_t limit // Default 2.
1757 = sock->opt(sock->m_opts.m_st_max_full_blocks_before_ack_send) * sock->max_block_size();
1758 size_t bytes = 0;
1759 for (Peer_socket::Individual_ack::Const_ptr ack : pending_acks)
1760 {
1761 bytes += ack->m_data_size;
1762 if (bytes >= limit)
1763 {
1764 force_ack = true;
1765 break;
1766 }
1767 }
1768
1769 if (force_ack)
1770 {
1771 FLOW_LOG_TRACE("On [" << sock << "] "
1772 "accumulated at least [" << limit << "] bytes to acknowledge; "
1773 "forcing immediate [ACK].");
1774 }
1775 }
1776
1777 // OK; force_ack is set finally.
1778
1779 if (force_ack)
1780 {
1781 /* Yep, must send ACK(s) now. There are two possibilities. One, a delayed ACK timer may
1782 * already be running. If so, we should cancel it and send immediately. If the cancel fails
1783 * (returns 0 tasks canceled), then it was already queued to fire very soon, so we should
1784 * just let the ACKing happen that way instead of sending immediately.
1785 *
1786 * Two, a timer is not running, so we shouldn't cancel and should just send immediately.
1787 *
1788 * How to determine if timer is currently running? If
1789 * m_rcv_pending_acks_size_at_recv_handler_start == 0, then the timer was either never scheduled
1790 * (only scheduled when pending_acks.empty()) or was triggered and handled before the current
1791 * handler; therefore it is not running. Otherwise, there were pending acks to send, yet they
1792 * were not sent by the end of the last handler, which means the timer must be running.
1793 *
1794 * (There may be some corner case I'm not imagining such that the timer was running even while
1795 * m_rcv_pending_acks_size_at_recv_handler_start == 0, but even then the worst that will happen is
1796 * that we will perform the ACKing here, not cancel that wait, and that timer will
1797 * harmlessly expire with the timer handler doing nothing.) */
1798
1799 if (sock->m_rcv_pending_acks_size_at_recv_handler_start != 0)
1800 {
1801 FLOW_LOG_TRACE("On [" << sock << "] "
1802 "canceling delayed [ACK] timer due to forcing "
1803 "immediate [ACK]; would have fired "
1804 "in [" << round<milliseconds>(sock->m_rcv_delayed_ack_timer.expires_from_now()) << "] "
1805 "from now.");
1806
1807 Error_code sys_err_code;
1808 const size_t num_canceled = sock->m_rcv_delayed_ack_timer.cancel(sys_err_code);
1809 if (sys_err_code)
1810 {
1811 FLOW_ERROR_SYS_ERROR_LOG_WARNING(); // Log the non-portable system error code/message.
1812
1813 // Pretty unlikely, but let's send RST and abort connection, since something crazy is going on.
1814
1815 // As above....
1816 rst_and_close_connection_immediately(socket_id, sock,
1818 // ^-- defer_delta_check == true: for similar reason as in handle_syn_ack_ack_to_syn_rcvd().
1819 return;
1820 }
1821 // else
1822
1823 if (num_canceled == 0)
1824 {
1825 /* Unlikely but legitimate; timer was queued to trigger very soon, so we could not
1826 * cancel it. No problem -- just let the ACKing happen per timer. Log INFO due to
1827 * rarity of this situation. */
1828 FLOW_LOG_INFO("On [" << sock << "] "
1829 "tried to cancel delayed [ACK] timer while "
1830 "forcing [ACK], but it was already just about to fire.");
1831 force_ack = false;
1832 }
1833 } // if (m_rcv_pending_acks_size_at_recv_handler_start != 0) [timer was running]
1834
1835 // If still forcing immediate ACK, finally do it.
1836 if (force_ack)
1837 {
1838 async_low_lvl_ack_send(sock, true);
1839 // ^-- defer_delta_check == true: for similar reason as in handle_syn_ack_ack_to_syn_rcvd().
1840
1841 assert(pending_acks.empty());
1842 }
1843 } // if (force_ack)
1844 else // if (!force_ack)
1845 {
1846 /* There are pending individual acks but no reason to send them off right now. The only
1847 * remaining question is whether we need to schedule the delayed ACK timer to send them
1848 * later. That depends on whether the timer is already running. If
1849 * m_rcv_pending_acks_size_at_recv_handler_start == 0, then the timer was either never scheduled
1850 * or was triggered and handled before the current handler; therefore it is not running. So
1851 * in that case we should start it, as we've just received our first ackable DATA since
1852 * we've sent off our last ACK. If m_rcv_pending_acks_size_at_recv_handler_start != 0, then the
1853 * timer must be running, because there were pending acks to send, yet they were not sent by
1854 * the end of the last handler (which would have caused this very code to schedule the
1855 * timer).
1856 *
1857 * (There may be some corner case I'm not imagining such that the timer was running even while
1858 * m_rcv_pending_acks_size_at_recv_handler_start == 0, but even then it can't possibly be set to
1859 * the right time [which is S_DELAYED_ACK_TIMER_PERIOD for now], so we need to re-set it
1860 * anyway. [Re-setting the expiry time will cancel that running timer wait. Even if that
1861 * somehow fails, the worst case is that the ACK(s) will be sent prematurely.]) */
1862
1863 if (sock->m_rcv_pending_acks_size_at_recv_handler_start == 0)
1864 {
1865 // First individual acknowledgment accumulated: start countdown to send the next batch of acknowledgments.
1866
1867 Error_code sys_err_code;
1868 sock->m_rcv_delayed_ack_timer.expires_from_now(delayed_ack_timer_period, sys_err_code);
1869 if (sys_err_code)
1870 {
1871 FLOW_ERROR_SYS_ERROR_LOG_WARNING(); // Log the non-portable system error code/message.
1872
1873 // Pretty unlikely, but let's send RST and abort connection, since something crazy is going on.
1874
1875 /* Close connection in our structures (inform user if necessary as well). Pre-conditions
1876 * assumed by call: sock in m_socks and sock->state() == S_OPEN (yes, since m_int_state ==
1877 * S_ESTABLISHED); 3rd arg contains the reason for the close (yes). This will empty the Send
1878 * and Receive buffers. That is OK, because this is the abrupt type of close (error). */
1879 rst_and_close_connection_immediately(socket_id, sock,
1881 // ^-- defer_delta_check == true: for similar reason as in handle_syn_ack_ack_to_syn_rcvd().
1882 return;
1883 }
1884 // else
1885
1886 FLOW_LOG_TRACE("On [" << sock << "] "
1887 "scheduled delayed [ACK] timer to fire "
1888 "in [" << round<milliseconds>(delayed_ack_timer_period) << "].");
1889
1890 // When triggered or canceled, call this->async_low_lvl_ack_send(sock, false, <error code>).
1891 sock->m_rcv_delayed_ack_timer.async_wait([this, socket_id, sock](const Error_code& sys_err_code)
1892 {
1893 async_low_lvl_ack_send(sock, false, sys_err_code);
1894 });
1895 // ^-- defer_delta_check == false: for similar reason as in send_worker_check_state() calling send_worker().
1896 }
1897 // else the timer is already started, so just accumulating onto pending_acks is enough. Done.
1898 } // if (!force_ack)
1899
1900 // Register the current # of DATA packets to acknowledge. Note that we're near the end of current handler.
1901 sock->m_rcv_stats.current_pending_to_ack_packets(pending_acks.size());
1902} // Node::sock_perform_accumulated_on_recv_tasks()
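/* The cancel()-returns-zero corner case handled above can be reproduced with a stand-alone
 * boost.asio sketch, independent of any Flow machinery (names and values here are purely
 * illustrative):
 *
 *   #include <boost/asio.hpp>
 *   #include <iostream>
 *
 *   int main()
 *   {
 *     boost::asio::io_context task_engine;
 *     boost::asio::steady_timer timer(task_engine, boost::asio::chrono::milliseconds(1));
 *     timer.async_wait([](const boost::system::error_code& err)
 *     {
 *       // operation_aborted if the wait was canceled in time; success if the timer fired.
 *       std::cout << (err ? "canceled" : "fired") << "\n";
 *     });
 *     // If the timer has already expired and its handler is queued, cancel() returns 0 and the
 *     // handler still runs with a success code -- the "already just about to fire" case logged
 *     // at INFO above.
 *     const auto num_canceled = timer.cancel();
 *     std::cout << "canceled [" << num_canceled << "] pending wait(s)\n";
 *     task_engine.run();
 *   }
 */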
1903
1904void Node::log_rcv_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging) const
1905{
1906 using std::vector;
1907 using std::string;
1908 using boost::algorithm::join;
1909
1910 // We're in thread W.
1911
1912 // For brevity and a little speed:
1913 const Peer_socket::Recvd_pkt_map& rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps;
1914
1915 // force_verbose_info_logging => log the most detail, as INFO (if INFO logging enabled).
1916
1917 auto const logger_ptr = get_logger();
1918 if (((!logger_ptr) || (!logger_ptr->should_log(log::Sev::S_DATA, get_log_component()))) &&
1919 (!(force_verbose_info_logging && logger_ptr && logger_ptr->should_log(log::Sev::S_INFO, get_log_component()))))
1920 {
1921 // Can't print entire receive-window data structure, but can print a summary, if TRACE enabled.
1923 ("Receive window state for [" << sock << "]: "
1924 "[" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1925 "| " << rcv_packets_with_gaps.size() << ":{...}.");
1926 return;
1927 }
1928 // else
1929
1930 /* Construct full printout of the packets we've received past the first unreceived gap.
1931 *
1932 * Very verbose and slow! Even so, if it gets beyond a certain size it's absurd, so skip some in
1933 * that case even though DATA logging is sanctioned. (That amount of data cannot really be useful
1934 * in any case.) */
1935
1936 vector<string> pkt_strs;
1937 pkt_strs.reserve(rcv_packets_with_gaps.size());
1938
1939 const size_t MAX_TO_SHOW = 100;
1940 bool skipped_some = false;
1941 size_t count = 0;
1942
1943 for (Peer_socket::Recvd_pkt_const_iter pkt_it = rcv_packets_with_gaps.begin();
1944 pkt_it != rcv_packets_with_gaps.end();
1945 ++pkt_it)
1946 {
1947 const bool last_iteration = (count == rcv_packets_with_gaps.size() - 1);
1948
1949 if ((!skipped_some) && (count > MAX_TO_SHOW) && (!last_iteration))
1950 {
1951 // First packet past the limit we can print. Start skipping mode.
1952 skipped_some = true;
1953 ++count;
1954 continue;
1955 }
1956 // else either we are in skipping mode from before, or we are not in skipping mode.
1957
1958 string pkt_str;
1959
1960 if (skipped_some)
1961 {
1962 // We are in skipping mode from before.
1963 if (!last_iteration)
1964 {
1965 // Since it's not the last iteration, skip: print nothing.
1966 ++count;
1967 continue;
1968 }
1969 // else we are in skipping mode from before, and this is the last iteration. Print the placeholder.
1970 pkt_str = "[...skipped...] ";
1971 }
1972 // Either we are not in skipping mode (just print the thing) or we are and it's last iteration (also print it).
1973
1974 Sequence_number start, end;
1975 get_seq_num_range(pkt_it, &start, &end);
1976
1977 util::ostream_op_to_string(&pkt_str, '[', start, ", ", end, ')');
1978 pkt_strs.push_back(pkt_str);
1979
1980 ++count;
1981 } // for (packets in rcv_packets_with_gaps)
1982
1984 (force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_DATA,
1985 "Receive window state for [" << sock << "]: "
1986 "[" << sock->m_rcv_init_seq_num << ", " << sock->m_rcv_next_seq_num << ") "
1987 "| " << rcv_packets_with_gaps.size() << ":{" << join(pkt_strs, " ") << "}.");
1988} // Node::log_rcv_window()
1989
1990void Node::handle_ack_to_established(Peer_socket::Ptr sock,
1991 boost::shared_ptr<const Ack_packet> ack)
1992{
1993 // We are in thread W.
1994
1995 /* `ack` is an ACK packet, so its payload consists of at least m_rcv_wnd (the current advertised Receive
1996 * buffer space on the receiver) and ack->m_rcv_acked_packets, which is basically a list of ZERO or
1997 * more sequence numbers, each of which represents a packet we'd (hopefully) sent that the
1998 * receiver has received. Naively we'd just handle the window update and each individual ack here
1999 * in a loop, then inform congestion control, etc. etc. However there is an optimization to make.
2000 * Suppose in the calling low_lvl_recv_and_handle() or async-part-of-async_wait_latency_then_handle_incoming()
2001 * there are several more ACKs for this socket sock that will be received. This may well happen in
2002 * high traffic; for instance the sender may have had too many individual acks for one ACK and
2003 * thus sent several; or maybe the UDP net-stack had a few packets ready by the time boost.asio was
2004 * free in thread W. In this case, it is better to collect all the individual acks in these
2005 * several ACKs, and then handle them all at the same time. Why? Answer: it will update our
2006 * sender state (what's ACKed, what's dropped) entirely in one go instead of doing it in two or
2007 * more steps. Because congestion control activities ("on drop event," "on acknowledgment") are
2008 * performed after handling all the available acks, it gives a truer, simpler picture to the
2009 * congestion control module, when compared to giving it one picture and then almost instantly
2010 * giving it another. Another way to think of it is simply that since the different ACKs arrived
2011 * at the same time, and all an ACK is is a collection of individual acks that could fit into the
2012 * ACK packet, then conceptually this is no different from being one super-ACK with all the
2013 * individual acks contained in it. Therefore it is at least not worse.
2014 *
2015 * (In addition, m_rcv_wnd also affects the decision on whether to send more data over the wire,
2016 * as can_send() is part of that same algorithm.)
2017 *
2018 * Caveat: The above is rock-solid if the different ACKs being combined were contiguous to each
2019 * other chronologically. What if there is another type of packet between some two of these ACKs?
2020 * Well, it depends on what it is. Ignoring the misbehaving/duplicate/whatever packets (SYN, for
2021 * example) -- which will just be discarded basically -- let's consider the possibilities. If
2022 * the packet is DATA, then it is irrelevant; NetFlow (like TCP) is full-duplex (actually more so,
2023 * since there's no DATA+ACK piggy-backing), therefore the micro-ordering of traffic in opposite
2024 * directions is irrelevant. If the packet is RST, then that means the socket will get closed (no
2025 * longer ESTABLISHED) before we get a chance to process any of the individual acknowledgments.
2026 * However, that is more or less OK; if the other side sent RST, then they won't accept any
2027 * further data we may send after processing the acknowledgments anyway. The only other
2028 * possibility has to do with graceful close, but that is not yet implemented.
2029 * @todo Revisit this when graceful close is implemented. (Preliminary idea: accumulate DATA and
2030 * FIN/etc. packets and always handle them after handling ACKs. Then the DATA/FIN stream will not
2031 * have a chance to disrupt (by initiating closing the connection) the ACK handling, while the ACK
2032 * handling should have no bearing on the DATA/FIN stream.)
2033 *
2034 * So, let's accumulate the individual acks in ack->m_rcv_acked_packets into a big
2035 * sock->m_rcv_acked_packets to be handled from perform_accumulated_on_recv_tasks() at the end of the
2036 * current handler. Similarly save m_rcv_wnd into sock->m_snd_pending_rcv_wnd. To let that method
2037 * know sock has a new m_pending_rcv_wnd and possibly non-empty sock->m_rcv_acked_packets, insert sock
2038 * into m_socks_with_accumulated_acks. */
2039
2040 /* Note: We're not setting the actual sock->m_snd_remote_rcv_wnd until
2041 * perform_accumulated_on_recv_tasks().
2042 *
2043 * Also note: the latest ACK to arrive in this receive handler will contain the most up-to-date
2044 * rcv_wnd value (previous ones are overwritten by this). */
2045 sock->m_snd_pending_rcv_wnd = ack->m_rcv_wnd;
2046
2047 // It's a (ref-counted) pointer copy. Note there may be 0 elements there, if it's just an m_rcv_wnd update alone.
2048 sock->m_rcv_acked_packets.insert(sock->m_rcv_acked_packets.end(), // Append.
2049 ack->m_rcv_acked_packets.begin(), ack->m_rcv_acked_packets.end());
2050 m_socks_with_accumulated_acks.insert(sock); // May already be in there.
2051
2052 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
2053 "Received and accumulated [" << ack->m_type_ostream_manip << "] with "
2054 "[" << ack->m_rcv_acked_packets.size() << "] individual acknowledgments "
2055 "and rcv_wnd = [" << ack->m_rcv_wnd << "]; total for this socket in this "
2056 "receive handler is [" << sock->m_rcv_acked_packets.size() << "] individual acknowledgments.");
2057
2058 sock->m_snd_stats.received_low_lvl_ack_packet(ack->m_rcv_acked_packets.empty());
2059} // Node::handle_ack_to_established()
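/* Concretely (packet counts invented for illustration): if one receive handler pulls in three ACKs
 * for `sock` carrying 5, 0, and 7 individual acknowledgments respectively, then after the third call
 * to this method sock->m_rcv_acked_packets holds 12 entries, sock->m_snd_pending_rcv_wnd holds the
 * rcv_wnd from the third (latest) ACK, and sock appears exactly once in m_socks_with_accumulated_acks;
 * the actual processing then happens once, in perform_accumulated_on_recv_tasks(). */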
2060
2062{
2063 using std::min;
2064 using std::vector;
2065 using boost::tuple;
2066 using boost::unordered_set;
2067 using boost::chrono::round;
2068 using boost::chrono::milliseconds;
2069 using boost::chrono::seconds;
2070
2071 /* This is a complex method that does many things. Therefore readability is hard to accomplish, as the logic
2072 * makes sense when writing it, but the big picture is hard to see when reading it. The necessary heavy commenting
2073 * further increases the size and therefore (along that dimension) decreases readability. For these reasons,
2074 * many logically distinct parts were placed into helper methods -- not to increase code reuse but to help
2075 * the aforementioned consideration. */
2076
2077 // We are in thread W.
2078
2079 log_accumulated_acks(sock);
2080 // Below TRACE messages omit most of the just-logged detail, since it's already logged now.
2081
2082 // For brevity and a little speed:
2083 using Acks = vector<Ack_packet::Individual_ack::Ptr>;
2084 Acks& acked_packets = sock->m_rcv_acked_packets;
2085 /* To not put already-handled acknowledgments up for handling again in the next run of this method
2086 * (which would be wrong), we must clear acked_packets before exiting this method. To be safe,
2087 * make sure acked_packets.clear() runs no matter how this method exits. */
2088 util::Auto_cleanup cleanup = util::setup_auto_cleanup([&]() { acked_packets.clear(); });
2089
2090 /* Handle all the acknowledgments we've received in this receive handler. Background on the
2091 * accumulation tactic is in handle_ack_to_established(). As explained in that method, some
2092 * packet between the first and last ACK received in this handler may have changed state away from
2093 * ESTABLISHED. For example, there could have been an RST. Check for that. */
2094 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
2095 {
2096 // Rare/interesting enough for INFO.
2097 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2098 "Accumulated [ACK] packets with [" << acked_packets.size() << "] "
2099 "individual acknowledgments, but state is now [" << sock->m_int_state << "]; ignoring ACKs forever.");
2100 return;
2101 }
2102 // else OK. Handle the accumulated acknowledgments.
2103 assert(sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED);
2104
2105 /* The individual acknowledgments are (sequence number, ACK delay in unit X, retransmission ID)
2106 * triples, where the latter is always zero unless retransmission is enabled. Let's handle each
2107 * one by updating m_snd_flying_pkts* (i.e., removing that packet from m_snd_flying_pkts*) and
2108 * informing congestion control. Before continuing reading the method please look at the large
2109 * comments for Peer_socket::m_snd_flying_pkts_by_{sent_when|seq_num} (drawing a diagram might also help).
2110 *
2111 * Before continuing, quick discussion of corner cases:
2112 *
2113 * Any two given such triples may have equal sequence number/retransmission ID entries. This
2114 * means that during the last ACK delay timer or boost.asio handler, while accumulating the
2115 * acknowledgments for this ACK packet, the receiver received the same packet twice (duplicate).
2116 * (This can happen due to network misbehavior; and due to ACK loss and other conditions when
2117 * retransmits are enabled.) Call 2 such packets P1 and P2, where P1 was received first and thus
2118 * appears earlier in acked_packets. How do we handle this?
2119 *
2120 * Suppose instead of being in the same ACK, P1 and P2 were in different ACKs that arrived in that
2121 * order, P1 and P2 (something that certainly could happen depending on how the delayed ACK timer
2122 * works out). That situation is basically the same (except that if they're in one ACK there's
2123 * the added guarantee that we KNOW what P1 is acknowledging arrived to the receiver before
2124 * what P2 was acknowledging did, which is even more solid knowledge). Therefore, it makes sense
2125 * to simply handle each acknowledgment in the ACK in the order they're listed in acked_packets.
2126 * The 2nd, 3rd, etc. occurrence will thus be treated the same way as if it arrived in a later
2127 * ACK. */
2128
2129 /* Congestion control: First see introduction to this topic in class Congestion_control_strategy
2130 * doc header. Then resume here.
2131 *
2132 * Since information stored in ACKs is of paramount importance to how congestion control views the
2133 * pipe, congestion control is very relevant in this method: this method is the main (but not
2134 * only) source of events for m_snd_cong_ctl.
2135 *
2136 * These ACK-based events are of interest to m_snd_cong_ctl:
2137 *
2138 * - on_acks(N, M): N bytes in M packets have just been converted from In-flight to
2139 * Acknowledged. Note that these packets have NOT been previously Acknowledged or considered
2140 * Dropped (they are In-flight just before the ACK).
2141 * - This should also be immediately preceded with M on_individual_ack(N', T, CWND) events, where N'
2142 * is the # of bytes in the individual acked packet; and T is the RTT of the packet, and CWND is the
2143 * # of bytes in cwnd that was used when the acked data pkt was sent.
2144 * In the rest of the discussion I omit this event, as it can be thought of as part of
2145 * on_acks() for purposes of the discussion.
2146 * - on_loss_event(N', M'): N' bytes in M' packets have just been converted from In-flight to
2147 * Dropped.
2148 *
2149 * The basic idea is to determine which of these events are implied by the acks passed to this
2150 * method, inform m_snd_cong_ctl, and then check if the new m_snd_cong_ctl->congestion_window_bytes()
2151 * value (a/k/a CWND) -- if it has changed -- allows us to now send more bytes (if we have any).
2152 *
2153 * An important decision (and one sadly not very explicitly exposed [perhaps as an exercise to the
2154 * reader, or to avoid being too rigid] in the various TCP RFCs) is how to group these events and
2155 * in what order. In other words, do we call on_acks(N, 1) for each acknowledged packet? Do we
2156 * then check for drops and call on_loss_event(N', M') immediately, or wait to process all acked
2157 * packets first?
2158 *
2159 * The answer we choose is simple. First, scan all individual (i.e., for each sent packet) acks
2160 * given to us and update m_snd_flying_pkts_by_seq_num (the "scoreboard"). While doing so keep track of
2161 * the cumulative N and M. Having done that, we will also expose zero or more In-flight packets
2162 * as Dropped. (In this method, a given packet is exposed as Dropped if the total number of
2163 * acknowledged packets AFTER that packet exceeds a constant like 2. So basically if the acks we
2164 * process here make that counter exceed that limit for a given packet P, P is Dropped and removed
2165 * from m_snd_flying_pkts_by_seq_num.) So after the ack scanning phase, tally up all packets now
2166 * considered Dropped, which gives us N' and M'.
2167 *
2168 * Finally, call on_loss_event(N', M') (assuming N' and M' are not zero). And then call
2169 * on_acks(N, M) (assuming N and M are not zero). */
2170 *
2171 * Let's justify this. First, assume it's correct to tally these things up and then just
2172 * call each method once. Is the "report loss, report acks" order right? Yes. Intuitively,
2173 * m_snd_cong_ctl wants to know about events in the chronological order they occur. While the Drop(s)
2174 * are detected at the same time as the Ack(s), the actual packet dropping INFERRED from the
2175 * Ack(s) occurred in the past; we're only deducing it now. The received Acks are in fact for
2176 * packets AFTER the now-Dropped packets. Hence this is the right order.
2177 *
2178 * Now the only remaining thing is to justify combining the ack and drop events in one (each). For
2179 * acknowledgments, it's straightforward: so far, most Congestion_control_strategy modules
2180 * don't need to know about each individual ack, so for simplicity/efficiency we can just combine
2181 * them. (However, some algorithms do need it; e.g., FAST would need it; still, many don't.
2182 * Other modules, like Send_bandwidth_estimator, may also care about individual acks.)
2183 *
2184 * What about the drop events? Why combine all the drops into one? Should we include all the
2185 * drops into the one? To answer, I use as a reference DCCP CCID 2 RFC 4341 (standards track)
2186 * which describes a protocol similar to ours and implies the following model. Basically, over
2187 * time, the pipe experiences a series of 0 or more congestion events (more accurately loss
2188 * events). Two loss events cannot overlap in this implied model. Thus any given Dropped packet
2189 * belongs to exactly one loss event. Here is how the RFC (section 5) more or less formally
2190 * defines whether 2 packets belong to one event: "As in TCP, two losses [...] are considered part
2191 * of a single congestion event when the second packet was sent before the loss [...] of the first
2192 * packet was detected." Presumably the text also assumes that the "second" packet was
2193 * found to be dropped either at the same or later time as the "first" packet was found to be
2194 * dropped (otherwise the text makes no sense, as the very earliest Dropped packet would be in the
2195 * same congestion event as the very last Dropped packet in a very long session). Let's build an
2196 * algorithm inductively based on this definition.
2197 *
2198 * At first there are no loss events. We get a group of acks which render another group of
2199 * packets P1, P2, ... (in order of increasing sequence number) Dropped. Certainly P1 is in a
2200 * loss event; call it L1. P2 was found to be dropped at the same or later time as P1; and it was
2201 * obviously sent before L1 was detected (which was NOW; call it T1). So P2 is in loss event L1.
2202 * Similarly, so is P3, P4, .... Now let's say some time passes and we get more acks and thus
2203 * dropped packets P7, P8, P9, .... Suppose P7 was sent before T1 (but found Dropped at T2 > T1),
2204 * which is quite possible (e.g., T2 could be just after T1). Then by the above definition P7 is
2205 * in loss event L1 (no new loss event). P8 could be in the same situation. In fact, all Dropped
2206 * packets from this ack group may be in L1. Suppose, conversely, that P9 was sent AFTER T1. By
2207 * the above definition, it is part of a new loss event L2, detected at T2. Now P10 is certainly
2208 * in L2 as well, since it was sent before T2, obviously. Thus we can, for each Dropped packet P,
2209 * determine whether it's part of the preceding loss event or part of a new one.
2210 *
2211 * Intuitively, it makes sense as well. If, say, we got 5 dropped packets at the same time, and
2212 * informed Congestion_control_classic (Reno) with 5 calls to on_loss_event(), then CWND would get
2213 * halved 5 times! Intuitively that's not right (and way too conservative). More likely the 5
2214 * packets belong to the same congestion or loss event, so CWND should only be halved once. Then
2215 * the only question is how to group packets into separate loss events. The above algorithm,
2216 * roughly speaking, considers two packets as part of the same loss event if they're within an RTT
2217 * of each other (indeed RFC 4341 says one can use the SRTT to approximate the above algorithm,
2218 * although we choose to use the exact definition instead).
2219 *
2220 * Therefore the final algorithm is justified and is as follows:
2221 *
2222 * 0. Before the current method is ever called, set time stamp m_snd_last_loss_event_when =
2223 * -infinity.
2224 * 1. Scan all acknowledgments, updating m_snd_flying_pkts* and m_snd_flying_bytes.
2225 * Keep track of total acknowledgment stats (bytes and packets). (Inform side modules like
2226 * Send_bandwidth_estimator with any required individual ack info like RTTs.)
2227 * Ignore acks of packets not in m_snd_flying_pkts* (not In-flight).
2228 * 2. Tally up which packets are exposed as Dropped by the above m_snd_flying_pkts* updates.
2229 * Keep track of total loss stats (bytes and packets). However, when doing the latter ignore
2230 * any packet P for which P.m_sent_when < m_snd_last_loss_event_when.
2231 * 3. If at least 1 packet exposed as Dropped in step 2, call
2232 * m_snd_cong_ctl->on_loss_event(...stats...); and set m_snd_last_loss_event_when to the current time,
2233 * marking this the start of a new loss event.
2234 * 4. If at least 1 packet exposed as Acknowledged in step 1, call
2235 * m_snd_cong_ctl->on_acks(...stats...). */
2236
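/* A small hypothetical sketch of the loss-event grouping in step 2 (times and packet names invented
 * for illustration): say a loss event was declared at time T1, so m_snd_last_loss_event_when == T1.
 * Acks arriving now (time T2 > T1) expose P7, P8, P9 as Dropped. If P7 and P8 were sent before T1,
 * they are treated as part of the existing loss event and excluded from the N'/M' passed to
 * on_loss_event(); if P9 was sent after T1, it begins (and is counted toward) a new loss event, and
 * m_snd_last_loss_event_when becomes the current time. The net effect: one burst of related drops
 * halves CWND once under Reno-style congestion control, not once per dropped packet. */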
2237 // Set up work state and save certain "before" values.
2238
2239 // For RTT at least. Use steady, high-res clock. Use one coherent value for entire method to simulate simultaneity.
2240 const Fine_time_pt time_now = Fine_clock::now();
2241
2242 // For brevity and a little speed:
2243 const bool rexmit_on = sock->rexmit_on();
2244 auto& snd_stats = sock->m_snd_stats;
2245 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
2246 /* These guys are only stored in Peer_socket (instead of creating locally here) for a bit of performance. Reuse now.
2247 * Note that clear() is a very fast operation; it will essentially just set the internal element count to 0. */
2248 auto& pkts_marked_to_drop = sock->m_snd_temp_pkts_marked_to_drop;
2249 pkts_marked_to_drop.clear();
2250
2251 // To check, at the end, whether we've changed can_send() false => true.
2252 const bool could_send_before_acks = can_send(sock);
2253 // To check, at the end, whether we've changed snd_deqable() false => true.
2254 const bool had_rexmit_data_before_acks = !sock->m_snd_rexmit_q.empty();
2255
2256 /* Includes each order number (unique packet ID) for which the packet was acknowledged.
2257 * Used for Drop_timer events to register at the bottom; and also to feed the priority queue high_ack_count_q
2258 * (explained below in detail). */
2259 unordered_set<Peer_socket::order_num_t> flying_now_acked_pkts;
2260
2261 // These are the N, M arguments to on_acks() described just above in big comment.
2262 size_t clean_acked_bytes = 0;
2263 size_t clean_acked_packets = 0;
2264
2265 /* These are the individual T, N' (RTT, acked_bytes, sent_cwnd_bytes) arguments to pass to on_individual_ack() described
2266 * just above in big comment. We will be accumulating these across all the acks in the loop below. */
2267 using Clean_acked_packet = tuple<Fine_duration, size_t, size_t>;
2268 vector<Clean_acked_packet> clean_acked_packet_events;
2269 clean_acked_packet_events.reserve(min(acked_packets.size(), snd_flying_pkts_by_when.size())); // Small optimization.
2270
2271 /* Handle each acknowledgment in the order that the corresponding packet was received by other
2272 * side (earliest to latest) per above discussion. */
2273 for (const Ack_packet::Individual_ack::Const_ptr ack : acked_packets)
2274 {
2275 /* Use helper to classify this individual ack as one of the following:
2276 * - Malformed/illegal. => error_ack is true. Else:
2277 * - Legal but referring to an already-acknowledged packet, or arriving too late. => dupe_or_late is true.
2278 * - The packet being acknowledged is unknown. => flying_pkt_it == past_oldest() (a/k/a end()).
2279 * - The packet being acknowledged is known. => flying_pkt_it points to that acked packet.
2280 * - Legal and acking a not-yet-acked packet, arriving in time. => dupe_or_late is false.
2281 * => flying_pkt_it points to that acked packet.
2282 * Note: The helper takes care of snd_stats updating, closing socket on error, and relevant logging. */
2283
2284 Peer_socket::Sent_pkt_ordered_by_when_iter flying_pkt_it;
2285 bool dupe_or_late;
2286
2287 const bool error_ack = !categorize_individual_ack(socket_id, sock, ack, &dupe_or_late, &flying_pkt_it);
2288 if (error_ack)
2289 {
2290 return; // Fatal error for entire socket (malformed ack, etc.). Socket is closed; all logged; bail out now.
2291 }
2292 // else
2293
2294 // Note these may never be initialized.
2295 Fine_duration round_trip_time;
2296 Peer_socket::Sent_packet::Const_ptr flying_pkt;
2297 const Peer_socket::Sent_packet::Sent_when* sent_when;
2298
2299 // Compute RTT, assuming we ID'ed the original DATA. (RTT logged even if we still throw away the ack just below.)
2300 if (flying_pkt_it != snd_flying_pkts_by_when.past_oldest())
2301 {
2302 // Use helper to compute RTT and, as a side effect, get `Sent_when* sent_when` set to point to appropriate structure.
2303 flying_pkt = flying_pkt_it->second;
2304 round_trip_time = compute_rtt_on_ack(flying_pkt, time_now, ack, &sent_when); // It logs details.
2305 } // Done computing (if possible) RTT and logging it.
2306
2307 if (dupe_or_late)
2308 {
2309 continue; // Do NOT return! There may well be valid individual acks after it. All logged; move on to the next one.
2310 }
2311
2312 // else it's an in-time acking of DATA packet that has not yet been acked (is considered In-flight)!
2313 assert(!dupe_or_late);
2314 // The following is guaranteed by helper above, since !dupe_or_late. Hence, also, flying_pkt, sent_when, RTT set.
2315 assert(flying_pkt_it != snd_flying_pkts_by_when.past_oldest());
2316 assert(flying_pkt);
2317
2318 // Update SRTT, etc.
2319 new_round_trip_time_sample(sock, round_trip_time);
2320
2321 /* Similarly, inform congestion control (see big comment at top of method). Some strategies
2322 * use individual acks to gauge the pipe's properties. Save the info to
2323 * later call on_individual_ack(). Why not just call
2324 * it here? Answer: Congestion_control_strategy interface specifies that
2325 * on_individual_ack() must be called AFTER on_loss_event() (which can only be called once
2326 * we've fully updated snd_flying_pkts, thus handled all acks). It also specifies that
2327 * snd_flying_pkts must be updated to reflect the handled ack. So we have no choice but
2328 * to save it. (@todo Performance?) */
2329 const size_t bytes_acked = flying_pkt->m_size;
2330 const size_t cwnd_bytes = sent_when->m_sent_cwnd_bytes;
2331 clean_acked_packet_events.emplace_back(round_trip_time, bytes_acked, cwnd_bytes);
2332
2333 // Maintain invariant. Packet acknowledged, so remove from In-flight packet list and related structures.
2334 snd_flying_pkts_erase_one(sock, flying_pkt_it);
2335
2336 // Bona fide In-flight->Acknowledged data; accumulate to inform congestion control below.
2337 clean_acked_bytes += bytes_acked;
2338 ++clean_acked_packets;
2339
2340 /* If we got here, then it is in fact what we consider a valid acknowledgment of packet
2341 * sent at time sent_when. Therefore, we should increment m_acks_after_me for any packet that has NOT
2342 * been acknowledged that was sent earlier than sent_when. (Later we'd consider Dropped any
2343 * packets for which this value is too high, as in TCP Fast Recovery/Retransmit.) Note that if
2344 * retransmission is off, that's the same as all packets with a lower first sequence number.
2345 * However if retransmission is on, then a packet may have a lower sequence number but be sent
2346 * later. Thus we use sent_when and not seq_num.
2347 *
2348 * Naively, we could just have a for () loop here to increment all such data members. However
2349 * that's inefficient -- order O(k * n), where k = acked_packets.size() and n =
2350 * snd_flying_pkts*.size(), in the worst case. Moreover, some of the Sent_packet structs in
2351 * which we increment m_acks_after_me may be acknowledged and thus erased from snd_flying_pkts*
2352 * in subsequent iterations of the for () loop we are in, wasting that work.
2353 *
2354 * So instead we count the individual acks in a hash map that maps sent_when to the number of
2355 * times (in this ACK) that sequence number's packet was validly acknowledged. This is O(k)
2356 * amortized total. Then elsewhere we use that hash map to more efficiently update m_acks_after_me
2357 * where appropriate. In addition, this hash map is used to register certain Drop_timer events
2358 * at the end of the method. */
2359
2360 /* Note that we track these by "order number"; each sent packet (no matter if retransmitted or
2361 * not) gets a unique order number, higher than all previous. Since no two packets will have
2362 * the same order number, we keep a set of order numbers. */
2363 flying_now_acked_pkts.insert(sent_when->m_order_num);
2364 } // for (all acked_packets)
2365
2366 /* snd_flying_pkts* is updated w/r/t removing the In-flight-now-acked packets. Now, realize that
2367 * for a given packet P still In-flight, if packets sent AFTER it have just become acked, intuitively
2368 * it raises the probability P has been lost and should be considered Dropped. In fact, as explained in
2369 * helper categorize_pkts_as_dropped_on_acks(), if one finds the latest-sent such packet P, then all
2370 * packets sent before it should be dropped as well. So, let's find this latest-sent P: */
2371 const Peer_socket::Sent_pkt_ordered_by_when_iter last_dropped_pkt_it
2372 = categorize_pkts_as_dropped_on_acks(sock, flying_now_acked_pkts);
2373
2374 /* OK, so P and all In-flight packets sent before it must be dropped. This helper takes all the actions
2375 * necessary (or at least records data we use to take such actions below) w/r/t all those packets.
2376 * Namely: erases them from snd_flying_pkts*; accumulates packet and bytes counts to do with these
2377 * dropped packets; saves the packet IDs for Drop timer purposes into pkts_marked_to_drop. */
2378 size_t dropped_pkts;
2379 size_t dropped_bytes;
2380 size_t cong_ctl_dropped_bytes;
2381 size_t cong_ctl_dropped_pkts;
2382 if (!drop_pkts_on_acks(sock, last_dropped_pkt_it,
2383 &cong_ctl_dropped_pkts, &cong_ctl_dropped_bytes,
2384 &dropped_pkts, &dropped_bytes, &pkts_marked_to_drop))
2385 {
2386 return; // Already closed/logged/etc. (too many retransmissions probably).
2387 }
2388
2389 // As long promised since the top of this method, let congestion control (and B/W estimator) know what happened!
2390
2391 /* Bandwidth estimation: It can be useful to estimate the available outgoing bandwidth (available
2392 * meaning the total bandwidth of the empty pipe minus any other traffic other than this
2393 * connection [NetFlow or otherwise] currently occupying this pipe). Mostly it's useful for certain
2394 * congestion control strategies like Congestion_control_classic_with_bandwidth_est, but it may be
2395 * good information to have if only for the user's general information. Therefore we keep an
2396 * independent m_snd_bandwidth_estimator regardless of the congestion control strategy in use.
2397 * Like Congestion_control_strategy, it updates its state based on events. It currently cares
2398 * about at least one event: on_acks(N), where N is the number of bytes acknowledged. This is
2399 * very similar to the on_acks(N, M) event for congestion control (see above). None of the other
2400 * aspects of the above discussion (such as loss events) apply to m_snd_bandwidth_estimator. */
2401
2402 // Note that the order is as required by Congestion_control_strategy() API: loss, individual acks, consolidated acks.
2403
2404 // Report loss event info to congestion control.
2405 if (dropped_pkts != 0)
2406 {
2407 // @todo Might be too verbose to keep it as INFO!
2408 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2409 "Considering Dropped: [" << dropped_bytes << "] bytes = [" << dropped_pkts << "] packets.");
2410
2411 if (cong_ctl_dropped_pkts != 0) // Again, cong_ctl_dropped_pkts != dropped_pkts, potentially.
2412 {
2413 // New loss event!
2414 assert(cong_ctl_dropped_bytes != 0); // Empty blocks not allowed (should have been eliminated by now).
2415
2416 FLOW_LOG_INFO("cong_ctl [" << sock << "] update: loss event: "
2417 "Dropped [" << cong_ctl_dropped_bytes << "] bytes "
2418 "= [" << cong_ctl_dropped_pkts << "] packets.");
2419
2420 sock->m_snd_cong_ctl->on_loss_event(cong_ctl_dropped_bytes, cong_ctl_dropped_pkts);
2421 sock->m_snd_last_loss_event_when = Fine_clock::now();
2422
2423 // As a silver lining, we probably got some nice new acknowledgments following that drop.
2424 }
2425 }
2426 else
2427 {
2428 assert(dropped_pkts == 0);
2429 assert(cong_ctl_dropped_pkts == 0);
2430 }
2431
2432 if (clean_acked_packets != 0)
2433 {
2434 assert(clean_acked_bytes != 0); // Empty blocks not allowed (should have been eliminated by now).
2435 assert(!clean_acked_packet_events.empty());
2436
2437 // Report individual (clean) acks to congestion control.
2438 for (const auto& [rtt, bytes, cwnd_bytes] : clean_acked_packet_events)
2439 {
2440 FLOW_LOG_TRACE("cong_ctl [" << sock << "] update: clean individual acknowledgment: "
2441 "[" << sock->bytes_blocks_str(bytes) << "] with RTT [" << round<milliseconds>(rtt) <<
2442 "] and sent_cwnd_bytes [" << cwnd_bytes << "].");
2443
2444 sock->m_snd_cong_ctl->on_individual_ack(rtt, bytes, cwnd_bytes);
2445 }
2446
2447 FLOW_LOG_TRACE("cong_ctl/bw_est [" << sock << "] update: clean acknowledgments: "
2448 "[" << sock->bytes_blocks_str(clean_acked_bytes) << "] = "
2449 "[" << clean_acked_packets << "] packets.");
2450
2451 // Report the totality of (clean) acks to congestion control and bandwidth estimator.
2452 sock->m_snd_bandwidth_estimator->on_acks(clean_acked_bytes);
2453 sock->m_snd_cong_ctl->on_acks(clean_acked_bytes, clean_acked_packets);
2454 }
2455
2456 /* For debugging it can be useful to log socket state right after loss and handling everything.
2457 * Do so but only if the last time we so logged was some time ago; this is a CPU-intensive
2458 * operation.
2459 *
2460 * Also, register dropped data in snd_stats. */
2461 if (dropped_pkts != 0)
2462 {
2463 // Register that we have converted N bytes over M packets from In-flight to Dropped.
2464 snd_stats.dropped_data(dropped_bytes, dropped_pkts);
2465
2466 const seconds MIN_TIME_BETWEEN_LOGS(1);
2467 const Fine_duration since_last_loss_sock_log = Fine_clock::now() - m_last_loss_sock_log_when;
2468
2469 if (since_last_loss_sock_log > MIN_TIME_BETWEEN_LOGS)
2470 {
2471 FLOW_LOG_INFO("Will log socket state on loss, because last such loss-driven logging was "
2472 "[" << round<milliseconds>(since_last_loss_sock_log) << " >"
2473 " " << MIN_TIME_BETWEEN_LOGS << "] ago.");
2474 sock_log_detail(sock);
2475 m_last_loss_sock_log_when = Fine_clock::now();
2476 }
2477 else
2478 {
2479 FLOW_LOG_INFO("Will NOT log socket state on loss, because last such loss-driven logging was "
2480 "[" << round<milliseconds>(since_last_loss_sock_log) << " <="
2481 " " << MIN_TIME_BETWEEN_LOGS << "] ago.");
2482 }
2483 }
2484
2485 // Log the send window state after the above changes (if at least TRACE enabled).
2486 log_snd_window(sock);
2487
2488 /* Handle possible effect of above activities on the Drop Timer. (It may get disabled or restarted anew.)
2489 * Why not just do this right when we erase the associated packets from snd_flying_pkts*? Answer: We don't want to
2490 * trigger disruptive behavior like possibly retransmitting everything in the middle of all that accounting
2491 * which is not yet complete. Now it's complete, so it's the right time to handle this.
2492 *
2493 * Recall that snd_flying_pkts* have been updated and no longer contain the associated packets' info. */
2494
2495 const Drop_timer::Ptr drop_timer = sock->m_snd_drop_timer;
2496 drop_timer->start_contemporaneous_events();
2497
2498 for (const auto pkt_order_num : flying_now_acked_pkts)
2499 {
2500 drop_timer->on_ack(pkt_order_num);
2501 drop_timer->on_packet_no_longer_in_flight(pkt_order_num);
2502 }
2503 for (const auto pkt_order_num : pkts_marked_to_drop)
2504 {
2505 drop_timer->on_packet_no_longer_in_flight(pkt_order_num);
2506 }
2507
2508 drop_timer->end_contemporaneous_events();
2509
2510 /* As advertised, handle the rcv_wnd update: the latest ACK we are handling here contains the
2511 * latest info about the available Receive buffer space on the other side. */
2512 if (sock->m_snd_pending_rcv_wnd != sock->m_snd_remote_rcv_wnd)
2513 {
2514 FLOW_LOG_TRACE("Other side advertised "
2515 "rcv_wnd change [" << sock->m_snd_remote_rcv_wnd << "] => [" << sock->m_snd_pending_rcv_wnd << "].");
2516 sock->m_snd_remote_rcv_wnd = sock->m_snd_pending_rcv_wnd;
2517 /* Why have this intermediate m_snd_pending_rcv_wnd thing at all then? Answer: can_send(),
2518 * checked at the start of this method and saved into could_send_before_acks, uses the "before
2519 * handling the ACKs" state, which should not yet include the receive window update. Then
2520 * since we update m_snd_remote_rcv_wnd after that is computed, but before can_send() is
2521 * re-checked just below, we are able to see if the ACKs have changed can_send() from false to
2522 * true. */
2523
2524 /* Register whether, after this window update, the Receive window is so small that even with
2525 * a packet to send and no data In-flight we could not send one full DATA packet (i.e., can_send()
2526 * would return false). That is, register whether the Receive window is ~0. */
2527 sock->m_snd_stats.updated_rcv_wnd(sock->m_snd_remote_rcv_wnd < sock->max_block_size());
2528 }
2529
2530 /* We've received ACKs and thus have quite likely reduced the number of bytes we
2531 * consider In-flight. Moreover we may have increased CWND. Moreover we may have added packets
2532 * to retransmit queue (if retransmission is on). Moreover we may have increased m_snd_remote_rcv_wnd.
2533 * Therefore can_send() may now return true while at the beginning of the method it returned
2534 * false; and similarly for snd_deqable(). So have send_worker() check and send more if possible.
2535 * See Node::send() for discussion of overall strategy on this topic. */
2536 if ((!could_send_before_acks) || (rexmit_on && (!had_rexmit_data_before_acks)))
2537 {
2538 send_worker(sock, true);
2539 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2540 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2541 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2542 }
2543} // Node::handle_accumulated_acks()
2544
2545 bool Node::categorize_individual_ack(const Socket_id& socket_id, Peer_socket::Ptr sock,
2546 Ack_packet::Individual_ack::Const_ptr ack,
2547 bool* dupe_or_late, Peer_socket::Sent_pkt_ordered_by_when_iter* acked_pkt_it)
2548{
2549 assert(dupe_or_late);
2550 assert(acked_pkt_it);
2551
2552 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
2553 * as of this writing. It figures out whether the given individual ack is invalid, valid but duplicate/late, or
2554 * valid and on-time. Results go into the return value and *dupe_or_late and *acked_pkt_it. */
2555
2556 /* Now to discuss what happens when an ACK is received, with a seemingly valid sequence number
2557 * (i.e., in [m_snd_init_seq_num + 1, m_snd_next_seq_num - 1] range) -- but the corresponding
2558 * packet is not in m_snd_flying_pkts_by_seq_num. What does this mean? One, unlikely, possibility is
2559 * that it's a fake/wrong acknowledgment, not pertaining to any packet we'd sent but in the range
2560 * of sequence numbers we did send (in other words, the sequence number is in the right range but
2561 * doesn't correspond to a first sequence number of a packet we'd really sent). Unfortunately we
2562 * have no way to detect that fully, since it's not in m_snd_flying_pkts_by_seq_num, and that's basically the only
2563 * place we store packet boundaries of sent packets. Suppose we eliminate that possibility.
2564 *
2565 * Then the only remaining possibility is that this acknowledgment is a duplicate of a previous
2566 * one, which had caused us to remove that packet from m_snd_flying_pkts_by_seq_num. So, how DO we handle
2567 * a duplicate acknowledgment? We already know they got packet, as we've already measured RTT
2568 * from the previous copy of this ack, so there's nothing useful for us. Conclusion: ignore
2569 * duplicate acknowledgments.
2570 *
2571 * Note that the above discussion pertains to a dupe ack where both the sequence number and the
2572 * retransmission ID are the same as a previous one. If the retransmission ID is different (only
2573 * legal when retransmission is enabled), that's a different situation -- the acknowledgment is
2574 * not duplicate but rather acknowledging a different send attempt for the same-numbered packet.
2575 * That is less of a corner case and is handled below explicitly.
2576 *
2577 * Sent, unacknowledged packets are eventually considered Dropped. In terms of our data structures
2578 * they are handled just like acknowledged ones. Therefore, an acknowledgment of such a Dropped
2579 * packet may arrive. This is a "late" acknowledgment. It is treated just like a duplicate
2580 * acknowledgment (in fact, there is no way to tell them apart). (Note that a packet is still
2581 * considered Dropped even if retransmission is on -- it's just that in that case it's also queued
2582 * on the retransmission queue to be re-sent when possible.)
2583 *
2584 * Another caveat is that two acknowledgments that are duplicates of each other can get
2585 * mis-ordered and thus arrive in opposite order. Thus the one with the longer one-way time would
2586 * yield the higher RTT, while the shorter one would get ignored. However, RTT measurement is an
2587 * art, not a science, so this is acceptable.
2588 *
2589 * @todo Acknowledgments themselves could actually be identified with something other
2590 * than sequence numbers and retransmission IDs; e.g., with reflected sender time stamps. Then
2591 * one could do fancier stuff... but let's not overdo it for now. */
2592
2593 // For brevity and a little speed:
2594 const bool rexmit_on = sock->rexmit_on();
2595 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
2596 auto& snd_flying_pkts_by_seq = sock->m_snd_flying_pkts_by_seq_num;
2597 auto& snd_stats = sock->m_snd_stats;
2598
2599 // First sequence number in acknowledged packet.
2600 const Sequence_number& seq_num = ack->m_seq_num;
2601 // Retransmission ID (0 = first attempt, 1 = 1st retransmission, 2 = 2nd, ...).
2602 const unsigned int rexmit_id = ack->m_rexmit_id;
2603 assert(rexmit_on || (rexmit_id == 0)); // Should be guaranteed by deserialization.
2604
2605 // Register one individual acknowledgment of unknown # of bytes of data (may or may not be acceptable).
2606 snd_stats.received_ack();
2607
2608 /* Ensure it's within the range of sequence numbers we've already sent.
2609 * Note that this doesn't really guarantee its validity. It could be in that range but still
2610 * not correspond to any packet we'd actually sent. We try to detect that below. */
2611
2612 if (!util::in_open_open_range(sock->m_snd_init_seq_num, seq_num, sock->m_snd_next_seq_num))
2613 {
2614 /* Either the other side is an a-hole, or somehow a socket_id was reused from a recent
2615 * connection, which we do try to avoid like the plague. Therefore, send them an RST and
2616 * abort connection. If they send more data packets to this port (which is quite possible;
2617 * many could already be on the way), they'll get more RSTs still. */
2618
2619 // Interesting/rare enough to log a WARNING.
2620 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
2621 "Received [ACK]; "
2622 "acknowledgment [" << seq_num << ", ...) is outside (ISN, snd_next) "
2623 "range (" << sock->m_snd_init_seq_num << ", " << sock->m_snd_next_seq_num << ").");
2624
2625 // Register one individual acknowledgment of unknown # of bytes of data (not acceptable due to error).
2626 snd_stats.error_ack();
2627
2628 /* Close connection in our structures (inform user if necessary as well). Pre-conditions
2629 * assumed by call: sock in m_socks and sock->state() == S_OPEN (yes, since m_int_state ==
2630 * S_ESTABLISHED); 3rd arg contains the reason for the close (yes). This will empty the Send
2631 * and Receive buffers. That is OK, because this is the abrupt type of close (error). */
2632 rst_and_close_connection_immediately(socket_id, sock,
2634 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2635 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2636 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2637 return false; // Other out-params are meaningless.
2638 }
2639 // else within sane range.
2640
2641 // Check if the sequence number matches that of one of the packets we've sent and want acknowledged.
2642 *acked_pkt_it = snd_flying_pkts_by_when.find(seq_num);
2643 if (*acked_pkt_it == snd_flying_pkts_by_when.past_oldest()) // A/k/a end().
2644 {
2645 /* No such packet. Assuming no foul play/dumbassery, it's probably a duplicate acknowledgment
2646 * (i.e., we've already sent and got the ack, removing that packet from snd_flying_pkts*)
2647 * or a late acknowledgment (i.e., we've already sent and eventually considered Dropped the
2648 * packet, removing it from snd_flying_pkts*).
2649 *
2650 * There is a corner case if retransmission is on. Suppose we sent packet P, consider it
2651 * Dropped (removing it from snd_flying_pkts*), and thus we place it on retransmission
2652 * queue. Suppose there is not enough CWND space to send it right away, so while it's pending
2653 * on that queue, we now get a late ack for it. Ideally in this case we'd remember it was in
2654 * retransmission queue, remove it from there, and basically act as if we hadn't removed it
2655 * from snd_flying_pkts* and got the ack for it. Instead we're just going to ignore this
2656 * information and needlessly retransmit. So why do this? Answer: It is troublesome to
2657 * design and code this. The part where we wouldn't retransmit it is fairly straightforward
2658 * and is a nice @todo. However acting as if it was normally ACKed after all is complex; for
2659 * instance, since we thought it was Dropped, we already informed m_cong_ctl of the loss event
2660 * -- how can we undo that in a clean way? It does not seem worth it. Again, checking
2661 * and updating the retransmission queue, though, is a nice @todo (but would ideally need fast
2662 * lookup into that linked list so not totally trivial).
2663 *
2664 * So, let's say that the concession described in the previous paragraph is OK.
2665 *
2666 * Could also be invalid. We only know seq_num (one boundary of packet), so how do we detect
2667 * it's invalid? One case where we know it's invalid is if this left boundary happens to be
2668 * straddled by a sequence number range in an element of snd_flying_pkts_by_seq. That would mean
2669 * that the same sequence number is in two different packets, which is in no way legal.
2670 * Example: we sent [5, 10), then received ACK with [7, ...). 7 is inside [5, 10) and is
2671 * thus illegal. */
2672
2673 /* Here's the technique we use. snd_flying_pkts_by_seq.upper_bound(S) gets the first packet
2674 * [U1, U2) such that U1 > S. Let prev(P) denote the packet preceding P in
2675 * snd_flying_pkts_by_seq; let prev([U1, U2)) = [L1, L2). Note that [U1, U2) may not exist
2676 * -- i.e., nothing after S is in the map. If so, [U1, U2) == snd_flying_pkts_by_seq.end(). Even
2677 * in that case [L1, L2) = prev([U1, U2)) MAY still exist; it is the last element of
2678 * snd_flying_pkts_by_seq in that situation.
2679 *
2680 * Given that, here are all the situations that mean P is straddled by a packet:
2681 *
2682 * - S inside [U1, U2) or any packet after it.
2683 * - Impossible. U1 > S by definition; so S is not inside any packet at U1 or later.
2684 * - S inside [L1, L2).
2685 * - Possible. We know S > L1, since otherwise S <= L1, which means we can't be inside this
2686 * if (and we are), or snd_flying_pkts_by_seq.upper_bound(S) == [L1, L2) (not true, since
2687 * snd_flying_pkts_by_seq.upper_bound(S) == [U1, U2), which != [L1, L2)). So, since S > L1,
2688 * we must check for S < L2. If true, S is straddled.
2689 * - S inside some packet [K1, K2) before [L1, L2).
2690 * - Impossible. Suppose S is inside [K1, K2) immediately preceding [L1, L2). Then
2691 * snd_flying_pkts_by_seq.upper_bound(S) == [L1, L2). But we already know
2692 * snd_flying_pkts_by_seq.upper_bound(S) == [U1, U2) (which != [L1, L2)). So that's
2693 * impossible. Repeat this logic for all packets [K1, K2) preceding [L1, L2) to show that
2694 * it can't be straddled by any of those either.
2695 *
2696 * Therefore, S is straddled by a packet if and only if:
2697 * - prev(snd_flying_pkts_by_seq.upper_bound(S)) exists; call it [L1, L2); and
2698 * - S < L2.
2699 *
2700 * This can be further restated as:
2701 * - snd_flying_pkts_by_seq.upper_bound(S) != snd_flying_pkts_by_seq.begin(); and
2702 * - (letting [L1, L2) = prev(snd_flying_pkts_by_seq.upper_bound(S)))
2703 * S < L2.
2704 *
2705 * So check for that. */
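    /* A concrete (purely illustrative) run of that check, reusing the earlier example: suppose
     * snd_flying_pkts_by_seq holds packets [5, 10) and [12, 15), and the ack carries S = 7.
     *   upper_bound(7) -> [12, 15)   (first packet whose first seq. num > 7; it is not begin(), so L exists);
     *   prev(...)      -> [L1, L2) = [5, 10);
     *   7 < L2 = 10    -> S is straddled -> the ack is illegal -> warn and RST/close below.
     * With S = 10 or S = 11 instead, prev(upper_bound(S)) is still [5, 10), but S >= L2 = 10, so there is
     * no straddling, and we fall through to the duplicate/late-acknowledgment handling further below. */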
2706
2707 // Find U.
2708 Peer_socket::Sent_pkt_ordered_by_seq_const_iter pkt_it = snd_flying_pkts_by_seq.upper_bound(seq_num);
2709 // Check that prev(U) exists.
2710 if (pkt_it != snd_flying_pkts_by_seq.begin())
2711 {
2712 // prev(U) = L exists. Compute L.
2713 --pkt_it;
2714 // Compute [L1, L2), and check for straddling: S < L2. pkt_it->second points into snd_flying_pkts_by_when.
2715 Sequence_number l1, l2;
2716 get_seq_num_range(pkt_it->second, &l1, &l2);
2717
2718 assert(l1 < seq_num); // Sanity-check of upper_bound().
2719 if (seq_num < l2)
2720 {
2721 // Straddles. Other side is sending us bad stuff. As above, warn and RST/close.
2722
2723 // Register one individual acknowledgment of unknown # of bytes of data (not acceptable due to error).
2724 snd_stats.error_ack();
2725
2726 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
2727 "Received [ACK]; "
2728 "acknowledgment [" << seq_num << ", ...) is at least partially inside "
2729 "packet [" << l1 << ", " << l2 << ").");
2730 rst_and_close_connection_immediately(socket_id, sock, error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE, true);
2731 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2732 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2733 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2734 return false; // Other out-params are meaningless.
2735 }
2736 // else if (seq_num >= l2) { It's past [L1, L2); does not straddle. }
2737 }
2738 // else { Legit because there is no packet L that could possibly straddle seq_num. }
2739
2740 /* OK, so NOW do we know it's a duplicate/late acknowledgment? Well, no. Suppose we sent packet
2741 * [5, 10) and get ACK with [5, ...). That's fine. So we erase [5, 10) from
2742 * snd_flying_pkts_by_seq. Now say we get ACK with [7, ...). Well, that's in the
2743 * (m_snd_init_seq_num, m_snd_next_seq_num) range certainly; and it doesn't get straddled by
2744 * any member of snd_flying_pkts_by_seq. Yet it's certainly invalid: we never sent (and could've
2745 * never sent) [7, ...). We can't know that, however, since [5, 10) is gone from
2746 * snd_flying_pkts_by_seq. Is this OK? More or less, yes. What do we do with a duplicate/late
2747 * acknowledgment just below? We log and ignore it. That doesn't seem harmful. NORMALLY
2748 * when something is invalid we'd RST and close connection, but here we can't know we should
2749 * do that; however ignoring it still seems fine and better than doggedly inventing data
2750 * structures to detect this corner case.
2751 *
2752 * What about m_snd_cong_ctl? Should we report this in m_snd_cong_ctl->on_acks()?
2753 * No. on_acks() specifically documents that it wants info on
2754 * In-flight->Acknowledged acknowledgments, not duplicates. (Briefly,
2755 * that's because it's measuring sent data in the pipe; acknowledgment duplication has unclear
2756 * implications about what it's acknowledging; it is unlikely that it represents more pipe
2757 * being available than if only one acknowledgment had been received. In any case this should
2758 * hopefully be pretty rare and thus not too significant either way.)
2759 *
2760 * Same reasoning for not counting it in m_snd_bandwidth_estimator->on_acks(). */
2761
2762 // Per above discussion, ignore duplicate (or maybe invalid, but we can't know/assume that) acknowledgment.
2763
2764 // Register one individual acknowledgment of unknown # of bytes of data (late, dupe, or maybe invalid).
2765 snd_stats.late_or_dupe_ack();
2766
2767 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2768 "Acknowledged packet [" << seq_num << ", ...) is duplicate or late (or invalid). "
2769 "RTT unknown. Ignoring.");
2770
2771 // Ensure out-params indicating a dupe/late ack such that the packet being acked is not known.
2772 *dupe_or_late = true;
2773 assert(*acked_pkt_it == snd_flying_pkts_by_when.past_oldest()); // A/k/a end().
2774 return true;
2775 } // if (seq_num is not in snd_flying_pkts*) // i.e., duplicate/late acknowledgment with unknown acked packet.
2776 // else if (seq_num IS in snd_flying_pkts*): *acked_pkt_it points to snd_flying_pkts_by_when[seq_num].
2777 assert(*acked_pkt_it != snd_flying_pkts_by_when.past_oldest());
2778
2779 // It's an ack of a sequence number we'd sent, but if retransmission is on it may not be of the one we LAST sent.
2780
2781 const Peer_socket::Sent_packet& acked_pkt = *((*acked_pkt_it)->second);
2782 const unsigned int acked_rexmit_id = rexmit_on ? acked_pkt.m_packet->m_rexmit_id : 0;
2783 Sequence_number seq_num_end; // Get sequence number just past last datum in packet.
2784 get_seq_num_range(*acked_pkt_it, 0, &seq_num_end);
2785
2786 // Note that both rexmit_id and acked_rexmit_id are guaranteed 0 at this point if !rexmit_on.
2787
2788 if (rexmit_id > acked_rexmit_id)
2789 {
2790 // This is entirely illegal. Can't acknowledge a packet copy we hadn't sent yet.
2791 FLOW_LOG_WARNING("NetFlow worker thread working on [" << sock << "]. "
2792 "Acknowledged packet [" << seq_num << ", " << seq_num_end << ") "
2793 "rexmit_id [" << int(rexmit_id) << "] "
2794 "exceeds highest sent rexmit_id [" << int(acked_rexmit_id) << "].");
2795 rst_and_close_connection_immediately(socket_id, sock, error::Code::S_SEQ_NUM_ARITHMETIC_FAILURE, true);
2796 /* ^-- defer_delta_check == true: because the only way to get to this method is from
2797 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
2798 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
2799 return false; // Other out-params are meaningless.
2800 }
2801 // else if (rexmit_id <= acked_rexmit_id)
2802
2803 if (rexmit_id != acked_rexmit_id)
2804 {
2805 assert(rexmit_id < acked_rexmit_id);
2806
2807 /* This is legal: it's possible we had sent packet P, considered it Dropped, retransmitted it
2808 * (thus incrementing rexmit_id), and have now received a late acknowledgment of the
2809 * PREVIOUS attempt to send P (before retransmission). We could actually consider this
2810 * entirely equivalent to simply getting the last attempt acked. In fact I specifically kept
2811 * an array for m_sent_when, so that we can even compute accurate RTT. Yet, at least for now,
2812 * I am going to ignore such an acknowledgment. Reasons:
2813 *
2814 * - The RTT may be an outlier affected by some random event; we considered it Dropped, so
2815 * if those heuristics are generally sound, getting a late ack is suspicious.
2816 *
2817 * - Suppose I do take the RTT and report to congestion control, use for SRTT computation,
2818 * and remove from snd_flying_pkts*. I've in effect recorded a loss but then also
2819 * reported a successful retransmission, even though the ack is not for the retransmission
2820 * but more like a correction on the original loss. That's potentially fine, but chances
2821 * are I will soon receive the ack for the latest transmission, which is what I was really
2822 * expecting. That one will now be considered a late ack and will be ignored, even though
2823 * that RTT is actually probably more accurate, since chances are it arrived before the
2824 * retransmission would've been considered Dropped as well. So, basically, we're kind of
2825 * trying to use the "two wrongs make a right" philosophy, which seems messy.
2826 *
2827 * - Earlier in the method, I mentioned that if we detect P as dropped and queue it for
2828 * retransmission but get P acked *before* we get a chance to retransmit, then we consider
2829 * that ack as late and ignore it (and will still retransmit P). The reasons for that are
2830 * given in that comment. However, given that we made that decision, it would seem
2831 * strange to follow a different philosophy just because we did happen to get to
2832 * retransmit P. That would be inconsistent.
2833 *
2834 * - Keeping it in perspective, it should be fairly rare that a packet we considered Dropped
2835 * is acked after all. So it is perhaps not worth the trouble to go crazy about this
2836 * corner case.
2837 *
2838 * Nevertheless, a @todo would be to experimentally measure the effect of this policy and
2839 * decide whether it is sound. In that case also consider the aforementioned "P is acked
2840 * after queued for retransmission but before retransmitted" corner case. */
2841
2842 // Register one individual acknowledgment of unknown # of bytes of data (late).
2843 snd_stats.late_or_dupe_ack();
2844
2845 FLOW_LOG_INFO("NetFlow worker thread working on [" << sock << "]. "
2846 "Acknowledged packet [" << seq_num << ", " << seq_num_end << ") "
2847 "order_num [" << acked_pkt.m_sent_when[rexmit_id].m_order_num << "] "
2848 "rexmit_id [" << int(rexmit_id) << "] "
2849 "is less than highest sent [" << int(acked_rexmit_id) << "]. Ignoring.");
2850
2851 // Ensure out-params indicating a dupe/late ack of a specific known sent packet.
2852 *dupe_or_late = true;
2853 assert(*acked_pkt_it != snd_flying_pkts_by_when.past_oldest()); // A/k/a end().
2854 return true;
2855 }
2856 // else
2857 assert(rexmit_id == acked_rexmit_id);
2858
2859 // Do not log this mainstream case; only the exceptions above. RTT will probably be logged separately.
2860
2861 // Register one individual acknowledgment of N bytes of data (converts from In-flight to Acknowledged).
2862 snd_stats.good_ack(acked_pkt.m_size);
2863
2864 // Ensure out-params indicating an in-time, first ack of a specific known sent packet.
2865 *dupe_or_late = false;
2866 assert(*acked_pkt_it != snd_flying_pkts_by_when.past_oldest()); // A/k/a end().
2867 return true;
2868} // Node::categorize_individual_ack()
2869
2870 Fine_duration Node::compute_rtt_on_ack(Peer_socket::Sent_packet::Const_ptr flying_pkt,
2871 const Fine_time_pt& time_now,
2872 Ack_packet::Individual_ack::Const_ptr ack,
2873 const Peer_socket::Sent_packet::Sent_when** sent_when) const
2874{
2875 using boost::chrono::milliseconds;
2876 using boost::chrono::round;
2877
2878 Fine_duration round_trip_time;
2879
2880 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
2881 * as of this writing. It computes the RTT implied by the given individual ack and also returns the Sent_when
2882 * (which contains info on when the original packet was sent) structure as an out-param. */
2883
2884 /* RTT subtleties:
2885 *
2886 * How long did the other side, upon receiving the acked packet, wait before sending this
2887 * containing ACK with that individual acknowledgment? Why do we care? For RTT. Why do we
2888 * want RTT? To measure how long it takes for a sent packet to reach the receiver (one-way trip
2889 * time, or OWTT). Since measuring OWTT is quite hard/impossible due to lack of absolute clock
2890 * synchronization between us and the receiver, RTT/2 is used as the next best way to get OWTT.
2891 * We can measure RTT by subtracting our recorded packet send time from the current time (ACK
2892 * receipt time). However, the ACK delay introduced by the receiver to reduce ACK overhead has
2893 * nothing to do with OWTT; it just (randomly, from the other side's point of view) inflates the RTT.
2894 * Thus we subtract the ACK delay from the RTT to get the actual RTT we use for congestion control, etc. */
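  /* A quick numeric illustration (made-up values): we sent the DATA packet at sent_time = 1000 ms on
   * our Fine_clock; the ACK carrying its individual acknowledgment arrives at time_now = 1060 ms; and
   * the receiver reports it delayed that acknowledgment by ack_delay = 20 ms before sending the ACK.
   * Then RTT = time_now - sent_time - ack_delay = 1060 - 1000 - 20 = 40 ms, and OWTT is estimated as
   * RTT/2 = 20 ms. Without subtracting ack_delay we would have reported 60 ms, inflating the RTT
   * through no fault of the network path. */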
2895
2896 const unsigned int rexmit_id = ack->m_rexmit_id;
2897 // Get the RTT for the transmission attempt that is actually being acknowledged (always 0 if retransmission off).
2898 *sent_when = &(flying_pkt->m_sent_when[rexmit_id]);
2899 const Peer_socket::order_num_t order_num = (*sent_when)->m_order_num;
2900
2901 /* A potentially lower-resolution value (ack_delay) is subtracted from max-resolution values. If ack_delay is
2902 * also in the max-resolution time unit, then there is no loss of precision. Otherwise we lose
2903 * precision by subtracting a number with fewer significant digits from one with more
2904 * significant digits. So Ack_delay_time_unit should ideally be Fine_duration, for precise RTT
2905 * values (especially for queueing delay-based congestion control algorithms); however that
2906 * decision is discussed elsewhere (Low_lvl_packet). */
2907 const auto& ack_delay = ack->m_delay;
2908 round_trip_time = time_now - (*sent_when)->m_sent_time - ack_delay;
2909
2910 if (round_trip_time.count() < 0)
2911 {
2912 /* Because this combines measurements on both sides, and each may have some error (plus or
2913 * minus a few hundred microseconds, possibly), and the result can be quite close to zero in
2914 * extremely low-latency situations, this may come out to be negative. So assume zero and
2915 * log a TRACE message at most.
2916 *
2917 * @todo Should we put also a ceiling on the RTT?
2918 * @todo For the floor, maybe it's better to use a higher guess than zero? */
2919 FLOW_LOG_TRACE("Acknowledged packet [" << ack->m_seq_num << ", ...) "
2920 "order_num [" << order_num << "] has negative "
2921 "RTT [" << round_trip_time << "]; assuming zero. "
2922 "Sent at [" << (*sent_when)->m_sent_time << "]; "
2923 "received at [" << time_now << "]; "
2924 "receiver-reported ACK delay [" << ack_delay << "].");
2925 round_trip_time = Fine_duration::zero();
2926 }
2927 FLOW_LOG_TRACE("Acknowledged packet [" << ack->m_seq_num << ", ...) "
2928 "order_num [" << order_num << "] "
2929 "has RTT [" << round<milliseconds>(round_trip_time) << "] "
2930 "(ACK delay [" << round<milliseconds>(ack_delay) << "]).");
2931
2932 return round_trip_time;
2933} // Node::compute_rtt_on_ack()
2934
2935 Peer_socket::Sent_pkt_ordered_by_when_iter
2936 Node::categorize_pkts_as_dropped_on_acks(Peer_socket::Ptr sock,
2937 const boost::unordered_set<Peer_socket::order_num_t>& flying_now_acked_pkts)
2938{
2939 using std::priority_queue;
2940
2941 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
2942 * as of this writing. The background is that once a set of individual acks has been processed in the sense that
2943 * sock->m_snd_flying_pkts* (which tracks In-flight outbound DATA packets) has been updated by removing the
2944 * acked packets (they are no longer In-flight), it's time to also recategorize certain further In-flight
2945 * packets as Dropped -- the intuition being that once N packets sent LATER than a given packet P have been
2946 * acked, it's highly probable that P has been Dropped by the network. This method determines the packets to drop
2947 * in that fashion.
2948 *
2949 * Now, as explained below, when ack set S causes packet set P' to be Dropped, this (possibly null) set P'
2950 * always has the following form: there is some particular packet P which is the most-recently-sent one
2951 * that is in P'; and therefore ALL other In-flight packets sent before P must be dropped too and also are in P'.
2952 * Thus P is necessary/sufficient to specify P'. Thus this method simply finds and returns a thing pointing to P. */
2953
2954 // For brevity and a little speed:
2955 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
2956
2957 /* OK, snd_flying_pkts* has been updated, in that we've removed any Sent_packet entries
2958 * corresponding to valid acknowledgments in this ACK. As promised elsewhere we should also update
2959 * the remaining Sent_packets' m_acks_after_me entries and erase any Sent_packets that we consider
2960 * Dropped due to too high m_acks_after_me values. (As in TCP Fast Retransmit/Recovery, an
2961 * unacknowledged packet is considered Dropped based on the heuristic that a few packets with
2962 * higher sequence numbers have been acknowledged. Except since we have Sent_when, that should be
2963 * even better than using sequence number ordering as TCP would.)
2964 *
2965 * (Warning: below and nearby, I make pseudo-code-y leaps, such as saying flying_now_acked_pkts stores
2966 * Sent_whens when really it stores order_nums; just bear with me by trusting that it makes the logic
2967 * easier to explain, and that the actual structure in code is sufficiently
2968 * similar to the wording here to not make a salient difference in practice.)
2969 *
2970 * Consider the two structures we have now. snd_flying_pkts_by_when (call
2971 * it F) is a collection of Sent_packets, each with Sent_packet::m_acks_after_me, ordered by decreasing
2972 * Sent_when. flying_now_acked_pkts (call it C) is an unordered collection that contains each Sent_when
2973 * (i.e., reference to a send-packet attempt) that has been ACKed. That is, flying_now_acked_pkts tells us
2974 * by Sent_when which exact send attempts from the past are acknowledged in this set of accumulated acks.
2975 *
2976 * Even less formally -- just for sanity's sake -- F are In-flight packets; C are just-acked packets that were
2977 * very recently in F. C may be interleaved among F if viewed in increasing Sent_when order:
2978 * e.g., [ F F F F C F C F C C ] (where F represents a still-In-flight send attempt, or an F element;
2979 * C a just-acked send attempt, thus a C element; and the order is from earlier/lower Sent_when to
2980 * later/higher Sent_when).
2981 *
2982 * Note that, conceptually, the key sets (Sent_when values) in F and C are disjoint,
2983 * since each send attempt has a unique Sent_when value (because it at least consists of a unique m_order_num).
2984 * How do we correctly yet efficiently increment m_acks_after_me (call it A) for each
2985 * element in F to represent the new ackage? First observe that if F[t].A is incremented by N, then
2986 * F[prev(t)].A should be incremented by N PLUS the number of acks for all packets sent at times in range
2987 * (prev(t), t), where prev(t) is the element of F with the next lower (earlier) Sent_when.
2988 * Consider the example scoreboard above, [ F F F F C F# C F* C C ]. F*.A is incremented by 2, because
2989 * plainly there are two Cs after it. Therefore, the preceding F, which is F#,
2990 * is also incremented by 2; plus another 1, because there is another C (recall, simply another acknowledgment)
2991 * between F# and F*. And so it goes for all the Fs. Side/sanity note: The range is (prev[t], t), not
2992 * [prev(t), t), simply because F and C are disjoint; and prev(t) by definition is in F (hence not in C, hence
2993 * no ack for that seq. #).
2994 *
2995 * This suggests a simple inductive algorithm, wherein the latest F element's F[t].A is incremented by I, which
2996 * is the count of C elements with Sent_when > t; memorize I; now for each progressively older F[t],
2997 * count C elements in (t, next(t)) and increment F[t].A by I += <that count>. Repeat until all Fs incremented.
2998 * Ultimately I = # of new valid, acknowledgments. (Recall: scoreboard cannot begin with any Cs, [C C ... ], as
2999 * such a C would be acking a non-In-flight send attempt, so a dupe, and we specifically eliminate dupes from
3000 * consideration before inserting into C.) So that's O(F.size()) increment operations.
3001 *
3002 * OK, but how do we get this "count of acks between t and next(t)"? Let t be the last element of
3003 * F. For it, that count is the count of all keys > t in C (i.e., the total # of acks for all
3004 * packets sent after t). Let the lowest such key (Sent_when value) be `s`. Now let t' = prev(t) as before.
3005 * For t', the count of acks sent in (t', t) is the count of all elements in C with keys
3006 * in (s', s), where s' is again the lowest key > t'. Having counted that, set s = s', and repeat for each key t' of F.
3007 *
3008 * Of course, for that to be practical, C would need to be sorted by Sent_when. Since in reality it's not sorted,
3009 * we could first sort it in O(k log k) operations, worst-case, k = C.size(). More convenient, however, is to
3010 * construct a priority queue (heap) from C; then keep popping the Sent_whens down to and
3011 * including s at each step. That's O(k) to make the heap and O(k log k) total time spent
3012 * popping it.
3013 *
3014 * The above explanation strikes me as somewhat cryptic, but hopefully the code will clarify it; I was just
3015 * trying to explain why the code works. */
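  /* To make that concrete, run the algorithm on the earlier toy scoreboard [ F F F F C F# C F* C C ],
   * giving the send attempts order numbers 1 (oldest) through 10 (newest), so C = {5, 7, 9, 10}
   * (illustrative numbers only). high_ack_count_q starts as a max-heap over {5, 7, 9, 10}, and I = 0:
   *   - F* (order 8): pop 10, then 9  => I = 2; F*.m_acks_after_me += 2.
   *   - F# (order 6): pop 7           => I = 3; F#.m_acks_after_me += 3.
   *   - F  (order 4): pop 5           => I = 4; its m_acks_after_me += 4.
   *   - F  (orders 3, 2, 1): heap empty => each m_acks_after_me += 4.
   * Final I = 4 = total # of new acknowledgments, as claimed; and the heap is popped C.size() times
   * total across the whole walk, giving the O(k log k) bound mentioned above. */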
3016
3017 /* Make heap out of flying_now_acked_pkts; top()/pop() will return the element with the highest (latest) Sent_when.
3018 * Just store the Sent_when values (order numbers) directly in the heap; the built-in operator<() will do
3019 * the right thing since no element's Sent_when equals another element's Sent_when (they were
3020 * stored in a uniquely-keyed dictionary in the first place).
3021 *
3022 * Let cur_sent_pkt be the element of snd_flying_pkts_by_sent_when we're currently
3023 * considering, and it starts at F.newest() and progresses accordingly through F.
3024 * Then, invariant: high_ack_count_q contains the acks for all send attempts P where
3025 * P.m_sent_when < cur_sent_pkt.m_sent_when. In particular, P.m_sent_when.top < cur_sent_pkt.m_sent_when. */
3026 priority_queue<Peer_socket::order_num_t>
3027 high_ack_count_q(flying_now_acked_pkts.begin(), flying_now_acked_pkts.end());
3028
3029 // Invariant: this will be the m_acks_after_me increment applied to the just-considered packet in snd_flying_pkts*.
3030 using ack_count_t = Peer_socket::Sent_packet::ack_count_t;
3031 ack_count_t ack_increment_after_me = 0;
3032
3033 // As explained above, start with the first (latest send time) unacked packet and go forward (earlier and earlier).
3034 Peer_socket::Sent_pkt_ordered_by_when_iter last_dropped_pkt_it;
3035 for (last_dropped_pkt_it = snd_flying_pkts_by_when.newest();
3036 last_dropped_pkt_it != snd_flying_pkts_by_when.past_oldest();
3037 ++last_dropped_pkt_it) // Up to k repetitions.
3038 {
3039 Peer_socket::Sent_packet& cur_sent_pkt = *(last_dropped_pkt_it->second);
3040 const Peer_socket::Sent_packet::Sent_when& cur_pkt_sent_when = cur_sent_pkt.m_sent_when.back();
3041
3042 /* We will increment cur_sent_pkt.m_acks_after_me by ack_increment_after_me + X, where X is
3043 * the total number of acks for packets with send times between cur_pkt_sent_when and the
3044 * cur_pkt_sent_when in the last loop iteration (or infinity if this is the first loop
3045 * iteration). The high_ack_count_q invariant we maintain is that high_ack_count_q holds the
3046 * ack counts for all packets with Sent_when values EXCEPT those >= the previous
3047 * iteration's cur_pkt_sent_when. Therefore, we need only find all elements of high_ack_count_q
3048 * whose Sent_whens are > our cur_pkt_sent_when. Since high_ack_count_q.top() is always the ack
3049 * count with the highest sent_when in that structure (priority queue), we just pop and sum
3050 * until high_ack_count_q.top() < cur_pkt_sent_when. */
3051
3052 // We've just assigned cur_sent_pkt, breaking invariant; pop until it holds again.
3053 while ((!high_ack_count_q.empty()) &&
3054 // Compare order numbers -- they are always unique.
3055 (high_ack_count_q.top() > cur_pkt_sent_when.m_order_num))
3056 {
3057 // Found acked packet with sent_when > cur_pkt_sent_when (but < previous iteration's cur_pkt_sent_when).
3058 ++ack_increment_after_me; // So add that packet's ack.
3059
3060 // And remove it, bringing the next highest entry to the top. O(log k).
3061 high_ack_count_q.pop(); // Note this maintains the invariant that defines high_ack_count_q.
3062 }
3063 // Note we've maintained the invariant defining ack_increment_after_me.
3064
3065 // Hence this many more acks for packets after us have occurred within this ack set.
3066 cur_sent_pkt.m_acks_after_me += ack_increment_after_me;
3067
3068 if (cur_sent_pkt.m_acks_after_me > S_MAX_LATER_ACKS_BEFORE_CONSIDERING_DROPPED)
3069 {
3070 /* Ah ha! For this packet we've exceeded the limit -- we will consider it Dropped. What
3071 * about the next (meaning, earlier-sent) unacknowledged packets? Observe that packets with
3072 * earlier send times MUST (if we were to continue the loop in this manner) end up with
3073 * equal or larger cur_sent_pkt.m_acks_after_me. (Intuitively: any acknowledgment after
3074 * packet P is also after any packet preceding P in the sent_when ordering.) Therefore, we
3075 * can break out of the loop and consider Dropped ALL packets from last_dropped_pkt_it to
3076 * snd_flying_pkts_by_when.past_oldest(). Yay! */
3077
3078 auto const logger_ptr = get_logger();
3079 if (logger_ptr && logger_ptr->should_log(log::Sev::S_TRACE, get_log_component()))
3080 {
3081 Sequence_number cur_pkt_seq_num, cur_pkt_seq_num_end;
3082 get_seq_num_range(last_dropped_pkt_it, &cur_pkt_seq_num, &cur_pkt_seq_num_end);
3083
3084 FLOW_LOG_TRACE_WITHOUT_CHECKING
3085 ("Unacknowledged packet [" << cur_pkt_seq_num << ", " << cur_pkt_seq_num_end << ") "
3086 "order_num [" << cur_pkt_sent_when.m_order_num << "] has "
3087 "had [" << cur_sent_pkt.m_acks_after_me << "] acknowledgments "
3088 "for later packets; considering it and "
3089 "all unacknowledged packets sent earlier as Dropped.");
3090 }
3091
3092 break;
3093 }
3094 // else
3095
3096 // ack_increment_after_me and high_ack_count_q invariants hold, so the next iteration can proceed.
3097 } // for (all elements in snd_flying_pkts_by_when, in decreasing m_sent_when order: newest -> oldest)
3098
3099 return last_dropped_pkt_it;
3100} // Node::categorize_pkts_as_dropped_on_acks()
3101
3102 bool Node::drop_pkts_on_acks(Peer_socket::Ptr sock,
3103 const Peer_socket::Sent_pkt_ordered_by_when_iter& last_dropped_pkt_it,
3104 size_t* cong_ctl_dropped_pkts, size_t* cong_ctl_dropped_bytes,
3105 size_t* dropped_pkts, size_t* dropped_bytes,
3106 std::vector<Peer_socket::order_num_t>* pkts_marked_to_drop)
3107{
3108 // using boost::next; // Still ambiguous for some reason (in clang at least).
3109
3110 /* This helper of handle_accumulated_acks() exists to make the latter method briefer/readable, not for code reuse
3111 * as of this writing. The background is that once a set of individual acks has been processed in the sense that
3112 * sock->m_snd_flying_pkts* (which tracks In-flight outbound DATA packets) has been updated by removing the
3113 * acked packets (they are no longer In-flight), it's time to also recategorize certain further In-flight
3114 * packets as Dropped -- the intuition being that once N packets sent LATER than a given packet P have been
3115 * acked, it's highly probable that P has been Dropped by the network. This method does that (dropping
3116 * all such packets P) and certain related tasks such as tracking the associated loss event(s) for congestion
3117 * control.
3118 *
3119 * Now, as explained elsewhere, when ack set S causes packet set P' to be Dropped, this (possibly null) set P'
3120 * always has the following form: there is some particular packet P which is the most-recently-sent one
3121 * that is in P'; and therefore ALL other In-flight packets sent before P must be dropped too and also are in P'.
3122 * Thus P is necessary/sufficient to specify P'. The last_dropped_pkt_it argument points to P and is determined
3123 * elsewhere and used by this helper. */
3124
3125 // For brevity and a little speed:
3126 const bool rexmit_on = sock->rexmit_on();
3127 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3128 auto& snd_stats = sock->m_snd_stats;
3129
3130 /* Pre-condition: all elements starting with (inclusive) last_dropped_pkt_it (within
3131 * snd_flying_pkts_by_when) should be considered Dropped. If last_dropped_pkt_it ==
3132 * snd_flying_pkts_by_when.past_oldest() a/k/a end(), then none should be considered
3133 * Dropped (i.e., no m_acks_after_me became high enough).
3134 *
3135 * Given that, we have a number of tasks remaining:
3136 *
3137 * 1. Count the total # of packets and bytes now considered Dropped and pass this to congestion control.
3138 * Omit those packets/bytes heuristically determined to belong to a loss event detected in an earlier
3139 * call, namely those for which m_sent_when < m_snd_last_loss_event_when.
3140 * 2. (If retransmission is enabled) Queue those Dropped packets for retransmission in retransmission queue.
3141 * 3. Erase the Dropped packets from snd_flying_packets*.
3142 *
3143 * For (non-asymptotic) performance, ideally we want to traverse snd_flying_pkts_by_when just once,
3144 * computing what's needed for these. Drilling down a bit:
3145 *
3146 * (2) and (3) are simple and involve walking over the Dropped range that has been computed (pre-condition above)
3147 * and adding-elsewhere or erasing those elements, respectively, though (2) must be done in chronological order
3148 * (increasing Sent_when).
3149 *
3150 * (1) is a matter of walking in anti-chronological (decreasing Sent_when) order over that same range, until
3151 * a certain Sent_when threshold is found, and stopping there.
3152 *
3153 * Thus, the kitchen-sink algorithm emerges: walk through Dropped range in decreasing Sent_when order, so
3154 * from last_dropped_pkt_it along snd_flying_pkts_by_when. Accumulate bytes/packets for (1), but stop
3155 * accumulating once m_snd_last_loss_event_when is reached w/r/t m_sent_when. Erase from snd_flying_pkts*
3156 * (carefully, since we are walking along one of them), for (3). And add to the retransmission queue, but in
3157 * reverse order versus the walking order, for (2). */
3158
3159 *dropped_pkts = snd_flying_pkts_by_when.size(); // We will just compute the final value by subtracting "after."
3160 *dropped_bytes = sock->m_snd_flying_bytes; // Ditto.
3161
3162 *cong_ctl_dropped_bytes = 0;
3163 *cong_ctl_dropped_pkts = 0;
3164 bool loss_event_finished = false;
3165
3166 /* We want to add to retransmission queue (if retransmission is on). We also want to traverse
3167 * snd_flying_pkts_by_when in forward newest->oldest order (for convenience and also to efficiently compute
3168 * cong_ctl_dropped_*). However we want to retransmit in reverse order (oldest->newest). So we
3169 * put the packets to retransmit in the latter order into snd_rexmit_q, at the end of the latter.
3170 * So, if it was [ABC], and we dropped [DEF], then we want to insert to yield [ABCFED] (ABC->ABCFED).
3171 * list<>::insert(it, v) will insert `v` before *it and return iterator to just-inserted element.
3172 * So we can memorize the latter and pass it in as `it` in the next insert(), rinse, repeat.
3173 * In the above example: ABC->ABC(D)->ABC(E)D->ABC(F)ED. // () is inserted element.
3174 * ^ ^ ^ // ^ is "fulcrum": insertion point for insertion following next ->.
3175 *
3176 * snd_rexmit_q_fulcrum_it, the insertion point, is so named due to being the "fulcrum" between the old and
3177 * new parts of snd_rexmit_q. History: Used to use a local new list<> here which would be spliced onto
3178 * the real queue at the end; but IMO this is more elegant (and probably a bit speedier). */
3179 auto& snd_rexmit_q = sock->m_snd_rexmit_q;
3180 decltype(sock->m_snd_rexmit_q)::iterator snd_rexmit_q_fulcrum_it = snd_rexmit_q.end();
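 /* Standalone illustration of the insert-at-saved-position trick described above (a minimal sketch with a plain
 * std::list<int>; not part of this function's logic):
 *   std::list<int> l{1, 2, 3};
 *   auto it = l.end();
 *   it = l.insert(it, 10);   // l == {1, 2, 3, 10}
 *   it = l.insert(it, 20);   // l == {1, 2, 3, 20, 10}
 *   it = l.insert(it, 30);   // l == {1, 2, 3, 30, 20, 10}
 * I.e., inserting each new element at the saved position yields the new elements in reverse order of insertion --
 * exactly how the Dropped packets (walked newest->oldest) end up oldest->newest at the end of snd_rexmit_q. */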
3181
3182 // We are to fill this up, so it should not have anything yet.
3183 assert(pkts_marked_to_drop->empty());
3184
3185 auto pkt_it = last_dropped_pkt_it;
3186 while (pkt_it != snd_flying_pkts_by_when.past_oldest())
3187 {
3188 // We can't just ++pkt_it later on, because we are going to erase() at pkt_it soon, invalidating it.
3189 auto next_pkt_it = boost::next(pkt_it);
3190 // Now see end of loop body.
3191
3192 // Accumulate stuff for passing into congestion control at the end.
3193
3194 const Peer_socket::Sent_packet::Ptr sent_pkt = pkt_it->second;
3195 const Peer_socket::Sent_packet::Sent_when& sent_when = sent_pkt->m_sent_when.back();
3196
3197 if (!loss_event_finished)
3198 {
3199 if (// This packet belongs to a PREVIOUS loss event if: some loss event has been detected before this...
3200 (sock->m_snd_last_loss_event_when != Fine_time_pt())
3201 // ...AND this packet was sent before that loss event was detected.
3202 && (sent_when.m_sent_time < sock->m_snd_last_loss_event_when))
3203 {
3204 /* This is the first packet encountered to be part of a previous loss event. If
3205 * retransmission is off, this will also cause the loop to exit. */
3206 loss_event_finished = true;
3207 }
3208 else
3209 {
3210 // Only got here if this packet and all Dropped packets after it are part of a new loss event.
3211 *cong_ctl_dropped_bytes += sent_pkt->m_size;
3212 ++(*cong_ctl_dropped_pkts);
3213 }
3214 }
3215 // else { Already found end of new loss event, if any, so no need to keep looking for it. }
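 /* Worked example of the accounting above (illustrative times only): suppose the last loss event was detected
 * at t = 100, and the Dropped range -- walked newest->oldest -- holds packets sent at t = 130, 120, 90, 80.
 * The t = 130 and t = 120 packets are counted into cong_ctl_dropped_*; at t = 90 the condition above fires
 * (that packet was sent before the previous loss event was detected), loss_event_finished becomes true, and
 * neither that packet nor the t = 80 one is counted -- their loss is attributed to the earlier loss event. */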
3216
3217 // Add to retransmission queue if applicable.
3218
3219 if (rexmit_on)
3220 {
3221 if (!ok_to_rexmit_or_close(sock, pkt_it, true)) // Ensure not too many retransmissions already.
3222 /* ^-- defer_delta_check == true: because the only way to get to this method is from
3223 * async_low_lvl_recv(), which will perform event_set_all_check_delta(false) at the end of itself,
3224 * before the boost.asio handler exits. See Node::m_sock_events doc header for details. */
3225 {
3226 return false; // Already closed/logged/etc.
3227 }
3228 // else
3229
3230 /* Save a ref-counted pointer (to what includes packet data) in retransmission queue. We'll soon remove such
3231 * a pointer from snd_flying_pkts*, lowering the ref-count again. In other words, we are moving the sent-packet
3232 * object from snd_flying_pkts* to snd_rexmit_q (In-flight -> Dropped, queued for retransmission).
3233 *
3234 * Insert at the same position each time to ultimately arrange them in the reversed order that we want. */
3235 snd_rexmit_q_fulcrum_it = snd_rexmit_q.insert(snd_rexmit_q_fulcrum_it, sent_pkt);
3236 ++sock->m_snd_rexmit_q_size;
3237 }
3238
3239 /* Finally, we can erase it from snd_flying_pkts* and adjust snd_flying_bytes.
3240 * Will NOT invalidate other iterators into snd_flying_pkts_by_when.
3241 *
3242 * Also, save in pkts->pkts_marked_to_drop as advertised. */
3243
3244 static_assert
3246 "Scoreboard must not get otherwise changed when a packet is erased.");
3247 pkts_marked_to_drop->push_back(sent_when.m_order_num);
3248 snd_flying_pkts_erase_one(sock, pkt_it);
3249
3250 pkt_it = next_pkt_it;
3251 } // while (pkt_it != snd_flying_pkts_by_when.past_oldest())
3252
3253 // Includes ALL Dropped packets (not just ones from new loss event, if any), so != cong_ctl_dropped_pkts.
3254 *dropped_pkts -= snd_flying_pkts_by_when.size(); // Subtract "after" from "before" to get dropped count.
3255 *dropped_bytes -= sock->m_snd_flying_bytes; // Similar.
3256
3257 if (*cong_ctl_dropped_pkts != 0)
3258 {
3259 // Register that we've detected a NEW loss event (not the same as dropped_data() -- see that elsewhere).
3260 snd_stats.loss_event();
3261 }
3262
3263 return true;
3264} // Node::drop_pkts_on_acks()
3265
3267{
3268 using boost::algorithm::join;
3269 using boost::chrono::symbol_format;
3270 using std::string;
3271 using std::vector;
3272 using std::transform;
3273 using std::ostream;
3274
3275 // We are in thread W.
3276
3277 // This helper of handle_accumulated_acks() just logs the individual acks about to be processed.
3278
3279 // For brevity and a little speed:
3280 using Ack = Ack_packet::Individual_ack;
3281 using Acks = vector<Ack::Ptr>;
3282 const Acks& acked_packets = sock->m_rcv_acked_packets;
3283
3284 auto const logger_ptr = get_logger();
3285 if (logger_ptr && logger_ptr->should_log(log::Sev::S_DATA, get_log_component())) // Very verbose and slow!
3286 {
3287 // Prepare serialization of m_rcv_acked_packets for DATA logging; quite verbose and slow!
3288 vector<string> ack_strs(acked_packets.size());
3289 transform(acked_packets.begin(), acked_packets.end(), ack_strs.begin(),
3290 [](Ack::Const_ptr ack) -> string
3291 {
3292 return util::ostream_op_string('[', ack->m_seq_num, ", ", int(ack->m_rexmit_id), ", ",
3293 symbol_format,
3294 ack->m_delay, ']'); // "ns," not "nanoseconds."
3295 });
3296 const string ack_str = join(ack_strs, " ");
3297
3298 FLOW_LOG_DATA_WITHOUT_CHECKING("NetFlow worker thread working on [" << sock << "]. "
3299 "Accumulated [ACK] packets with "
3300 "acknowledgments [seq_num, rexmit_id, delay]: "
3301 "[" << ack_str << "].");
3302 } // if (DATA)
3303 else
3304 {
3305 FLOW_LOG_TRACE("NetFlow worker thread working on [" << sock << "]. "
3306 "Accumulated [ACK] packets with "
3307 "[" << acked_packets.size() << "] individual acknowledgments.");
3308 }
3309
3310 if (sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED)
3311 {
3312 log_snd_window(sock);
3313 }
3314 // else { Why is this possible? Answer: See handle_accumulated_acks() for explanation near similar check. }
3315} // Node::log_accumulated_acks()
3316
3317void Node::drop_timer_action(Peer_socket::Ptr sock, bool drop_all_packets)
3318{
3319 using std::list;
3320 using boost::prior;
3321
3322 // We are in thread W.
3323
3324 // Since we call m_snd_drop_timer->done() when exiting ESTABLISHED, this should hold.
3325 assert(sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED);
3326
3327 // For brevity and a bit of speed:
3328 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3329 auto& snd_flying_pkts_by_seq = sock->m_snd_flying_pkts_by_seq_num;
3330
3331 // Timer must not be running if there are no In-flight packets. Thus it should not have fired.
3332 assert(!snd_flying_pkts_by_when.empty());
3333
3334 /* Drop Timer fired and is telling us to consider Dropped some packets. If drop_all_packets, then
3335 * it's all of them. Otherwise it's just the earliest unacknowledged packet
3336 * (m_snd_flying_pkts_by_sent_when.begin()). */
3337
3338 // Log details of the In-flight packets before we change things.
3339 log_snd_window(sock);
3340
3341 const bool rexmit_on = sock->rexmit_on();
3342 // To check, at the end, whether we've changed can_send() false => true.
3343 const bool could_send_before_drops = can_send(sock);
3344 // To check, at the end, whether we've changed snd_deqable() false => true.
3345 const bool had_rexmit_data_before_drops = !sock->m_snd_rexmit_q.empty();
3346 // Will store ID of the one packet to drop; reserved value 0 will mean ALL packets are dropped.
3347 Peer_socket::order_num_t packet_marked_to_drop_or_drop_all;
3348
3349 // Used below for congestion control.
3350 size_t cong_ctl_dropped_bytes = 0;
3351 size_t cong_ctl_dropped_pkts = 0;
3352
3353 if (drop_all_packets)
3354 {
3355 cong_ctl_dropped_bytes = sock->m_snd_flying_bytes;
3356 cong_ctl_dropped_pkts = snd_flying_pkts_by_when.size();
3357
3358 // Queue them for retransmission, to be sent as soon as CWND provides enough space (could even be immediately).
3359 if (rexmit_on)
3360 {
3361 // Order is from earliest-sent to latest-sent (retransmission in the same order as transmission).
3362 for (Peer_socket::Sent_pkt_by_sent_when_map::Reverse_iterator pkt_it = snd_flying_pkts_by_when.oldest();
3363 pkt_it != snd_flying_pkts_by_when.past_newest();
3364 ++pkt_it)
3365 {
3366 // The forward iterator F pointing to same list element as reverse iterator R is prior(R.base()). Google it.
3367 if (!ok_to_rexmit_or_close(sock, prior(pkt_it.base()), false)) // Ensure not too many retransmissions already.
3368 /* ^-- defer_delta_check == false: because we were invoked from a timer event. Therefore, we will NOT perform
3369 * event_set_all_check_delta(false) before the boost.asio handler exits. Therefore boost.asio
3370 * may sleep (block) before event_set_all_check_delta(false). Therefore that would delay
3371 * delivery of the event to the user. Therefore force the delta check immediately. See
3372 * Node::m_sock_events doc header for details. */
3373 {
3374 return; // Already closed/logged/etc.
3375 }
3376 // else
3377
3378 sock->m_snd_rexmit_q.push_back(pkt_it->second); // Only a ref-counted pointer copy (constant time).
3379 }
3380 sock->m_snd_rexmit_q_size += cong_ctl_dropped_pkts;
3381 }
3382 // else { Just drop it. }
3383
3384 // Update our image of the pipe. For efficiency we use clear() instead of doing it one-by-one above.
3385
3386 // Update byte count.
3387 snd_flying_pkts_updated(sock, snd_flying_pkts_by_when.newest(), snd_flying_pkts_by_when.past_oldest(), false);
3388 snd_flying_pkts_by_when.clear();
3389 snd_flying_pkts_by_seq.clear();
3390
3391 packet_marked_to_drop_or_drop_all = 0; // Means drop all.
3392 }
3393 else
3394 {
3395 // Get the packet that was sent before all the others.
3396 const Peer_socket::Sent_pkt_ordered_by_when_iter& oldest_pkt_it = prior(snd_flying_pkts_by_when.past_oldest());
3397 Peer_socket::Sent_packet::Ptr oldest_pkt = oldest_pkt_it->second;
3398
3399 cong_ctl_dropped_bytes = oldest_pkt->m_size;
3400 cong_ctl_dropped_pkts = 1;
3401
3402 // Queue it for retransmission, to be sent as soon as CWND provides enough space (could even be immediately).
3403 if (rexmit_on)
3404 {
3405 if (!ok_to_rexmit_or_close(sock, oldest_pkt_it, false)) // Ensure not too many retransmissions already.
3406 // ^-- false <= Same as comment above.
3407 {
3408 return; // Already closed/logged/etc.
3409 }
3410 // else
3411
3412 sock->m_snd_rexmit_q.push_back(oldest_pkt); // Only a ref-counted pointer copy (constant time).
3413 ++sock->m_snd_rexmit_q_size;
3414 }
3415 // else { Just drop it. }
3416
3417 // Remember it short-term for the Drop_timer consolidated book-keeping below...
3418 packet_marked_to_drop_or_drop_all = oldest_pkt->m_sent_when.back().m_order_num;
3419
3420 // ...and in fact mark that packet Dropped (update our image of the pipe).
3421 snd_flying_pkts_erase_one(sock, oldest_pkt_it);
3422 }
3423
3424 /* Deal with congestion control. For introduction to the general topic see the large comment
3425 * near the top of handle_accumulated_acks().
3426 *
3427 * Since a Drop Timeout implies a large loss event, the congestion control module must be
3428 * informed. It may adjust the congestion window (used in can_send() and controlling how many
3429 * packets we are allowed to have In-flight at a time), probably downward.
3430 *
3431 * Also, this is a new loss event. Why? (For detailed explanation of what a loss event is, and
3432 * how we keep track of them, see that large comment in handle_accumulated_acks(). It
3433 * may be required to understand the rest of this paragraph.) Certainly this Drop is part of some
3434 * loss event by definition, but is it a new loss event, or merely the previous one (if such
3435 * exists)? Well, a Drop Timeout is, in practice, at least 1 second (which is likely 4 times a
3436 * pretty large RTT of 250 msec) and can also be estimated to be 3 * SRTT. In other words it is
3437 * probably much larger than SRTT, and certainly is at least a little larger than SRTT. Therefore
3438 * most likely any packet(s) Dropped by this DTO were sent after the last loss event (if any) was
3439 * detected. Hence this DTO event is a new loss event. We could explicitly check for this, but
3440 * it seems unnecessarily complex and intuitively unnecessary.
3441 *
3442 * Per handle_accumulated_acks(), when a new loss event is seen, m_snd_last_loss_event_when
3443 * is set to NOW. */
3444
3445 // @todo Arguable if it should be INFO or TRACE. We'll see.
3446 FLOW_LOG_INFO("cong_ctl [" << sock << "] update: Drop Timeout event: "
3447 "Dropped [" << cong_ctl_dropped_bytes << "] bytes = [" << cong_ctl_dropped_pkts << "] packets.");
3448
3449 // MUST call this after, not before, updating m_snd_flying_{packets|bytes} per method doc.
3450 sock->m_snd_cong_ctl->on_drop_timeout(cong_ctl_dropped_bytes, cong_ctl_dropped_pkts);
3451 sock->m_snd_last_loss_event_when = Fine_clock::now();
3452
3453 // Register that there was a timeout, and that bytes were converted from In-flight to Dropped.
3454 sock->m_snd_stats.drop_timeout();
3455 sock->m_snd_stats.dropped_data(cong_ctl_dropped_bytes, cong_ctl_dropped_pkts);
3456
3457 // Now log the "after."
3458 log_snd_window(sock);
3459
3460 // Since we've changed snd_flying_pkts*, Drop_timer events have occurred. Cleanly handle them all in one go.
3461
3462 const Drop_timer::Ptr drop_timer = sock->m_snd_drop_timer;
3463 drop_timer->start_contemporaneous_events();
3464
3465 /* Handle possible effect of above activities on the Drop Timer. (It may get disabled or restarted anew.)
3466 * Why not just do this right when we erase the associated packets from snd_flying_pkts*? We don't want to
3467 * trigger disruptive behavior like possibly retransmitting everything in the middle of all that accounting
3468 * which is not yet complete. Now it's complete, so it's the right time to handle this.
3469 *
3470 * Recall that snd_flying_pkts* have been updated and no longer contain the associated packet(s)'s info. */
3471 if (packet_marked_to_drop_or_drop_all == 0)
3472 {
3473 // Note that this is equivalent to calling ...packet_no_longer_in_flight(P) for all P -- just faster.
3474 drop_timer->on_no_packets_in_flight_any_longer();
3475 }
3476 else // if (packet_marked_to_drop_or_drop_all refers to, in fact, a specific packet)
3477 {
3478 drop_timer->on_packet_no_longer_in_flight(packet_marked_to_drop_or_drop_all);
3479 /* Could also call on_no_packets_in_flight_any_longer() if now none is In-flight, but performance-wise that'd
3480 * be the same; ...packet_no_longer_in_flight() will check the same condition anyway. So don't bother. */
3481 }
3482
3483 drop_timer->end_contemporaneous_events();
3484
3485 /* We've definitely reduced the number of packets we consider In-flight. We may also have added
3486 * packets to retransmission queue (if retransmission is on). Therefore can_send() may now return
3487 * true while at the beginning of the method it returned false; snd_deqable() may now return true
3488 * similarly. So have send_worker() check and send more if possible. See Node::send() for
3489 * discussion of overall strategy on this topic. */
3490 if ((!could_send_before_drops) || (rexmit_on && (!had_rexmit_data_before_drops)))
3491 {
3492 send_worker(sock, false);
3493 // ^-- defer_delta_check == false: for similar reason as in send_worker_check_state() calling send_worker().
3494 }
3495} // Node::drop_timer_action()
3496
3498{
3499 using std::min;
3500 using std::max;
3501 using boost::ratio;
3502 using boost::ratio_subtract;
3503 using boost::ratio_string;
3504 using boost::chrono::round;
3505 using boost::chrono::milliseconds;
3506 using boost::chrono::microseconds;
3507 using boost::chrono::seconds;
3508
3509 // We are in thread W.
3510
3511 // For brevity and a bit of speed:
3512 Fine_duration& srtt = sock->m_snd_smoothed_round_trip_time;
3513 Fine_duration& rtt_var = sock->m_round_trip_time_variance;
3514 Fine_duration& dto = sock->m_snd_drop_timeout;
3515 const Fine_duration& rtt = round_trip_time;
3516
3517 /* An ACK has supplied the given round_trip_time for a specific packet. We are to update the
3518 * smoothed RTT for the socket which is an estimate for the smooth "current" RTT for the socket.
3519 * Use RFC 6298 algorithm for SRTT calculation.
3520 *
3521 * RFC 6298 specifies the formula in "seconds." Of course it need not be seconds; it can be any
3522 * unit. We leave the unit we use unspecified, except to say that we will use the unit of
3523 * Fine_duration, which is the duration type of Fine_clock, which is the highest-resolution clock
3524 * available in the OS/hardware. Since, where possible, we keep using Fine_duration without
3525 * truncation to compute round_trip_time, assuming we don't introduce any unit conversions
3526 * (truncations, roundings) in the below code, the SRTT will maintain those units as well.
3527 * boost::chrono::duration will specifically cause compile failures if we don't explicitly specify
3528 * every truncation-inducing operation (duration_cast<>, round<>, etc.).
3529 *
3530 * BTW, this "unspecified" unit is probably nanoseconds.
3531 *
3532 * Note that the units used do NOT guarantee any particular clock granularity. E.g., I can give
3533 * you the time in milliseconds, but if I always say it in multiples of 1000 milliseconds, then I
3534 * may be working with milliseconds, but the resolution is 1 sec. */
3535
3536 if (srtt == Fine_duration::zero())
3537 {
3538 // First RTT measurement; initialize according to algorithm.
3539 srtt = rtt;
3540 rtt_var = rtt / 2;
3541
3542 // Truncate results to millisecond representation for readability.
3543 FLOW_LOG_TRACE("First SRTT calculation for [" << sock << "]: "
3544 "srtt = [" << round<milliseconds>(srtt) << " = " << srtt << "]; "
3545 "rtt_var = [" << round<milliseconds>(rtt_var) << " = " << rtt_var << "]; "
3546 "rtt = [" << rtt << "].");
3547 }
3548 else // if (SRTT was defined before this sample.)
3549 {
3550 // Subsequent RTT measurements.
3551
3552 // @todo Per last paragraph of RFC 6298-5, we MAY want to clear srtt/rtt_var after multiple RTOs or maybe idleness.
3553 // (RTO = Retransmission Timeout, though we call it a Drop Timeout more accurately [we don't necessarily
3554 // retransmit on loss in NetFlow, unlike TCP].)
3555
3556 const Fine_duration prev_srtt = srtt;
3557 const Fine_duration prev_rtt_var = rtt_var;
3558
3559 /* Reason I used ratio<> instead of floating point constants: I don't want to use floating
3560 * points in production code that much. I don't necessarily trust it for consistent behavior across platforms...
3561 * and in general I just find integers more predictable/easier to reason about in most contexts of net_flow.
3562 * Reason I used ratio<> instead of just having separate integer constants for numerators and
3563 * denominators: I'd rather have ratio<> do the arithmetic for me (at compile time to boot!). */
3564 using Alpha = ratio<1, 8>; // 1/8, per RFC.
3565 using One_minus_alpha = ratio_subtract<ratio<1>, Alpha>;
3566 using Beta = ratio<1, 4>; // 1/4, per RFC.
3567 using One_minus_beta = ratio_subtract<ratio<1>, Beta>;
3568 // Now I can use X::num and X::den, such that X is the ratio X::num/X::den.
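 // E.g., One_minus_alpha above evaluates (at compile time) to the ratio 7/8: One_minus_alpha::num == 7,
 // One_minus_alpha::den == 8; so `srtt * One_minus_alpha::num / One_minus_alpha::den` stays in integer
 // (duration-representation) arithmetic with no floating point involved.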
3569
3570 // Compute |srtt - rtt|.
3571 Fine_duration abs_srtt_minus_rtt = srtt - rtt;
3572 if (abs_srtt_minus_rtt.count() < 0)
3573 {
3574 abs_srtt_minus_rtt = -abs_srtt_minus_rtt;
3575 }
3576
3577 // Update the results per RFC.
3578 rtt_var
3579 = rtt_var * One_minus_beta::num / One_minus_beta::den
3580 + abs_srtt_minus_rtt * Beta::num / Beta::den;
3581 srtt
3582 = srtt * One_minus_alpha::num / One_minus_alpha::den
3583 + rtt * Alpha::num / Alpha::den;
3584
3585 // Truncate results to millisecond representation for readability.
3586 FLOW_LOG_TRACE("Next SRTT calculation for [" << sock << "]: "
3587 "srtt = [" << round<milliseconds>(srtt) << " = " << srtt << "]; "
3588 "rtt_var = [" << round<milliseconds>(rtt_var) << " = " << rtt_var << "]; "
3589 "rtt = [" << rtt << "]; "
3590 "prev_srtt = [" << prev_srtt << "]; "
3591 "prev_rtt_var = [" << prev_rtt_var << "]; "
3592 "alpha = " << (ratio_string<Alpha, char>::prefix()) << "; "
3593 "(1 - alpha) = " << (ratio_string<One_minus_alpha, char>::prefix()) << "; "
3594 "beta = " << (ratio_string<Beta, char>::prefix()) << "; "
3595 "(1 - beta) = " << (ratio_string<One_minus_beta, char>::prefix()) << "; "
3596 "|srtt - rtt| = [" << abs_srtt_minus_rtt << "].");
3597 } // else if (SRTT was defined before this sample)
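 /* Worked example of the update above (illustrative numbers only): with prev_srtt = 200 ms, prev_rtt_var = 50 ms,
 * and a new sample rtt = 280 ms: |srtt - rtt| = 80 ms;
 *   rtt_var = (3/4) * 50 ms + (1/4) * 80 ms = 37.5 ms + 20 ms = 57.5 ms;
 *   srtt    = (7/8) * 200 ms + (1/8) * 280 ms = 175 ms + 35 ms = 210 ms.
 * (In the actual code the arithmetic is carried out in Fine_duration's integer units, not milliseconds.) */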
3598
3599 /* Now compute Drop Timeout (DTO), similar to TCP's RTO (Retransmission Timeout): the minimum
3600 * amount of time we give an In-flight packet to get Acknowledged before considering it Dropped.
3601 * Again we use RFC 6298 for DTO computation.
3602 *
3603 * The formula is DTO = srtt + max(G, K * rtt_var), where K = 4 and G is the "clock
3604 * granularity." Additionally, we are to put a floor of 1 second on DTO. Finally, we are allowed
3605 * to put a ceiling on DTO, as long as that ceiling is at least 60 seconds.
3606 *
3607 * G plays an important part in the RTO calculation algorithm, so we must know it. So what is it?
3608 * We don't know. We do however have a reasonably conservative upper bound; boost.timer
3609 * documentation lists some popular OS+CPU combinations and notes that for none of them does
3610 * high_resolution_timer exceed 5 microseconds. Therefore, let us pick the exceedingly
3611 * conservative G = 500 microseconds = 1/2 millisecond. */
3612
3613 const Fine_duration clock_resolution_at_least = microseconds(500);
3614 const Fine_duration floor = seconds(1);
3615 const Fine_duration ceiling = sock->opt(sock->m_opts.m_dyn_drop_timeout_ceiling);
3616 const unsigned int k = 4;
3617
3618 const Fine_duration prev_dto = dto;
3619 const Fine_duration rtt_var_k = rtt_var * k;
3620 const Fine_duration srtt_plus_var_term = srtt + max(clock_resolution_at_least, rtt_var_k);
3621 dto = max(srtt_plus_var_term, floor);
3622 dto = min(dto, ceiling);
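 /* Worked example (continuing the illustrative numbers above): with srtt = 210 ms and rtt_var = 57.5 ms,
 * rtt_var * k = 230 ms > G = 0.5 ms, so srtt + max(G, rtt_var * k) = 440 ms; the 1-second floor then raises
 * that to dto = 1 s (and, assuming the configured ceiling is >= 1 s, the ceiling does not kick in). */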
3623
3624 // Truncate results to millisecond representation for readability.
3625 FLOW_LOG_TRACE("Drop Timeout (DTO) calculation: "
3626 "dto = [" << round<milliseconds>(dto) << " = " << dto << "]; "
3627 "rtt_var * k = [" << rtt_var_k << "]; "
3628 "srtt + max(G, rtt_var * k) = [" << srtt_plus_var_term << "]; "
3629 "k = [" << k << "]; "
3630 "floor = [" << floor << "]; ceiling = [" << ceiling << "]; "
3631 "clock_resolution = [" << clock_resolution_at_least << "]; "
3632 "prev_dto = [" << prev_dto << "].");
3633} // void Node::new_round_trip_time_sample()
3634
3635void Node::log_snd_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging) const
3636{
3637 using std::vector;
3638 using std::list;
3639 using std::string;
3640 using boost::algorithm::join;
3641 using boost::prior;
3643 using std::flush;
3644
3645 // We're in thread W.
3646
3647 // For brevity and a little speed:
3648 const auto& snd_flying_pkts_by_seq = sock->m_snd_flying_pkts_by_seq_num;
3649 const auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3650 const size_t num_flying_pkts = snd_flying_pkts_by_seq.size();
3651
3652 // force_verbose_info_logging => log the most detail, as INFO (if INFO logging enabled).
3653
3654 if (snd_flying_pkts_by_seq.empty())
3655 {
3656 // No In-flight packets, so this is brief enough for TRACE as opposed to DATA.
3657 FLOW_LOG_WITH_CHECKING(force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_TRACE,
3658 "Send window state for [" << sock << "]: cong_wnd "
3659 "[" << sock->bytes_blocks_str(sock->m_snd_cong_ctl->congestion_window_bytes()) << "]; "
3660 "sent+acked/dropped "
3661 "[" << sock->m_snd_init_seq_num << ", " << sock->m_snd_next_seq_num << ") "
3662 "unsent [" << sock->m_snd_next_seq_num << ", ...).");
3663 return;
3664 }
3665 // else
3666
3667 auto const logger_ptr = get_logger();
3668 if (((!logger_ptr) || (!logger_ptr->should_log(log::Sev::S_DATA, get_log_component()))) &&
3669 (!(force_verbose_info_logging && logger_ptr->should_log(log::Sev::S_INFO, get_log_component()))))
3670 {
3671 // Can't print entire In-flight data structure, but can print a summary, if TRACE enabled.
3673 ("Send window state for [" << sock << "]: cong_wnd "
3674 "[" << sock->bytes_blocks_str(sock->m_snd_cong_ctl->congestion_window_bytes()) << "]; "
3675 "sent+acked/dropped [" << sock->m_snd_init_seq_num << ", " << snd_flying_pkts_by_seq.begin()->first << ") "
3676 "in-flight [" << sock->m_snd_flying_bytes << "] bytes: " << num_flying_pkts << ":{...} "
3677 "unsent [" << sock->m_snd_next_seq_num << ", ...).");
3678 return;
3679 }
3680 // else
3681
3682 // Very verbose and slow!
3683
3684 const bool rexmit_on = sock->rexmit_on();
3685
3686 vector<string> pkt_strs;
3687 pkt_strs.reserve(num_flying_pkts);
3688 for (Peer_socket::Sent_pkt_ordered_by_seq_const_iter pkt_it_it = snd_flying_pkts_by_seq.begin();
3689 pkt_it_it != snd_flying_pkts_by_seq.end();
3690 ++pkt_it_it)
3691 {
3692 Sequence_number start, end;
3693 get_seq_num_range(pkt_it_it->second, &start, &end);
3694
3695 Peer_socket::Sent_packet::Const_ptr sent_pkt = pkt_it_it->second->second;
3696
3697 String_ostream pkt_str_os;
3698 pkt_str_os.os() << '[' << start;
3699 if (rexmit_on)
3700 {
3701 pkt_str_os.os() << '[' << int(sent_pkt->m_packet->m_rexmit_id) << '/' << sent_pkt->m_sent_when.back().m_order_num
3702 << "], ";
3703 }
3704 else
3705 {
3706 pkt_str_os.os() << ", ";
3707 }
3708 pkt_str_os.os() << end << ")<" << sent_pkt->m_acks_after_me << "acks" << flush;
3709
3710 pkt_strs.push_back(pkt_str_os.str());
3711 }
3712
3714 (force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_DATA,
3715 "Send window state for [" << sock << "]: cong_wnd "
3716 "[" << sock->bytes_blocks_str(sock->m_snd_cong_ctl->congestion_window_bytes()) << "]; "
3717 "sent+acked/dropped [" << sock->m_snd_init_seq_num << ", " << snd_flying_pkts_by_seq.begin()->first << ") "
3718 "in-flight "
3719 "[" << sock->m_snd_flying_bytes << "] bytes: " << num_flying_pkts << ":{" << join(pkt_strs, " ") <<
3720 "} unsent [" << sock->m_snd_next_seq_num << ", ...).");
3721
3722 if (!rexmit_on)
3723 {
3724 return;
3725 }
3726 // else
3727
3728 // Since retransmission is on, also useful to show the packets sorted by when they were sent.
3729
3730 vector<string> pkt_strs_time;
3731 pkt_strs_time.reserve(num_flying_pkts);
3732 // Note: `auto` is deliberately avoided here purely for clarity (to make explicit that this is a reverse iterator; hence also why a range-`for` isn't used).
3733 for (Peer_socket::Sent_pkt_by_sent_when_map::Const_reverse_iterator pkt_it = snd_flying_pkts_by_when.const_oldest();
3734 pkt_it != snd_flying_pkts_by_when.const_past_newest();
3735 ++pkt_it)
3736 {
3737 Sequence_number start, end;
3738 // The forward iterator F pointing to same list element as reverse iterator R is prior(R.base()) [sic]. Google it.
3739 get_seq_num_range(prior(pkt_it.base()), &start, &end);
3740
3741 Peer_socket::Sent_packet::Const_ptr sent_pkt = pkt_it->second;
3742
3743 string pkt_str;
3745 start, '[', int(sent_pkt->m_packet->m_rexmit_id), '/',
3746 sent_pkt->m_sent_when.back().m_order_num, "], ", end, ")<",
3747 sent_pkt->m_acks_after_me, "acks");
3748 pkt_strs_time.push_back(pkt_str);
3749 }
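 /* Standalone refresher on the prior(R.base()) idiom used in the loop above (a sketch only, not project code):
 *   std::list<int> l{1, 2, 3};
 *   auto r = std::next(l.rbegin());    // Reverse iterator denoting element 2 (walk order: 3, 2, 1).
 *   auto f = boost::prior(r.base());   // Forward iterator to that same element 2.
 * A reverse_iterator's base() refers to the element one position past the one it denotes, so prior(base())
 * recovers the matching forward iterator -- which APIs like get_seq_num_range() above expect. */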
3750
3751 // Log it only if it is different (only possible if some retransmitted packets are actually involved).
3752 if (pkt_strs_time != pkt_strs)
3753 {
3755 (force_verbose_info_logging ? log::Sev::S_INFO : log::Sev::S_DATA,
3756 "Sorted by time sent: {" << join(pkt_strs_time, " ") << "}.");
3757 }
3758} // Node::log_snd_window()
3759
3761{
3762 using boost::prior;
3763
3764 const Peer_socket::Sent_pkt_by_seq_num_map& flying_packets = sock->m_snd_flying_pkts_by_seq_num;
3765 if (flying_packets.empty())
3766 {
3767 return Sequence_number(); // Default value. Less than all others.
3768 }
3769 // else
3770
3771 // Get the sequence number of the first datum in the last unhandled packet.
3772 const Peer_socket::Sent_pkt_by_seq_num_map::value_type& highest_val = *(prior(flying_packets.end()));
3773 Sequence_number seq_num = highest_val.first;
3774
3775 // Advance just past the data in that packet to get what we want.
3776 advance_seq_num(&seq_num, highest_val.second->second->m_size);
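 // E.g. (illustrative): if the last In-flight packet covers sequence numbers [1000, 1500) -- first datum 1000,
 // m_size 500 -- then seq_num is advanced from 1000 to 1500, the first number past all In-flight data.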
3777
3778 return seq_num;
3779}
3780
3782{
3783 // using boost::next; // Still ambiguous for some reason (in clang at least).
3784
3785 auto const logger_ptr = get_logger();
3786 if (logger_ptr && logger_ptr->should_log(log::Sev::S_TRACE, get_log_component()))
3787 {
3788 const Peer_socket::Sent_packet& sent_pkt = *pkt_it->second;
3789 const Peer_socket::order_num_t order_num = sent_pkt.m_sent_when.back().m_order_num;
3790 Sequence_number seq_num, seq_num_end;
3791 get_seq_num_range(pkt_it, &seq_num, &seq_num_end);
3792
3793 if (sock->rexmit_on())
3794 {
3796 ("On [" << sock << "] erasing packet [" << seq_num << ", " << seq_num_end << ") "
3797 "order_num [" << order_num << "] rexmit_id [" << int(sent_pkt.m_packet->m_rexmit_id) << "] from "
3798 "snd_flying_pkts* and friends.");
3799 }
3800 else
3801 {
3803 ("On [" << sock << "] erasing packet [" << seq_num << ", " << seq_num_end << ") "
3804 "order_num [" << order_num << "] from snd_flying_pkts* and friends.");
3805 }
3806 }
3807
3808 // Update byte count.
3809 snd_flying_pkts_updated(sock, pkt_it, boost::next(pkt_it), false);
3810
3811 // Finally erase from main structures.
3812 sock->m_snd_flying_pkts_by_seq_num.erase(pkt_it->first);
3813 sock->m_snd_flying_pkts_by_sent_when.erase(pkt_it);
3814
3815 // Note: As advertised, we do NOT inform sock->m_snd_drop_timer. It is up to the caller to do the right thing there.
3816}
3817
3819 const Sequence_number& seq_num,
3821{
3822 using std::pair;
3823 using std::make_pair;
3824 // using boost::next; // Still ambiguous for some reason (in clang at least).
3825
3826 // For brevity and a bit of speed:
3827 auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3828
3829#ifndef NDEBUG
3830 const auto insert_result =
3831#endif
3832 snd_flying_pkts_by_when.insert(make_pair(seq_num, sent_pkt));
3833
3834 // In this map, last added (a/k/a last sent) packet = first in the ordering!
3835 const Peer_socket::Sent_pkt_ordered_by_when_iter& pkt_it = snd_flying_pkts_by_when.begin();
3836 assert(insert_result.second); // Sequence numbers must not repeat ever.
3837 assert(insert_result.first == pkt_it); // Check that just-inserted element is ordered at the start.
3838
3839 snd_flying_pkts_updated(sock, pkt_it, boost::next(pkt_it), true); // Update byte count.
3840
3841 // Accordingly, insert packet (in the form of iterator into the above map) into sequence-number-ordered "scoreboard."
3842#ifndef NDEBUG
3843 const auto insert_result_by_seq =
3844#endif
3845 sock->m_snd_flying_pkts_by_seq_num.insert(make_pair(seq_num, pkt_it));
3846
3847 // Check invariant: Key X is in ..._by_sent_when <=> key X is in ..._by_seq_num.
3848 assert(insert_result_by_seq.second);
3849
3850 /* Caution: As noted in the doc header for this method, note that while we've already inserted sent_pkt into
3851 * snd_flying_pkts_by_when, the actual value of sent_pkt->m_sent_when.back() -- the absolute "when" -- isn't ready.
3852 * It will only be finalized once we actually send off the packet (after pacing, if any), in mark_data_packet_sent().
3853 * Nevertheless, we know the packet will be sent sometime fairly soon; and in fact AFTER all the packets
3854 * following it in snd_flying_pkts_by_when's iterator ordering and in fact BEFORE any packets that
3855 * would be subsequently ahead of it in snd_flying_pkts_by_when's iterator ordering. That is, we can
3856 * place it there now, despite not knowing the _absolute_ time when it will be sent, because we are confident about
3857 * its _relative_ order of when it will be sent vs. all the other packets in that structure, past or future. */
3858
3859 // Everything following this point is logging only.
3860
3861 auto const logger_ptr = get_logger();
3862 if ((!logger_ptr) || (!logger_ptr->should_log(log::Sev::S_TRACE, get_log_component())))
3863 {
3864 return;
3865 }
3866 // else
3867
3868 Sequence_number seq_num_end;
3869 get_seq_num_range(pkt_it, 0, &seq_num_end);
3870 if (sock->rexmit_on())
3871 {
3873 ("On [" << sock << "] pushing packet [" << seq_num << ", " << seq_num_end << ") "
3874 "rexmit_id [" << int(sent_pkt->m_packet->m_rexmit_id) << "] onto snd_flying_pkts and friends.");
3875 }
3876 else
3877 {
3879 ("On [" << sock << "] pushing packet [" << seq_num << ", " << seq_num_end << ") "
3880 "onto snd_flying_pkts and friends.");
3881 }
3882}
3883
3887 bool added)
3888{
3889 // We are in thread W.
3890
3891 if (pkt_begin == pkt_end)
3892 {
3893 return; // Wouldn't do anything anyway, but return here to avoid logging.
3894 }
3895
3896 // For brevity and a bit of speed:
3897 const auto& snd_flying_pkts_by_when = sock->m_snd_flying_pkts_by_sent_when;
3898 size_t& snd_flying_bytes = sock->m_snd_flying_bytes;
3899
3900 // Optimization for when they effectively clear() snd_flying_pkts* (e.g., possibly on Drop Timeout):
3901 if ((!added)
3902 && (pkt_begin == snd_flying_pkts_by_when.const_newest())
3903 && (pkt_end == snd_flying_pkts_by_when.const_past_oldest()))
3904 {
3905 snd_flying_bytes = 0;
3906 }
3907 else
3908 {
3909 size_t delta_bytes = 0;
3910 for ( ; pkt_begin != pkt_end; ++pkt_begin)
3911 {
3912 delta_bytes += pkt_begin->second->m_size;
3913 }
3914 added ? (snd_flying_bytes += delta_bytes) : (snd_flying_bytes -= delta_bytes);
3915 }
3916
3917 FLOW_LOG_TRACE("cong_ctl [" << sock << "] update: "
3918 "In-flight [" << sock->bytes_blocks_str(snd_flying_bytes) << "].");
3919}
3920
3923 bool defer_delta_check)
3924{
3925 const Peer_socket::Sent_packet& pkt = *pkt_it->second;
3926
3927 Sequence_number seq_num, seq_num_end;
3928 get_seq_num_range(pkt_it, &seq_num, &seq_num_end);
3929
3930 const unsigned int rexmit_id = pkt.m_packet->m_rexmit_id;
3931 FLOW_LOG_TRACE("On [" << sock << "] attempting to queue for retransmission "
3932 "[" << seq_num << ", " << seq_num_end << "] which has been "
3933 "retransmitted [" << rexmit_id << "] times so far.");
3934 if (rexmit_id == sock->opt(sock->m_opts.m_st_max_rexmissions_per_packet))
3935 {
3936 rst_and_close_connection_immediately(socket_id(sock), sock,
3938 return false;
3939 }
3940 // else
3941 return true;
3942}
3943
3945 const Peer_socket_options* opts)
3946{
3947 return connect_with_metadata(to, boost::asio::buffer(&S_DEFAULT_CONN_METADATA, sizeof(S_DEFAULT_CONN_METADATA)),
3948 err_code, opts);
3949}
3950
3952 const boost::asio::const_buffer& serialized_metadata,
3953 Error_code* err_code,
3954 const Peer_socket_options* sock_opts)
3955{
3956 namespace bind_ns = util::bind_ns;
3958 bind_ns::cref(to), bind_ns::cref(serialized_metadata), _1, sock_opts);
3959 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
3960
3961 namespace bind_ns = util::bind_ns;
3964 using bind_ns::bind;
3965
3966 // We are in thread U != W.
3967
3968 if (!running())
3969 {
3971 return Peer_socket::Ptr();
3972 }
3973 // else
3974
3975 // If it's good enough for DATA packets, it's good enough for metadata in SYN.
3976 if (serialized_metadata.size() > max_block_size())
3977 {
3979 return Peer_socket::Ptr();
3980 }
3981
3982 /* Put the rest of the work into thread W. For justification, see big comment in listen().
3983 * Addendum regarding performance: connect() is probably called more frequently than listen(), but
3984 * I doubt the performance impact is serious even so. send() and receive() might be a different
3985 * story. */
3986
3987 Peer_socket::Ptr sock;
3988 /* Load this->connect_worker(...) onto thread W boost.asio work queue.
3989 * We don't return until it finishes; therefore it is fine to capture everything by reference (`[&]`). */
3990 asio_exec_ctx_post(get_logger(), &m_task_engine, Synchronicity::S_ASYNC_AND_AWAIT_CONCURRENT_COMPLETION,
3991 [&]() { connect_worker(to, serialized_metadata, sock_opts, &sock); });
3992 // If got here, the task has completed in thread W and signaled us to that effect.
3993
3994 // connect_worker() indicates success or failure through this data member.
3995 if (sock->m_disconnect_cause)
3996 {
3997 *err_code = sock->m_disconnect_cause;
3998 return Peer_socket::Ptr(); // sock will go out of scope and thus will be destroyed.
3999 }
4000 // else
4001 err_code->clear();
4002 return sock;
4003} // Node::connect_with_metadata()
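/* Usage sketch for the active-connect path above (illustrative only; `node`, `srv`, and `my_metadata_blob` are
 * hypothetical: a running Node, a Remote_endpoint, and a small user-supplied buffer, respectively):
 *   Error_code err_code;
 *   Peer_socket::Ptr sock = node.connect(srv, &err_code); // Returns without waiting for the handshake to finish.
 *   // Or, to deliver app-level data inside the SYN (it must fit within max_block_size()):
 *   //   sock = node.connect_with_metadata(srv, boost::asio::buffer(my_metadata_blob), &err_code);
 *   if (!sock) { ...handle err_code (e.g., Node not running)... }
 *   // Otherwise the socket is S_OPEN+S_CONNECTING until the handshake completes; sync_connect*() below can block for that. */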
4004
4005void Node::connect_worker(const Remote_endpoint& to, const boost::asio::const_buffer& serialized_metadata,
4006 const Peer_socket_options* sock_opts,
4007 Peer_socket::Ptr* sock_ptr)
4008{
4009 using boost::asio::buffer;
4010 using boost::asio::ip::address;
4011
4012 assert(sock_ptr);
4013
4014 // We are in thread W. connect() is waiting for us to set *sock_ptr and return.
4015
4016 // Create new socket and set all members that may be immediately accessed by user in thread U after we're done.
4017
4018 auto& sock = *sock_ptr;
4019 if (sock_opts)
4020 {
4021 /* They provided custom per-socket options. Before we give those to the new socket, let's
4022 * validate them (for proper values and internal consistency, etc.). */
4023
4024 Error_code err_code;
4025 const bool opts_ok = sock_validate_options(*sock_opts, 0, &err_code);
4026
4027 // Due to the advertised interface of the current method, we must create a socket even on error.
4028 sock.reset(sock_create(*sock_opts));
4029
4030 // Now report error if indeed options were invalid. err_code is already set and logged in that case.
4031 if (!opts_ok)
4032 {
4033 sock->m_disconnect_cause = err_code;
4034 return;
4035 }
4036 // else
4037 }
4038 else
4039 {
4040 /* More typically, they did not provide per-socket options. So we just pass our global
4041 * template for the per-socket options to the Peer_socket constructor. The only caveat is
4042 * that template may be concurrently changed, so we must lock it. Could do it with opt(), but
4043 * that introduces an extra copy of the entire struct, so just do it explicitly.
4044 *
4045 * Note: no need to validate; global options (including per-socket ones) are validated
4046 * elsewhere when set. */
4047 Peer_socket* sock_non_ptr;
4048 {
4050 sock_non_ptr = sock_create(m_opts.m_dyn_sock_opts);
4051 }
4052 sock.reset(sock_non_ptr);
4053 }
4054
4055 // Socket created; set members.
4056
4057 sock->m_active_connect = true;
4058 sock->m_node = this;
4060 sock->m_remote_endpoint = to;
4061 // Will be sent in SYN to be deserialized by user on the other side. Save here if we must retransmit SYN.
4062 sock->m_serialized_metadata.assign_copy(serialized_metadata);
4063
4064 /* Initialize the connection's send bandwidth estimator (object that estimates available
4065 * outgoing bandwidth based on incoming acknowledgments). It may be used by m_snd_cong_ctl,
4066 * depending on the strategy chosen, but may be useful in its own right. Hence it's a separate
4067 * object, not inside *m_snd_cong_ctl. */
4068 sock->m_snd_bandwidth_estimator.reset(new Send_bandwidth_estimator(get_logger(), sock));
4069
4070 // Initialize the connection's congestion control strategy based on the configured strategy.
4071 sock->m_snd_cong_ctl.reset
4072 (Congestion_control_selector::create_strategy(sock->m_opts.m_st_cong_ctl_strategy, get_logger(), sock));
4073 // ^-- No need to use opt() yet: user doesn't have socket and cannot set_options() on it yet.
4074
4075 /* Tweak: If they specify the "any" IP address as the destination (which means any interface on
4076 * this machine), response traffic will look as though it's coming from the loopback IP address,
4077 * or another specific IP address -- not "any." Thus it will not be able to be properly
4078 * demultiplexed to this socket, since that will be saved at the "any" address in our data
4079 * structures. So that's an error. */
4080 bool ip_addr_any_error = false;
4081 const address& addr = to.m_udp_endpoint.address(); // Short-hand.
4082 if (addr.is_v4())
4083 {
4084 if (addr.to_v4() == util::Ip_address_v4::any())
4085 {
4086 ip_addr_any_error = true;
4087 }
4088 }
4089 else if (addr.is_v6())
4090 {
4091 if (addr.to_v6() == util::Ip_address_v6::any())
4092 {
4093 ip_addr_any_error = true;
4094 }
4095 }
4096 // else a new version of IP! Yay!
4097 if (ip_addr_any_error)
4098 {
4099 // Mark/log error.
4100 Error_code* err_code = &sock->m_disconnect_cause;
4102 return;
4103 }
4104 // else
4105
4106 // Allocate ephemeral local port.
4107
4108 sock->m_local_port = m_ports.reserve_ephemeral_port(&sock->m_disconnect_cause);
4109 if (sock->m_local_port == S_PORT_ANY)
4110 {
4111 // Error already logged and is in sock->m_disconnect_cause.
4112 return;
4113 }
4114 // else
4115
4116 const Socket_id socket_id = Node::socket_id(sock);
4117 FLOW_LOG_INFO("NetFlow worker thread starting active-connect of [" << sock << "].");
4118
4119 if (util::key_exists(m_socks, socket_id))
4120 {
4121 /* This is an active connect (we're initiating the connection). Therefore in particular it
4122 * should be impossible that our local_port() equals an already existing connection's
4123 * local_port(); Port_space is supposed to prevent the same ephemeral port from being handed out
4124 * to more than one connection. Therefore this must be a programming error. */
4125
4126 FLOW_LOG_WARNING("Cannot add [" << sock << "], because such a connection already exists. "
4127 "This is an ephemeral port collision and "
4128 "constitutes either a bug or an extremely unlikely condition.");
4129
4130 // Mark/log error.
4131 Error_code* err_code = &sock->m_disconnect_cause;
4133
4134 // Return port.
4135 Error_code return_err_code;
4136 m_ports.return_port(sock->m_local_port, &return_err_code);
4137 assert(!return_err_code);
4138
4139 return;
4140 } // if (that socket pair already exists)
4141 // else
4142
4143 /* Try the packet send just below again if SYN not acknowledged within a certain amount of time.
4144 * Give up if that happens too many times. Why do this BEFORE sending packet? Because
4145 * this can fail, in which case we don't want a weird situation where we've sent
4146 * the packet but failed to start the retransmit/timeout timers.
4147 * Update: It can no longer fail, so that reasoning is N/A. Not moving, though, because it's still fine here. */
4148 setup_connection_timers(socket_id, sock, true);
4149
4150 /* Initial Sequence Number (ISN) (save before create_syn() uses it).
4151 * Remember it in case we must retransmit the SYN. (m_snd_next_seq_num may have been further increased by then.) */
4152 Sequence_number& init_seq_num = sock->m_snd_init_seq_num;
4153 init_seq_num = m_seq_num_generator.generate_init_seq_num();
4154 /* Setting this now ensures ALL subsequent copies (essentially, every single Sequence_number on this socket's
4155 * local data number line!) will have the same nice metadata (hence nice logging) too.
4156 * The `+ 1` nuance is explained in class Sequence_number doc header, *Metadata* section. */
4157 init_seq_num.set_metadata('L', init_seq_num + 1, sock->max_block_size());
4158 // Sequence number of first bit of actual data.
4159 sock->m_snd_next_seq_num = init_seq_num + 1;
4160
4161 // Make a SYN packet to send.
4162 auto syn = create_syn(sock);
4163
4164 // Fill out common fields and asynchronously send packet.
4165 if (!async_sock_low_lvl_packet_send_paced(sock,
4167 &sock->m_disconnect_cause))
4168 {
4169 // Error marked and logged already.
4170
4171 // Return port.
4172 Error_code return_err_code;
4173 m_ports.return_port(sock->m_local_port, &return_err_code);
4174 assert(!return_err_code);
4175
4176 // Cancel any timers set up above.
4177 cancel_timers(sock);
4178
4179 return;
4180 }
4181 /* send will happen asynchronously, and the registered completion handler will execute in this
4182 * thread when done (NO SOONER than this method finishes executing). */
4183
4184 // No more errors: map the socket pair to the socket data structure (kind of analogous to a TCP net-stack's TCB structure).
4185 m_socks[socket_id] = sock;
4186
4187 // CLOSED -> SYN_SENT.
4188 sock_set_int_state(sock, Peer_socket::Int_state::S_SYN_SENT);
4189} // Node::connect_worker()
4190
4192 const Peer_socket_options* sock_opts)
4193{
4194 return sync_connect_with_metadata(to, Fine_duration::max(),
4195 boost::asio::buffer(&S_DEFAULT_CONN_METADATA, sizeof(S_DEFAULT_CONN_METADATA)),
4196 err_code, sock_opts);
4197}
4198
4200 const boost::asio::const_buffer& serialized_metadata,
4201 Error_code* err_code, const Peer_socket_options* opts)
4202{
4203 return sync_connect_with_metadata(to, Fine_duration::max(), serialized_metadata, err_code, opts);
4204}
4205
4207 const boost::asio::const_buffer& serialized_metadata,
4208 Error_code* err_code, const Peer_socket_options* sock_opts)
4209{
4210 namespace bind_ns = util::bind_ns;
4212 bind_ns::cref(to), bind_ns::cref(max_wait), bind_ns::cref(serialized_metadata),
4213 _1, sock_opts);
4214 // ^-- Call ourselves and return if err_code is null. If got to present line, err_code is not null.
4215
4216 using util::bind_ns::bind;
4217
4218 // We are in thread U != W.
4219
4220 /* This is actually pretty simple. All we want to do is connect(), which is non-blocking, and
4221 * then block until the connection is ready (at least according to our side). Ready means that
4222 * the socket is Writable (since user has no access to the socket yet, nothing can be loading
4223 * data onto the Send buffer, and obviously the congestion window is clear, so it must be
4224 * Writable). Note that, like BSD sockets, we specifically don't consider a socket Writable
4225 * until in ESTABLISHED internal state. */
4226
4227 /* For the "block until Writable" part, create and load the Event_set. Do this before connect(),
4228 * so that if it fails we don't have to then clean up the socket before returning error to user. */
4229
4230 const Event_set::Ptr event_set = event_set_create(err_code);
4231 if (!event_set)
4232 {
4233 assert(*err_code == error::Code::S_NODE_NOT_RUNNING);
4234 return Peer_socket::Ptr(); // *err_code is set.
4235 }
4236 // Now we know Node is running(); and we have event_set.
4237
4238 // We must clean up event_set at any return point below.
4239 Error_code dummy_prevents_throw;
4240 util::Auto_cleanup event_set_cleanup = util::setup_auto_cleanup([&]()
4241 {
4242 // Eat any error when closing Event_set, as it's unlikely and not interesting to user.
4243 event_set->close(&dummy_prevents_throw);
4244 });
4245
4246 const auto sock = connect_with_metadata(to, serialized_metadata, err_code, sock_opts);
4247 if (!sock)
4248 {
4249 return sock; // *err_code is set. It's probably some user error like an invalid destination.
4250 }
4251 // else we have a socket that has started connecting.
4252
4253 /* We must clean up sock (call sock->close_abruptly(&dummy_prevents_throw)) at any return point (including
4254 * exception throw) below, EXCEPT the success case. Because of the latter, we can't use the
4255 * auto_cleanup trick we used on event_set. So, we'll just have to handle sock cleanup
4256 * manually. */
4257
4258 // Add the one event about which we care.
4259 bool result = event_set->add_wanted_socket<Peer_socket>(sock, Event_set::Event_type::S_PEER_SOCKET_WRITABLE,
4260 &dummy_prevents_throw);
4261 assert(result); // Node is running, so there's no way that should have failed.
4262
4263 // Wait for Writable.
4264 result = event_set->sync_wait(max_wait, err_code);
4265 if (!result)
4266 {
4267 if (*err_code == error::Code::S_EVENT_SET_CLOSED)
4268 {
4269 // It's unlikely, but I guess someone could have destroyed Node during the wait (we do allow that during sleep).
4271 }
4272 else
4273 {
4274 // This is quite common and is analogous to POSIX's EINTR semantics (signal interrupted the blocking call).
4275 assert(*err_code == error::Code::S_WAIT_INTERRUPTED);
4276 }
4277
4278 // Clean up (as discussed above).
4279 sock->close_abruptly(&dummy_prevents_throw); // Eat any error; user doesn't care.
4280 return Peer_socket::Ptr(); // *err_code is set.
4281 } // if (sync_wait() failed)
4282 // else we know event_set is still open, and sync_wait() succeeded.
4283
4284 // OK; either that returned 1 event, or 0 events (timeout).
4285 const bool ready = event_set->events_detected(err_code);
4286 /* Node had not been destroyed by the time sync_wait() finished, and we don't allow simultaneous
4287 * ~Node() outside a blocking sleep (see notes in class Node doc header). The only way this
4288 * failed is if Event_set was closed, and that could only happen if Node was destroyed. */
4289 assert(!*err_code);
4290
4291 if (ready)
4292 {
4293 /* Didn't time out; socket is Writable. However, that does not mean it's Writable for "good"
4294 * reasons. If an error was encountered since the original non-blocking connect (e.g., RST
4295 * received; or handshake timeout expired), then it is now Writable, but any operation like
4296 * send() or receive() will immediately yield an error. If that is the case,
4297 * close_connection_immediately() has set user-visible state to S_CLOSED. So let's check for
4298 * it and return an error in that case.
4299 *
4300 * We could also skip this check: pretend the socket is ready and let the user discover the error when trying to
4301 * transmit. However, it seems like a good property to help the user out. */
4302
4303 if (sock->state() == Peer_socket::State::S_CLOSED)
4304 {
4305 // No need to cleanup socket; it is already closed.
4306
4307 // Return error as above.
4308 *err_code = sock->m_disconnect_cause; // No need to lock; m_disconnect_cause set and can't change later.
4309 return Peer_socket::Ptr();
4310 }
4311 // else it's probably really ready for action.
4312
4313 return sock; // *err_code is success.
4314 }
4315 // else
4316
4317 // Timed out! Clean up socket, as above, and return null with a specific error (as advertised).
4318 sock->close_abruptly(&dummy_prevents_throw);
4320 return Peer_socket::Ptr();
4321} // Node::sync_connect_impl()
4322
4323void Node::setup_connection_timers(const Socket_id& socket_id, Peer_socket::Ptr sock, bool initial)
4324{
4327 using boost::chrono::microseconds;
4328 using boost::chrono::duration_cast;
4329 using boost::weak_ptr;
4330
4331 // We are in thread W.
4332
4333 Fine_duration rexmit_from_now = sock->opt(sock->m_opts.m_st_connect_retransmit_period);
4334
4335 // Finalize the retransmit scheduled task firing time; and update the # retries statistic.
4336 if (!initial)
4337 {
4338 assert(scheduled_task_fired(get_logger(), sock->m_init_rexmit_scheduled_task));
4339
4340 ++sock->m_init_rexmit_count;
4341 /* This is a bit more precise than leaving rexmit_from_now alone, as it counts from when firing was
4342 * actually scheduled, vs. when the timer was actually triggered by boost.asio. The 2nd addend should be a bit
4343 * negative and thus decrease rexmit_from_now a bit. */
4344 rexmit_from_now += scheduled_task_fires_from_now_or_canceled(get_logger(), sock->m_init_rexmit_scheduled_task);
4345 /* @todo RFC 6298 mandates that this must be doubled after each attempt instead of keeping
4346 * the same value. Doesn't mean we should follow it. */
4347 }
4348
4349 // Firing time is set; start timer. Call that body when task fires, unless it is first canceled.
4350 sock->m_init_rexmit_scheduled_task
4351 = schedule_task_from_now(get_logger(), rexmit_from_now, true, &m_task_engine,
4352 [this, socket_id,
4353 sock_observer = weak_ptr<Peer_socket>(sock)]
4354 (bool)
4355 {
4356 auto sock = sock_observer.lock();
4357 if (sock)
4358 {
4359 handle_connection_rexmit_timer_event(socket_id, sock);
4360 }
4361 // else { Possible or not, allow for this possibility for maintainability. }
4362 });
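 /* Note on the weak_ptr capture above (explanatory; the same pattern is used for the timeout task below): holding only
 * a weak_ptr in the scheduled task keeps the task from extending sock's lifetime; lock() then yields null -- and the
 * task body becomes a no-op -- if the Peer_socket has been destroyed by the time the task fires. */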
4363
4364 // Also set up the timeout that will stop these retries from happening.
4365 if (initial)
4366 {
4367 sock->m_connection_timeout_scheduled_task
4369 sock->opt(sock->m_opts.m_st_connect_retransmit_timeout),
4370 true, &m_task_engine,
4371 [this, socket_id,
4372 sock_observer = weak_ptr<Peer_socket>(sock)]
4373 (bool)
4374 {
4375 // We are in thread W.
4376
4377 auto sock = sock_observer.lock();
4378 if (!sock)
4379 {
4380 return; // Possible or not, allow for this possibility for maintainability.
4381 }
4382 // else
4383
4384 FLOW_LOG_INFO("Connection handshake timeout timer [" << sock << "] has been triggered; was on "
4385 "attempt [" << (sock->m_init_rexmit_count + 1) << "].");
4386
4387 assert((sock->m_int_state == Peer_socket::Int_state::S_SYN_SENT)
4388 || (sock->m_int_state == Peer_socket::Int_state::S_SYN_RCVD));
4389
4390 // Timeout. Give up. Send RST, in case they do come to their senses -- but it's too late for us.
4391
4392 /* Close connection in our structures and inform user. Pre-conditions
4393 * assumed by call: sock in m_socks and sock->state() == S_OPEN (yes, since m_int_state ==
4394 * S_SYN_SENT/RCVD); err_code contains the reason for the close (yes). */
4395 rst_and_close_connection_immediately(socket_id, sock, error::Code::S_CONN_TIMEOUT, false);
4396 /* ^-- defer_delta_check == false: for similar reason as when calling send_worker() from
4397 * send_worker_check_state(). */
4398 });
4399 } // if (initial)
4400} // Node::setup_connection_timers()
4401
4403{
4404 using util::Blob;
4405
4406 // We are in thread W.
4407
4408 assert((sock->m_int_state == Peer_socket::Int_state::S_SYN_SENT)
4409 || (sock->m_int_state == Peer_socket::Int_state::S_SYN_RCVD));
4410
4411 // Not an error (so not WARNING), but it's rare and interesting enough for INFO.
4412 FLOW_LOG_INFO("Connection handshake retransmit timer [" << sock << "] triggered; was on "
4413 "attempt [" << (sock->m_init_rexmit_count + 1) << "].");
4414
4415 // Try again. Reproduce the SYN or SYN_ACK... but first set up the next timer.
4416
4417 // Setup the next timer before sending packet for the same reason as in the original SYN/SYN_ACK-sending code.
4418 setup_connection_timers(socket_id, sock, false);
4419
4420 /* Send packet.
4421 * @todo More code reuse? Or save the serialized version inside socket and resend here verbatim? */
4422
4423 Low_lvl_packet::Ptr re_syn_base;
4424 if (sock->m_active_connect)
4425 {
4426 auto syn = create_syn(sock);
4427 re_syn_base = Low_lvl_packet::ptr_cast(syn);
4428 }
4429 else
4430 {
4431 // (Subtlety: As of this writing it wouldn't have changed since original SYN_ACK, but safe>sorry.)
4432 sock->m_rcv_last_sent_rcv_wnd = sock_rcv_wnd(sock);
4433
4434 auto syn_ack = create_syn_ack(sock);
4435 re_syn_base = Low_lvl_packet::ptr_cast(syn_ack);
4436 }
4437
4438 // Fill out common fields and asynchronously send packet.
4439 if (!async_sock_low_lvl_packet_send_or_close_immediately(sock, std::move(re_syn_base), false))
4440 {
4441 /* ^-- defer_delta_check == false: for similar reason as when calling send_worker() from
4442 * send_worker_check_state(). */
4443 return;
4444 }
4445} // Node::handle_connection_rexmit_timer_event()
4446
4448{
4451
4452 // We are in thread W.
4453
4454 /* Cancel any timers. Note that this will NOT prevent a given timer's handler from running.
4455 * It will try to make it run ASAP with operation_aborted error code. However, it may not even
4456 * succeed in that. In particular, if by the time the current handler started the timer handler
4457 * event was already queued inside m_task_engine, then canceling the timer now will not load
4458 * operation_aborted into the handler call; it will instead fire as if the timer really expired
4459 * (which it did). Therefore the timer handler should be careful to check the state of the socket
4460 * and exit if the state is not suitable (in this case, S_CLOSED).
4461 *
4462 * Even so, try to cancel with operation_aborted just to cut down on entropy a bit (at least by
4463 * executing all handlers ASAP).
4464 *
4465 * Update: scheduled_task_cancel(), however, will indeed cleanly cancel. Plain `Timer`s are still in direct use
4466 * as well, though, so the above still applies to some of the below. */
4467
4468 sock->m_rcv_delayed_ack_timer.cancel();
4469 sock->m_snd_pacing_data.m_slice_timer.cancel();
4470
4471 if (sock->m_init_rexmit_scheduled_task)
4472 {
4473 scheduled_task_cancel(get_logger(), sock->m_init_rexmit_scheduled_task);
4474 sock->m_init_rexmit_scheduled_task = Scheduled_task_handle();
4475 }
4476 if (sock->m_connection_timeout_scheduled_task)
4477 {
4478 scheduled_task_cancel(get_logger(), sock->m_connection_timeout_scheduled_task);
4479 sock->m_connection_timeout_scheduled_task = Scheduled_task_handle();
4480 }
4481 if (sock->m_rcv_in_rcv_wnd_recovery)
4482 {
4483 scheduled_task_cancel(get_logger(), sock->m_rcv_wnd_recovery_scheduled_task);
4484 sock->m_rcv_in_rcv_wnd_recovery = false;
4485 }
4486
4487 if (sock->m_snd_drop_timer)
4488 {
4489 // This Drop_timer guy actually will prevent any callbacks from firing.
4490 sock->m_snd_drop_timer->done();
4491
4492 /* The two `shared_ptr`s (sock and m_snd_drop_timer) point to each other. Nullify this to break the cycle
4493 * and thus avoid memory leak. */
4494 sock->m_snd_drop_timer.reset();
4495 }
4496}
4497
4499{
4500 sock->m_snd_drop_timeout = sock->opt(sock->m_opts.m_st_init_drop_timeout);
4501
4502 const auto on_fail = [this, socket_id, sock](const Error_code& err_code)
4503 {
4504 rst_and_close_connection_immediately(socket_id, sock, err_code, false);
4505 // ^-- defer_delta_check == false: for similar reason as when calling send_worker() from send_worker_check_state().
4506 };
4507 const auto on_timer = [this, socket_id, sock](bool drop_all_packets)
4508 {
4509 drop_timer_action(sock, drop_all_packets);
4510 };
4511
4512 /* Set up the Drop Timer. Basically give it some key fields of sock (DTO value, the In-flight
4513 * queue) and the callbacks to call when events occur, such as the Drop Timer expiring.
4514 * Additionally, when events m_snd_drop_timer wants to know about happen, we will call
4515 * m_snd_drop_timer->on_...(). */
4516 sock->m_snd_drop_timer = Drop_timer::create_drop_timer(get_logger(), &m_task_engine, &sock->m_snd_drop_timeout,
4517 Peer_socket::Ptr(sock), on_fail, on_timer);
4518}
4519
4520size_t Node::send(Peer_socket::Ptr sock,
4521 const Function<size_t (size_t max_data_size)>& snd_buf_feed_func,
4522 Error_code* err_code)
4523{
4524 using boost::asio::post;
4525
4526 /* We are in user thread U != W.
4527 * It's important to keep that in mind in this method. In particular, it is absolutely unsafe to
4528 * access m_int_state, which belongs solely to thread W and is never locked. */
4529
4530 // IMPORTANT: The logic here must be consistent with sock_is_writable().
4531
4532 if (!running())
4533 {
4535 return 0;
4536 }
4537 // else
4538
4539 // Pre-condition is that m_mutex is locked already. So EVERYTHING that can be locked, is, including the buffers.
4540
4541 // Pre-condition.
4542 assert(sock->m_state == Peer_socket::State::S_OPEN); // Locked.
4543
4544 if (sock->m_disconnect_cause) // Locked.
4545 {
4546 // Error has been recorded, and we're not CLOSED => we are DISCONNECTING.
4547 assert(sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING);
4548
4549 /* Disconnection is underway. Adding more data to the Send buffer is pointless; we
4550 * don't allow more data to be queued to be sent after an error (though existing buffered data
4551 * may yet be sent... but that's not relevant here). @todo No graceful close yet. */
4552
4553 // Mark in *err_code and log.
4554 FLOW_ERROR_EMIT_ERROR_LOG_INFO(sock->m_disconnect_cause);
4555 return 0;
4556 }
4557 // else
4558
4559 // No fatal error (socket is not disconnecting or closed). However, it may still be connecting.
4560
4561 if (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTING)
4562 {
4563 /* Here we draw a line in the sand and refuse to buffer any data. We could easily allow
4564 * buffering data even when still S_CONNECTING. However, I am copying BSD socket semantics
4565 * here, as they do seem to be useful. As a user I don't want to think I've "sent" gobs of data
4566 * while there's little to suggest that there's even anyone listening on the other side. */
4567 err_code->clear();
4568 return 0;
4569 }
4570 // else
4571 assert(sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTED);
4572
4573 const bool was_deqable = snd_deqable(sock); // See below.
4574
4575 /* Write the user-provided data into m_snd_buf; provide the missing argument (max_data_size).
4576 * Round up to a multiple of max-block-size to ensure we never fragment a max-block-size-sized
4577 * chunk of data when they're using unreliable mode! */
4578 const size_t sent = snd_buf_feed_func(sock->max_block_size_multiple(sock->m_opts.m_st_snd_buf_max_size));
4579
4580 // Register that the Send buffer possibly grew.
4581 sock->m_snd_stats.buffer_fed(sock->m_snd_buf.data_size());
4582
4583 /* We've done the minimal thing send() does: added data to the send buffer. Now we may need to
4584 * kick off the actual asynchronous sending of some of these data by thread W. It's important to
4585 * discuss the overall strategy for how that works.
4586 *
4587 * Key question: how does W send low-level packets over UDP? Answer: if there's anything on the
4588 * Send buffer or retransmission queue (if retransmission is enabled), and there is no other
4589 * (congestion control, probably) reason NOT to send packets, then dequeue a packet from
4590 * retransmission queue or Send buffer and send it off to the UDP layer; repeat in a tight loop
4591 * until both Send queues are empty, or there's some reason NOT to send packets (again, congestion
4592 * control). Let's write this in pseudo-code:
4593 *
4594 * DEQ(sock): // Thread W only.
4595 * if (!sendable(sock)):
4596 * return // Slight optimization; perform this first check before locking.
4597 * lock sock // Must lock because sock->m_snd_buf accessible from other threads.
4598 * while (sendable(sock) && deqable(sock)):
4599 * dequeue sock->m_snd_buf -> block
4600 * serialize block into packet
4601 * send packet via UDP
4602 * unlock sock
4603 *
4604 * sendable(sock):
4605 * return <...probably some congestion control condition involving CWND or something>
4606 *
4607 * deqable(sock):
4608 * return !(sock->m_rexmit_q.empty() && sock->m_snd_buf.empty())
4609 *
4610 * When should DEQ(sock) execute? Answer: whenever sendable(sock) and deqable(sock) are true. If
4611 * they're true, but DEQ(sock) doesn't run for time period P, then it's practically like adding
4612 * sleep(P) from the user's point of view. So how do we get DEQ(sock) to execute as soon as those
4613 * conditions are true? Well, running it repeatedly in a thread W tight loop would do it, but
4614 * obviously that's unacceptable.
4615 *
4616 * So consider the initial state after sock enters ESTABLISHED state. sendable(sock) is true;
4617 * deqable(sock) is false. The moment deqable(sock) becomes true, we should execute DEQ(sock); in
4618 * other words in the first sock->send(), as that will add to m_snd_buf. After DEQ(sock) exits,
4619 * there's no need to call DEQ(sock) until again both conditions are true. Therefore, the
4620 * algorithm is: whenever sendable(sock) goes from false to true, and/or deqable(sock) from false
4621 * to true, call DEQ(sock). If inside DEQ(sock) one of the conditions is still false, it will
4622 * quickly return. (Call the latter a NOOP.)
4623 *
4624 * Now we must come up with a scheme that will ensure DEQ(sock) will run very quickly after either
4625 * condition (sendable(sock), deqable(sock)) becomes true; and that will not peg the CPU.
4626 *
4627 * Consider sendable(). Only thread W (transport layer) can determine this value: it depends on
4628 * wholly internal details like packets in-flight and CWND. Therefore sendable(sock) can go
4629 * false->true only in W. Hence W, whenever changing any component that might affect
4630 * sendable(sock) would do:
4631 *
4632 * // ... Something related to sendable(sock) has changed....
4633 * DEQ(sock) // So check and send if possible.
4634 *
4635 * Clearly this calls DEQ(sock) as soon as humanly possible after sendable(sock) becomes true.
4636 * Clearly it wastes no CPU cycles either. OK.
4637 *
4638 * Now consider deqable(). sock->m_snd_buf can only change from empty to non-empty in the
4639 * previous statement (snd_buf_feed_func()). That is in thread U != W. Suppose we write:
4640 *
4641 * SEND(sock, blocks): // Non-W threads only.
4642 * lock sock // Must lock because sock->m_snd_buf accessible from other threads.
4643 * add blocks -> sock->m_snd_buf
4644 * if (sock->m_snd_buf was empty before previous statement)
4645 * // Queue DEQ(sock) for asynchronous execution on thread W as soon as it's free:
4646 * post(W, DEQ(sock))
4647 * unlock sock
4648 *
4649 * Does this call DEQ(sock) as soon as deqable(sock) becomes true? Well, DEQ(sock) can only run
4650 * on thread W, and the enqueuing of blocks can only happen on thread U, and post() will cause
4651 * DEQ(sock) to run as soon as possible. Therefore that's as good as it can be. Is it correct,
4652 * however? The mainstream case is that once "unlock sock" finished in SEND(), thread W will get
4653 * some free time, execute the just-queued DEQ(), and thus everything works out. OK so far.
4654 *
4655 * Since, however, post() is (obviously) asynchronous and done from thread non-W, there is
4656 * potential for other tomfoolery. First consider competing SEND() calls from other threads.
4657 * Because of locking, they will be entirely sequential even from different threads and thus can
4658 * be considered as all in one thread U != W. Now suppose SEND() placed DEQ() onto W, and another
4659 * SEND() executes before DEQ() executes on W. No problem: since only DEQ() can dequeue the Send
4660 * buffer, and the 1st SEND() made the buffer non-empty, the 2nd SEND() will not affect the DEQ()
4661 * situation, since it cannot make m_snd_buf become non-empty after being empty (was already
4662 * non-empty).
4663 *
4664 * Second consider SEND(sock, blocks) executing while a W handler is executing. Now suppose this
4665 * W handler discovers that sendable() may be affected and thus calls DEQ(sock) as shown above;
4666 * meanwhile SEND() posts DEQ(sock) onto W as well. W will wait until SEND(sock, blocks) exits
4667 * (due to the lock) before executing most of DEQ(sock), but when it does it will be ITS DEQ(sock)
4668 * that executes first (regardless of whether the post from thread U happened first). This
4669 * DEQ(sock) will not be a NOOP, which is great. Now, thread W should exit that handler and
4670 * finally execute SEND()'s posted DEQ() -- which will be a NOOP, because the synchronous
4671 * DEQ(sock) from thread W preempted it.
4672 *
4673 * Is this OK? Most likely. It'll spend some extra CPU cycles on the check in the NOOP, but
4674 * that's it. Now, there is some conceivable way that, maybe, such NOOPs could happen a lot in a
4675 * very busy system and perhaps even "bunch" up to peg the CPU. However, after doing many thought
4676 * experiments, I was unable to come up with anything actually worrying.
4677 *
4678 * The other way deqable(sock) can become true is if m_rexmit_q was empty but becomes non-empty.
4679 * In other words, if we detect packet as Dropped, we will have added it (if retransmission is on)
4680 * to m_rexmit_q. This can only happen on thread W and thus is handled similarly to
4681 * sendable(sock):
4682 *
4683 * // ... Something related to deqable(sock) has changed....
4684 * DEQ(sock) // So check and send if possible.
4685 *
4686 * So this system should be OK. Now let's map the above pseudocode to actual code.
4687 *
4688 * SEND(sock, blocks) is the very method you're reading now (Peer_socket::send() and
4689 * Node::send(), runs in thread U != W). DEQ(sock) is Node::send_worker(sock) (runs in thread
4690 * W). sendable(sock) is Node::can_send(sock). deqable(sock) is Node::snd_deqable(sock).
4691 * post(W, f) is post(Node::m_task_engine, f).
4692 *
4693 * OK, there is one more small caveat. If DEQ(sock) is placed onto W by SEND(sock, blocks),
4694 * then before this DEQ() is executed, thread W may change the state of sock (for example, close
4695 * it). Therefore, DEQ() must also ensure it's operating in a state where it can send data
4696 * (ESTABLISHED at least), and if not, NOOP. Of course if DEQ() is executed synchronously by W,
4697 * then this is unnecessary (since W code wouldn't execute DEQ() directly unless already in a
4698 * proper state for this). So, send_worker_check_state() is actually a little bit more than just
4699 * DEQ(), while send_worker() is just DEQ(). send() posts send_worker_check_state(), while
4700 * thread W executes send_worker() directly. */
4701
4702 if ((!was_deqable) && (sent != 0))
4703 {
4704 // Possibly send_worker() can send packets now (send buffer went from empty to not).
4705 post(m_task_engine, [this, sock]() { send_worker_check_state(sock); });
4706 }
4707
4708 err_code->clear();
4709 return sent;
4710 // Note that sock->m_mutex is unlocked here (and send_worker() will lock it again when it [probably soon] executes).
4711} // Node::send()
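/* The SEND/DEQ hand-off described in the long comment above boils down to: user threads append to a
 * mutex-protected buffer and post() the drain step onto the single worker thread only on the
 * empty -> non-empty transition, while worker-side events simply call the drain step directly. A
 * minimal, self-contained sketch of that pattern follows, using plain boost.asio and std types; all
 * names (`buf`, `drain`, `send`, `w`) are illustrative, not Flow APIs.
 *
 *   #include <boost/asio.hpp>
 *   #include <deque>
 *   #include <iostream>
 *   #include <mutex>
 *   #include <string>
 *   #include <thread>
 *   #include <utility>
 *
 *   int main()
 *   {
 *     boost::asio::io_context w; // Stand-in for thread W's task engine.
 *     auto work_guard = boost::asio::make_work_guard(w);
 *     std::thread w_thread([&]() { w.run(); });
 *
 *     std::mutex mtx;
 *     std::deque<std::string> buf; // Stand-in for the Send buffer.
 *
 *     auto drain = [&]() // DEQ(): intended to run only on the worker thread.
 *     {
 *       std::lock_guard<std::mutex> lock(mtx);
 *       while (!buf.empty()) // Real code would also check a can_send()-like condition.
 *       {
 *         std::cout << "sending: " << buf.front() << "\n";
 *         buf.pop_front();
 *       }
 *     };
 *
 *     auto send = [&](std::string blk) // SEND(): runs on any user thread.
 *     {
 *       std::lock_guard<std::mutex> lock(mtx);
 *       const bool was_empty = buf.empty();
 *       buf.push_back(std::move(blk));
 *       if (was_empty) { boost::asio::post(w, drain); } // Only the empty -> non-empty edge posts work.
 *     };
 *
 *     send("block-1");
 *     send("block-2"); // If the first drain has not run yet, no extra post is needed; otherwise this edge posts again.
 *
 *     work_guard.reset();
 *     w_thread.join();
 *     return 0;
 *   }
 */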
4712
4713bool Node::sock_is_writable(const boost::any& sock_as_any) const
4714{
4715 using boost::any_cast;
4716
4717 const Peer_socket::Const_ptr sock = any_cast<Peer_socket::Ptr>(sock_as_any);
4718
4719 Peer_socket::Lock_guard lock(sock->m_mutex); // Many threads can access/write below state.
4720
4721 /* Our task here is to return true if and only if at this very moment calling sock->send() would
4722 * yield either a return value of > 0 OR a non-success *err_code. In other words, send() would
4723 * return "something." This is used for Event_set machinery.
4724 *
4725 * This should mirror send()'s algorithm. @todo Should send() call this, for code reuse?
4726 * Maybe/maybe not. Consider performance when deciding.
4727 *
4728 * - If state is CLOSED, then some sort of error/terminating condition occurred, so send()
4729 * would return 0 and non-success Error_code == sock->m_disconnect_cause. (Writable.)
4730 * - Otherwise, if state is OPEN+DISCONNECTING, then graceful close (@todo implement it) is
4731 * underway; we do not allow more data to be sent (except what's already in Sent buffer), so
4732 * send() would return 0 and non-success Error_code == sock->m_disconnect_cause.
4733 * (Writable.)
4734 * - Otherwise, if state is OPEN+CONNECTED, and there is Send buffer space, send() would return >
4735 * 0 and no error. (Writable.)
4736 * - The other remaining possibilities:
4737 * - OPEN+CONNECTED but no Send buffer space (returns 0, no error). (Not Writable.)
4738 * - OPEN+CONNECTING -- we don't allow accumulating data in Send buffer (returns 0, no error).
4739 * (Not Writable.) */
4740
4741 return (sock->m_state == Peer_socket::State::S_CLOSED)
4742 || (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING)
4743 || ((sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTED)
4744 && snd_buf_enqable(sock));
4745} // Node::sock_is_writable()
4746
4747void Node::send_worker_check_state(Peer_socket::Ptr sock)
4748{
4749 // See big comment block in Node::send() first.
4750
4751 // We are in thread W.
4752
4753 /* This method can be thought of as the chunk of the finite state machine that defines what
4754 * happens when the "user called send, adding at least 1 block to the send buffer" (@todo: or any data
4755 * at all, if in reliable mode?) event is defined. Therefore, we will have a switch() that will handle every
4756 * state and decide what should happen when that event fires in that state.
4757 *
4758 * send() placed us onto thread W. When send() did so, m_int_state (which it was not allowed to
4759 * check, as only thread W can access it) was at least ESTABLISHED (since state was
4760 * S_OPEN+S_CONNECTED, ensured via assert()). Therefore, we can eliminate several states with
4761 * assert()s: SYN_SENT, SYN_RCVD. */
4762
4763 switch (sock->m_int_state)
4764 {
4765 case Peer_socket::Int_state::S_ESTABLISHED:
4766 // Mainstream case.
4767 send_worker(sock, false);
4768 /* ^-- defer_delta_check == false: because we were invoked from thread U != W, we are NOT
4769 * invoked from async_low_lvl_recv(). Therefore, we will NOT perform
4770 * event_set_all_check_delta(false) before the boost.asio handler exits. Therefore boost.asio
4771 * may sleep (block) before event_set_all_check_delta(false). Therefore that would delay
4772 * delivery of the Writable event to the user. Therefore force the delta check immediately.
4773 * See Node::m_sock_events doc header for details. */
4774 break;
4775 case Peer_socket::Int_state::S_CLOSED:
4776 // Unlikely but legitimate.
4777 FLOW_LOG_INFO('[' << sock << "] "
4778 "in state [" << sock->m_int_state << "] "
4779 "closed before asynchronous send_worker() could proceed.");
4780 break;
4781 case Peer_socket::Int_state::S_SYN_SENT:
4782 case Peer_socket::Int_state::S_SYN_RCVD:
4783 // Crash. See above reasoning.
4784 FLOW_LOG_WARNING('[' << sock << "] "
4785 "in state [" << sock->m_int_state << "] "
4786 "somehow had send() called on it.");
4787 assert(false);
4788 break;
4789 } // switch (sock->m_int_state)
4790} // Node::send_worker_check_state()
4791
4792void Node::send_worker(Peer_socket::Ptr sock, bool defer_delta_check)
4793{
4794 using boost::asio::buffer;
4795 using boost::next;
4796 using boost::ratio;
4797 using boost::ratio_string;
4798 using boost::chrono::milliseconds;
4799 using boost::chrono::round;
4800 using boost::shared_ptr;
4801 using std::list;
4802
4803 // We are in thread W.
4804
4805 // See big comment block in Node::send() first.
4806
4807 // Pre-condition.
4808 assert(sock->m_int_state == Peer_socket::Int_state::S_ESTABLISHED);
4809
4810 /* We are about to potentially send a bunch of DATA packets. Before sending a given packet, we
4811 * will call can_send() which will ask the congestion control module whether there is space in
4812 * what it thinks is the available pipe and return true if so (as well as check rcv_wnd, ensuring
4813 * the receiver's Receive buffer can handle the data once they arrive). However, how it answers
4814 * that question depends on the size of the pipe (m_snd_cong_ctl->congestion_window_bytes(),
4815 * a/k/a CWND). Many (most?) congestion control modules will want to reduce CWND when a
4816 * connection has been idle -- not sending anything, due to no data to be sent in Send buffer --
4817 * for a while. Thus we must call m_snd_cong_ctl->on_idle_timeout() if we've hit Idle Timeout.
4818 *
4819 * The definition of Idle Timeout we use is from TCP RFC 5681-4.1 (and DCCP CCID 2 RFC 4341-5.1).
4820 * It's simple: Idle Timeout is DTO (Drop Timeout) time units since a DATA packet has been last
4821 * sent. While I basically grasp the intuition behind it (if a DTO since even the last-sent
4822 * packet has expired, and no retransmission/further transmission has occurred, then there must
4823 * have been no more data for a while), I can't quite prove to myself that it's exactly right,
4824 * mostly due to the fact that DTO may change over time. It's probably right though, as RFC 4341
4825 * recommends it, even though that protocol is closer to NetFlow than TCP (full selective ACKs).
4826 * Anyway, if we see too many false Idle timeouts, revisit this.
4827 *
4828 * Why check this now? Why not start a proper timer, each time packet is sent, instead and just
4829 * inform m_snd_cong_ctl when it fires? Answer: timer management is somewhat of a pain in the ass
4830 * (as you can see in our other various timers, such as m_snd_drop_timer). Here we have an opportunity
4831 * to simply check the condition and affect CWND right before CWND would be used anyway
4832 * (can_send()). It's simpler, and the performance impact is negligible (it's just a
4833 * Fine_clock::now() call and a comparison). You ask, why not do the same for other timers
4834 * then, in particular the Drop Timer? Answer: for Drop Timer, we really need to know exactly
4835 * when it fires, so that we can Drop In-flight packets right then and possibly send more
4836 * packets (among other things). In this case there is no such requirement; we only care about
4837 * whether the Idle Timeout has tripped when we're about to send something. */
4838
4839 /* To avoid a very close race between DTO and idle timeout, apply a slight factor of > 1 to DTO.
4840 * Using boost::ratio<> instead of a double or something for same reason as in
4841 * new_round_trip_time_sample(). */
4842 using Idle_timeout_dto_factor = ratio<110, 100>;
4843 const Fine_duration idle_timeout
4844 = sock->m_snd_drop_timeout * Idle_timeout_dto_factor::num / Idle_timeout_dto_factor::den;
4845 const Fine_duration since_last_send = Fine_clock::now() - sock->m_snd_last_data_sent_when;
4846
4847 if ((sock->m_snd_last_data_sent_when != Fine_time_pt()) && (since_last_send > idle_timeout))
4848 {
4849 // Arguable if this should be INFO or TRACE. We'll see.
4850 FLOW_LOG_INFO("Idle timeout triggered for [" << sock << "]; "
4851 "last activity [" << round<milliseconds>(since_last_send) << "] ago "
4852 "exceeds idle timeout [" << round<milliseconds>(idle_timeout) << "] "
4853 "= " << (ratio_string<Idle_timeout_dto_factor, char>::prefix()) << " x "
4854 "[" << round<milliseconds>(sock->m_snd_drop_timeout) << "].");
4855 sock->m_snd_cong_ctl->on_idle_timeout();
4856 sock->m_snd_stats.idle_timeout();
4857 }
4858
4859 /* Check networking conditions (presumably congestion control) and flow control (rcv_wnd).
4860 * Ideally this would always be true, but then we'd overwhelm the link when send() is invoked on
4861 * large amounts of data and/or repeatedly. */
4862 if (!can_send(sock))
4863 {
4864 FLOW_LOG_TRACE('[' << sock << "]: "
4865 "Initial check: can_send() is false.");
4866 return;
4867 }
4868 // else can send if there are data to send.
4869
4870 /* Didn't lock sock above, as can_send() depends only on internal state, which is accessed from
4871 * thread W only. This is an optimization to avoid thread contention (with non-W send()s) for the
4872 * lock in the case when congestion control is preventing sends.
4873 *
4874 * Have to lock now, for sock->m_snd_buf access (at least). */
4875
4876 const bool rexmit_on = sock->rexmit_on();
4877 bool writable; // See below.
4878 {
4879 Peer_socket::Lock_guard lock(sock->m_mutex);
4880
4881 // Check whether enough data in retransmission queue or snd_buf to send a packet.
4882 if (!snd_deqable(sock))
4883 {
4884 FLOW_LOG_TRACE('[' << sock << "]: "
4885 "Initial check: can_send() is true, but no data to send.");
4886 return;
4887 }
4888 // else can send >= 1 packet.
4889
4890 // For brevity and a bit of speed:
4891 Socket_buffer& snd_buf = sock->m_snd_buf;
4892 list<Peer_socket::Sent_packet::Ptr>& rexmit_q = sock->m_snd_rexmit_q;
4893 size_t& rexmit_q_size = sock->m_snd_rexmit_q_size;
4894 Sequence_number& snd_next_seq_num = sock->m_snd_next_seq_num;
4895
4896 // @todo Implement graceful close.
4897 assert(sock->m_open_sub_state != Peer_socket::Open_sub_state::S_DISCONNECTING);
4898
4899 FLOW_LOG_TRACE('[' << sock << "]: "
4900 "Initial check: Will send from rexmit queue of size [" << rexmit_q_size << "] and/or "
4901 "Send buffer with total size [" << snd_buf.data_size() << "].");
4902 // Very verbose and CPU-intensive!
4903 FLOW_LOG_DATA("Send buffer data = [\n" << snd_buf << "].");
4904
4905 // Send packets until one or both of can_send() and snd_deqable() become false.
4906 do
4907 {
4908 shared_ptr<Data_packet> data;
4909 Peer_socket::Sent_packet::Ptr sent_pkt;
4910 bool rexmit = false;
4911
4912 /* Record send time. It's only a temporary value for logging, until we
4913 * actually send packet. However, do generate the permanent m_order_num, which is unique. */
4914 Peer_socket::Sent_packet::Sent_when sent_when{ sock_get_new_snd_order_num(sock), Fine_clock::now(), 0 };
4915
4916 /* To provide the best experience on the receiving side, retransmit before sending new data,
4917 * so that Receive buffer on other side receives data as soon as possible. */
4918 if (rexmit_q.empty())
4919 {
4920 // Nothing in retransmission queue, so something is in Send buffer.
4921
4922 // Create low-level DATA packet.
4923 data = Low_lvl_packet::create_uninit_packet<Data_packet>(get_logger());
4924 data->m_rexmit_id = 0; // First (if retransmission is off, only) send attempt.
4925
4926 // Dequeue one block into the packet's data field.
4927
4928 /* Try to dequeue the head block directly into data.m_data. Because we are operating snd_buf
4929 * with block_size_hint == sock->max_block_size(); and because we don't send unless CWND
4930 * allows for at least max_block_size() bytes to be sent, the following should be a
4931 * constant-time operation (a swap of internal buffers) as opposed to a copy. */
4932 snd_buf.consume_buf_move(&data->m_data, sock->max_block_size());
4933
4934 // snd_deqable() returned true, so there must be at least one byte available.
4935 assert(!data->m_data.empty());
4936
4937 // Set sequence number; then advance the next sequence number variable for the next time we do this.
4938 data->m_seq_num = snd_next_seq_num;
4939 advance_seq_num(&snd_next_seq_num, data);
4940
4941 /* We are just about to send the packet. Assume it has been sent. It is not yet Acknowledged
4942 * and not yet Dropped. Therefore it is now In-flight. We should place its info at the back of
4943 * m_snd_flying_pkts_by_sent_when. We must maintain the invariant w/r/t that structure (see comment
4944 * for m_snd_flying_pkts_by_sent_when).
4945 *
4946 * Purpose of keeping these data: at least for comparison against Congestion Window,
4947 * for congestion control. */
4948
4949 // Guarantee that the new sequence number is > all the currently In-flight ones.
4950 assert(data->m_seq_num >= snd_past_last_flying_datum_seq_num(sock));
4951 /* Therefore we will add the following to the end of the map's ordering. Note we've
4952 * incremented m_snd_next_seq_num already, maintaining that member's invariant relationship
4953 * with m_snd_flying_pkts_by_sent_when. */
4954
4955 // New packet: create new metadata object. Record send time. (The latter will be rewritten later.)
4956 sent_pkt = Peer_socket::Sent_packet::Ptr(new Peer_socket::Sent_packet(rexmit_on, data, sent_when));
4957 }
4958 else // if (!rexmit_q.empty())
4959 {
4960 // Get packet and metadata from front of retransmission queue.
4961 rexmit = true;
4962 sent_pkt = rexmit_q.front();
4963
4964 --rexmit_q_size;
4965 rexmit_q.pop_front();
4966
4967 // We'd saved the packet we sent last time -- just need to update some things before resending.
4968 data = sent_pkt->m_packet;
4969
4970 // Retransmitting -- update retransmit count ID (used to match acks to the acked transmit attempt).
4971 ++data->m_rexmit_id;
4972
4973 // Record the send time of this newest attempt. (If pacing enabled this will be rewritten later.)
4974 sent_pkt->m_sent_when.push_back(sent_when);
4975
4976 // Chronologically, no packets sent after this one have been acked yet, as this packet is new.
4977 sent_pkt->m_acks_after_me = 0;
4978 }
4979
4980 /* Note: We have saved Fine_clock::now() as the send time of the packet. However, especially
4981 * if pacing is enabled, we want to record it at the time it is actually sent (pacing may
4982 * delay it). Even if pacing is disabled, CPU pegging may cause a delay in sending (although
4983 * whether that should "count" is a more philosophical question). With pacing, though, since
4984 * pacing spreads out packets over SRTT, and SRTT is measured based on
4985 * Sent_packet::m_sent_when, RTTs artificially become longer and longer if we record the send
4986 * time now. Anyway, this means m_sent_when.back() should be overwritten when the packet is
4987 * actually sent (which should be very soon, unless pacing is enabled).
4988 * See async_sock_low_lvl_packet_send_paced(). */
4989
4990 // data and sent_pkt are ready.
4991
4992 // Add to snd_flying_pkts* and friends; update byte counts.
4993 snd_flying_pkts_push_one(sock, data->m_seq_num, sent_pkt);
4994
4995 /* By adding to m_snd_flying_pkts_by_sent_when (i.e., increasing In-flight byte count), we may have
4996 * affected the result of can_send(). We do check it at the end of the while () body, so OK. */
4997
4998 // Fill out common fields and asynchronously send packet (packet pacing potentially performed inside).
5001 defer_delta_check))
5002 {
5003 return;
5004 }
5005
5006 sock->m_snd_stats.data_sent(data->m_data.size(), rexmit);
5007 }
5008 while (can_send(sock) && snd_deqable(sock)); // (there is CWND/rcv_wnd space; and either rexmittable or new data)
5009
5010 FLOW_LOG_TRACE('[' << sock << "]; connection [" << sock << "]: "
5011 "Final check: "
5012 "can_send() == [" << can_send(sock) << "]; "
5013 "snd_deqable() == [" << snd_deqable(sock) << "].");
5014
5015 writable = snd_buf_enqable(sock); // Must do before releasing lock.
5016 } // lock
5017
5018 /* Finally, check if the above has dequeued enough of m_snd_buf for it to accept more data from
5019 * user. If so, sock is certainly now Writable. Therefore we should soon inform anyone waiting
5020 * on any Event_sets for sock to become Writable.
5021 *
5022 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
5023 * code.
5024 *
5025 * Also: why do this outside the above locked block? Same reason as similar code in
5026 * handle_data_to_established(). */
5027 if (writable &&
5029 {
5030 // Possibly inform the user for any applicable Event_sets right now.
5031 event_set_all_check_delta(defer_delta_check);
5032 }
5033
5034 /* @todo After we implement graceful close, if we'd emptied m_snd_buf above, then here we should
5035 * advance the graceful close towards the final situation (m_int_state and m_state both
5036 * S_CLOSED). */
5037} // Node::send_worker()
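/* A small aside on the Idle_timeout_dto_factor idiom used above: scaling a duration by an exact
 * rational factor (multiply by num, divide by den) keeps the timeout math in integer arithmetic with
 * no floating-point rounding. The sketch below shows the same idiom with std::chrono/std::ratio;
 * the value 500 ms for the Drop Timeout is made up for illustration.
 *
 *   #include <chrono>
 *   #include <iostream>
 *   #include <ratio>
 *
 *   int main()
 *   {
 *     using Factor = std::ratio<110, 100>; // DTO x 1.1, expressed exactly.
 *     const std::chrono::milliseconds dto(500); // Hypothetical Drop Timeout.
 *     const auto idle_timeout = dto * Factor::num / Factor::den; // Still an integral duration.
 *     std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(idle_timeout).count()
 *               << " ms\n"; // Prints: 550 ms.
 *     return 0;
 *   }
 */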
5038
5039bool Node::can_send(Peer_socket::Const_ptr sock) const
5040{
5041 using std::min;
5042
5043 /* m_snd_cong_ctl is the congestion control module, and its CWND value determines how many bytes can
5044 * be In-flight at any given time. If there are enough free bytes (CWND - In-flight) to send
5045 * data, then we can send. Otherwise we cannot. Easy, except what's "data"? There are two
5046 * reasonable answers. One: a byte or more. Two: min(max-block-size, Send buffer size). The former
5047 * answer is fine but somewhat annoying, because then we have to lock sock here***. The 2nd answer
5048 * clearly works but is potentially a little greedier than necessary (i.e., if the 1st block to
5049 * send is small enough to fit into CWND, but CWND doesn't have max-block-size space).
5050 * However, actually, we pretty much have to choose the 2nd answer regardless, as we don't want to
5051 * fragment max-block-size-sized chunks, if we can help it (in the spirit of the reliability
5052 * guarantee [when running in unreliable mode] made in send() method doc header).
5053 *
5054 * I choose the 2nd answer, because (1) it's easier (no locking of sock); (2) it is used by real
5055 * TCP implementations which keep CWND in multiples of MSS (equivalent of max-block-size); (3)
5056 * it's still safe; and (4) see previous paragraph's end. Regarding safety: it's safe, since
5057 * there can be no deadlock, because even if there's < MBS bytes free, eventually In-flight
5058 * packets will become Acknowledged or Dropped and no longer be In-flight, freeing up CWND space;
5059 * and CWND is guaranteed to always be at least 1 * MBS. Thus eventually can_send() will return
5060 * true.
5061 *
5062 * *** - I am now not sure why I wrote this. Why would we have to lock sock here in that case? */
5063
5064 // We have rcv_wnd also; so pretend previous paragraph has: s/CWND/min(CWND, rcv_wnd)/.
5065
5066 const size_t pipe_taken = sock->m_snd_flying_bytes;
5067 const size_t cong_wnd = sock->m_snd_cong_ctl->congestion_window_bytes();
5068 const size_t& rcv_wnd = sock->m_snd_remote_rcv_wnd; // @todo Any particular reason this has & but not pipe_taken?
5069 // Send no more than the network NOR the other side's Receive buffer can take.
5070 const size_t pipe_total = min(cong_wnd, rcv_wnd);
5071
5072 const bool can
5073 = (pipe_taken < pipe_total) && ((pipe_total - pipe_taken) >= sock->max_block_size());
5074
5075 FLOW_LOG_TRACE("cong_ctl [" << sock << "] info: can_send = [" << can << "]; "
5076 "pipe_taken = [" << sock->bytes_blocks_str(pipe_taken) << "]; "
5077 "cong_wnd = [" << sock->bytes_blocks_str(cong_wnd) << "]; "
5078 "rcv_wnd = [" << sock->bytes_blocks_str(rcv_wnd) << "].");
5079
5080 return can;
5081} // Node::can_send()
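/* The decision above is plain arithmetic: take the tighter of CWND and rcv_wnd, and allow a send only
 * if at least one full max-block-size block still fits beside the bytes already In-flight. A
 * standalone sketch with made-up numbers (the variable names mirror the ones above but the values are
 * illustrative):
 *
 *   #include <algorithm>
 *   #include <cstddef>
 *   #include <iostream>
 *
 *   int main()
 *   {
 *     const std::size_t max_block_size = 1024;
 *     const std::size_t pipe_taken = 7 * 1024;                    // Bytes currently In-flight.
 *     const std::size_t cong_wnd = 8 * 1024;                      // Congestion window (CWND).
 *     const std::size_t rcv_wnd = 64 * 1024;                      // Receiver-advertised window.
 *     const std::size_t pipe_total = std::min(cong_wnd, rcv_wnd); // Obey the tighter limit.
 *     const bool can = (pipe_taken < pipe_total)
 *                      && ((pipe_total - pipe_taken) >= max_block_size);
 *     std::cout << std::boolalpha << can << "\n"; // true: exactly one more full block fits.
 *     return 0;
 *   }
 */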
5082
5083size_t Node::receive(Peer_socket::Ptr sock,
5084 const Function<size_t ()>& rcv_buf_consume_func,
5085 Error_code* err_code)
5086{
5087 using boost::asio::post;
5088
5089 /* We are in user thread U != W.
5090 * It's important to keep that in mind in this method. In particular, it is absolutely unsafe to
5091 * access m_int_state, which belongs solely to thread W and is never locked. */
5092
5093 // IMPORTANT: The logic here must be consistent with sock_is_readable().
5094
5095 if (!running())
5096 {
5098 return 0;
5099 }
5100 // else
5101
5102 // Pre-condition is that m_mutex is locked already. So EVERYTHING that can be locked, is, including the buffers.
5103
5104 // Pre-condition.
5105 assert(sock->m_state == Peer_socket::State::S_OPEN); // Locked.
5106 assert((sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTED) ||
5107 (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTING) ||
5108 (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING));
5109
5110 /* In the rest of the method we must ensure we handle all the cases (-1a/b/c-, -2-) documented in
5111 * the Peer_socket::receive() documentation header. -3- was already handled by
5112 * Peer_socket::receive() before calling us. */
5113
5114 // Try to dequeue stuff into their buffer.
5115 const bool no_bytes_available = sock->m_rcv_buf.empty();
5116 const size_t bytes_consumed = rcv_buf_consume_func();
5117
5118 if (bytes_consumed != 0)
5119 {
5120 /* Unequivocal: if there was stuff in the Receive buffer and was able to place it into their
5121 * buffer then there is no error. (Even if m_disconnect_cause is not success, we are only
5122 * supposed to report that after the Receive buffer has been emptied.)
5123 *
5124 * This handles case -2-. */
5125 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5126 "has successfully returned [" << bytes_consumed << "] bytes.");
5127 err_code->clear();
5128
5129 /* We have changed (increased) the amount of free space in m_rcv_buf. This has rcv_wnd
5130 * implications. We have to at least check whether we should send a window update to the
5131 * other side. However all such book-keeping must be done in thread W due to the data
5132 * involved; call this->receive_wnd_updated(sock). */
5133 post(m_task_engine, [this, sock]() { receive_wnd_updated(sock); });
5134
5135 if (sock->m_rcv_buf.empty()
5136 && (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING))
5137 {
5138 /* We've emptied the Receive buffer; and we're in the middle of a graceful close. (@todo
5139 * Graceful close not yet implemented.) There are two possibilities. One, m_int_state ==
5140 * S_CLOSED. In this case the graceful close, at the transport layer, is over, and the only
5141 * thing stopping us from entering m_state == S_CLOSED (via close_connection_immediately())
5142 * was that the user hadn't read all of m_rcv_buf. In this case thread W should
5143 * close_connection_immediately(). Two, m_int_state may be after ESTABLISHED but before
5144 * CLOSED, in which case thread W still has to finish up graceful closing anyway.
5145 *
5146 * We are not in thread W and cannot work with m_int_state, so checking it here is not possible.
5147 * Therefore we put this task onto thread W. */
5148 post(m_task_engine,
5149 [this, sock]() { receive_emptied_rcv_buf_while_disconnecting(sock); });
5150 }
5151 return bytes_consumed;
5152 }
5153 // else if (bytes_consumed == 0)
5154
5155 if (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_CONNECTING)
5156 {
5157 /* This is case -1b-. Since we are CONNECTING, no data could have been received yet (simply
5158 * not at that stage of connection opening), so Receive buffer is empty. */
5159 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5160 "has successfully returned no bytes because still not fully connected.");
5161 err_code->clear();
5162 return 0;
5163 }
5164 // else if (state is CONNECTED or DISCONNECTING)
5165
5166 /* We're CONNECTED or DISCONNECTING but could get no bytes. Let's examine each state.
5167 *
5168 * - CONNECTED: Either they provided a zero-sized target buffer (in which case
5169 * !no_bytes_available), or the Receive buffer is simply empty. Thus this is either -1a- or
5170 * -1c- (no_bytes_available determines which).
5171 *
5172 * - DISCONNECTING: Either:
5173 * - the initial block was too large for the max_data_size they provided in their receive()
5174 * call (in which case !no_bytes_available); or
5175 * - they called close_final() (@todo not yet implemented) and thus the Receive buffer was
5176 * cleared at that time, and all incoming data were ignored after that; thus the Receive
5177 * buffer is empty, but a graceful close is still in progress; or
5178 * - they did not call close_final(), but there is a graceful close in progress, and the
5179 * Receive buffer is simply empty.
5180 * Thus this is either -1a- or -1c-. */
5181
5182 if (!no_bytes_available)
5183 {
5184 // This is case -1c-.
5185 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5186 "has data to return, but the provided buffer size is too small.");
5187 err_code->clear();
5188 return 0;
5189 }
5190 // else if (no_bytes_available)
5191
5192 // This is case -1a-.
5193 FLOW_LOG_TRACE("User thread receive() for [" << sock << "] "
5194 "returning no data because Receive buffer empty.");
5195
5196 err_code->clear();
5197
5198 /* @todo Sigh. There's more. Yes, in some situations we can return 0/success here. In other
5199 * situations, though, we should return 0/<Error_code for graceful close> here. The latter
5200 * case would be in the situations where we know no data is coming, or user has said he doesn't
5201 * care about any more data:
5202 *
5203 * -1- A graceful close was initiated by the OTHER side. (Therefore no data could be coming to
5204 * save into Receive buffer.)
5205 * -2- Only we initiated the graceful close, but it was via close_final(), i.e., user is not
5206 * interested in any incoming data anymore. (Therefore we'll always just ignore any
5207 * incoming DATA and not put it into Receive buffer.)
5208 * -3- Only we initiated the graceful close, and it was via close_start() (i.e., user cares
5209 * about further incoming data); however, the final handshake has reached a state in which
5210 * further data cannot be incoming. (Therefore no data could be coming to save into Receive
5211 * buffer.)
5212 *
5213 * I am not writing code for this logic at this time. The implementations depends on how
5214 * exactly our graceful close works. This entire method, right now, is dead code, since there is
5215 * no graceful close, but I wrote it anyway to provide a skeleton for the future, since I
5216 * already thought about it. However it would be unreasonable to implement the above logic in the
5217 * absence of graceful close in the first place, skeleton or not. Therefore, dead code or not, I
5218 * do the "conservative" thing: return 0/success even in the above situations. Eventually the
5219 * graceful close will complete, at which point we'll return an error anyway, so the user won't be
5220 * left uninformed forever (worst case: the close will time out).
5221 *
5222 * For when we do implement the above logic, some thoughts: Detecting the situation in thread U
5223 * != W may be difficult and may introduce complex synchronization issues. One way
5224 * to do it might be to introduce synchronized bool Peer_socket::m_no_more_rcv_data, which
5225 * starts at false and can become true (but not false again). This member would be set to true,
5226 * by thread W, if and only if one of the above situations is detected by thread W. Then here
5227 * we'd check it, and if it's true, return error; otherwise return success.
5228 *
5229 * IMPORTANT: The logic here must be consistent with sock_is_readable(). */
5230 return 0;
5231} // Node::receive()
5232
5233bool Node::sock_is_readable(const boost::any& sock_as_any) const
5234{
5235 using boost::any_cast;
5236
5237 const Peer_socket::Const_ptr sock = any_cast<Peer_socket::Ptr>(sock_as_any);
5238
5239 Peer_socket::Lock_guard lock(sock->m_mutex); // Many threads can access/write below state.
5240
5241 /* Our task here is to return true if and only if at this very moment calling sock->receive(),
5242 * assuming sufficient user buffer space, would yield either a return value of > 0 OR a
5243 * non-success *err_code. In other words, receive() would return "something." This is used for
5244 * Event_set machinery.
5245 *
5246 * This should mirror receive()'s algorithm. @todo Should receive() call this, for code reuse?
5247 * Maybe/maybe not. Consider performance when deciding.
5248 *
5249 * - If state is CLOSED, then some sort of error/terminating condition occurred, so receive()
5250 * would return 0 and non-success Error_code == sock->m_disconnect_cause. (Readable.)
5251 * - Otherwise, if Receive buffer can be dequeued, receive() would return > 0.
5252 * - Otherwise, if Receive buffer cannot be dequeued, receive() would return 0 and no error. (Not
5253 * Readable.) Note that Receive buffer is guaranteed to be clear when entering non-Readable
5254 * non-error states (OPEN+CONNECTING, OPEN+DISCONNECTING). (Readable.)
5255 *
5256 * @todo Once we implement graceful close, there will be situations where Receive buffer is empty, state is
5257 * OPEN+DISCONNECTING, m_disconnect_cause = <cause of disconnect>, and we should return true (Readable)
5258 * here (only when we also know that no future Receive traffic possible). See receive(). */
5259
5260 return (sock->m_state == Peer_socket::State::S_CLOSED) || rcv_buf_deqable(sock);
5261} // Node::sock_is_readable()
5262
5263void Node::receive_wnd_updated(Peer_socket::Ptr sock)
5264{
5265 // We are in thread W.
5266
5267 /* rcv_wnd (free Receive buffer space) is sent to other side opportunistically in ACKs. While
5268 * sender is sending data, they will have a good idea of our rcv_wnd as well. Is that (in a
5269 * one-way-traffic situation) sufficient however? If the sender is not sending data, because the
5270 * application on the sender doesn't provide more data to send, then the discussion is moot.
5271 * What if the sender is not sending data, because we have told it rcv_wnd is 0 (meaning our
5272 * Receive buffer is full)? This can and will happen. For example suppose our application layer
5273 * simply stops reading from Receive buffer for a while, resulting in rcv_wnd 0 sent in one of the
5274 * ACKs. Now sender knows rcv_wnd is 0. Now suppose our application reads off the entire Receive
5275 * buffer. rcv_wnd is now 100%, but since sender is not sending (because it thinks rcv_wnd is
5276 * still 0), there will be no ACKs onto which to add rcv_wnd. Thus the traffic completely stops.
5277 *
5278 * Original RFC 793 (as well as RFC 1122) suggests TCP sender should deal with this by "probing"
5279 * with 1-byte (I think) data segments sent regularly (every RTO; our DTO) in order to trigger
5280 * ACKs, which would eventually expose the non-zero rcv_wnd. To me this seems to have the
5281 * disadvantage of complexity and implications on how we packetize data (especially since in
5282 * unreliable mode we're not supposed to break up contiguous blocks of max-block-size bytes).
5283 * Also it is not as responsive as it could be. Consider that the most common scenario in
5284 * high-speed downloads is that the Receive buffer is exceeded only momentarily (due to thread
5285 * contention on receiver or something) but is then quickly emptied (once the thread contention is
5286 * resolved). In case that happens in a fraction of a second, having the probe occur a DTO later
5287 * wastes a long time. Instead the RECEIVER could take initiative and send an empty ACK with a
5288 * rcv_wnd update. When should it do this? A naive answer would be to do it simply EVERY time
5289 * free Receive buffer space increases. However that'd be terrible, as in a typical scenario
5290 * (where lots of bytes arrive, while user reads off lots of bytes due to them becoming available
5291 * to read) it would explode the number of ACKs. Even in the "sender has stopped due to
5292 * rcv_wnd=0" situation, this would result in a ton of ACKs. Plus it would cause sender to start
5293 * recovering with quite small windows which is inefficient. So the less naive way is to send the
5294 * ACK of our volition if free buffer space has increased by some % of its max capacity (like
5295 * 50%).
5296 *
5297 * This would certainly solve aforementioned situation where Receive buffer fills up momentarily
5298 * but then is quickly cleared. A fraction of a second later, the free space will have increased
5299 * by over 50%, an ACK would go to sender, and sender would work with a nice large rcv_wnd.
5300 * However, if the Receiver only reads off 49% of the data and then stops, traffic would remain
5301 * stuck (even though 49% of the buffer is available). This is where the sender-side probing
5302 * would solve it (slowly); though sender-side unsolicited ACKing on a timer would also do. I
5303 * leave that as a @todo; probably important in a widely-used net_flow; but without it it should be
5304 * sufficient for the initial intended purpose of net_flow. In that use scenario, we count on the
5305 * receiver code to be well behaved and read from Receive buffer as soon as the computer lets it.
5306 *
5307 * With that settled, there is one more concern. This is intuitively clear but is also mentioned
5308 * in RFC 1122-4.2.2.17. Suppose the receiver-initiated ACK after 50% of buffer is cleared is
5309 * dropped by the network. ACKs are not reliable (there are no ACKs of ACKs), so then we're back
5310 * in no-more-traffic-forever land. To solve this, I implement this scheme: Having sent that ACK,
5311 * start a timer and then send it again periodically, until some long time period (something like
5312 * a minute) expires (just in case) OR we get a new DATA packet from the sender. In the latter
5313 * case we're back in business, as it implies sender got our window update. Note that this
5314 * mechanism is not necessary any longer, once we implement sender-side probing as explained
5315 * above. */
5316
5317 // As always, no need to lock m_state, etc., unless we plan to alter them, since no other thread can alter them.
5318
5319 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
5320 {
5321 /* Yes, they emptied Receive buffer. However, we haven't finished the graceful close.
5322 * Therefore -- even though one more barrier to reaching m_state == S_CLOSED has been removed --
5323 * there's nothing further to do at this time. In fact, in certain situations we might even
5324 * get more data onto the Receive buffer! @todo No graceful close yet. */
5325 FLOW_LOG_INFO('[' << sock << "] Receive buffer space freed, "
5326 "but state is now [" << sock->m_int_state << "]; ignoring.");
5327 return;
5328 }
5329 // else if (m_int_state == S_ESTABLISHED)
5330
5331 if (sock->m_rcv_in_rcv_wnd_recovery)
5332 {
5333 /* We have already sent the unsolicited ACK and are currently in the phase where we're
5334 * periodically sending more, until we get some DATA from sender or a long period of time
5335 * passes. Even if we've freed yet another large chunk of the buffer since the last ACK, do
5336 * not start again... just let it continue. */
5337 FLOW_LOG_TRACE('[' << sock << "] Receive buffer space freed, but "
5338 "we are already in rcv_wnd recovery mode. Nothing to do.");
5339 return;
5340 }
5341 // else
5342
5343 // Grab available Receive buffer space.
5344 const size_t rcv_wnd = sock_rcv_wnd(sock);
5345 // @todo That was a ~copy/paste of Node::async_low_lvl_ack_send(). Add code reuse.
5346
5347 const size_t& last_rcv_wnd = sock->m_rcv_last_sent_rcv_wnd;
5348
5349 if (rcv_wnd <= last_rcv_wnd)
5350 {
5351 /* This seems odd, but one can imagine more data arriving between when we were placed onto W's
5352 * task queue and when we executed. So it's not that odd and not worth INFO or WARNING. */
5353 FLOW_LOG_TRACE('[' << sock << "] Receive buffer space freed, but "
5354 "free space [" << sock->bytes_blocks_str(rcv_wnd) << "] <= prev "
5355 "free space [" << sock->bytes_blocks_str(last_rcv_wnd) << "]. Nothing to do.");
5356 return;
5357 }
5358 // else
5359
5360 const size_t diff = rcv_wnd - last_rcv_wnd;
5361 const unsigned int pct = sock->opt(sock->m_opts.m_st_rcv_buf_max_size_to_advertise_percent);
5362 const size_t max_rcv_buf_size = sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size);
5363 const size_t min_inc = max_rcv_buf_size * pct / 100;
5364
5365 if (diff < min_inc)
5366 {
5367 // Not big enough increase; wait until more space is freed before informing other side.
5368 FLOW_LOG_TRACE('[' << sock << "] Receive buffer space "
5369 "freed is [" << sock->bytes_blocks_str(diff) << "] since last advertisement; "
5370 "< threshold [" << pct << "%] x "
5371 "[" << sock->bytes_blocks_str(max_rcv_buf_size) << "] = "
5372 "[" << sock->bytes_blocks_str(min_inc) << "]. Not advertising rcv_wnd yet.");
5373 return;
5374 }
5375 // else cool. Let's advertise it.
5376
5377 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5378 FLOW_LOG_INFO('[' << sock << "] Receive buffer space "
5379 "freed is [" << sock->bytes_blocks_str(diff) << "] since last advertisement; "
5380 "rcv_wnd = [" << sock->bytes_blocks_str(rcv_wnd) << "]; "
5381 ">= threshold [" << pct << "%] x "
5382 "[" << sock->bytes_blocks_str(max_rcv_buf_size) << "] = "
5383 "[" << sock->bytes_blocks_str(min_inc) << "]. Sending unsolicited rcv_wnd-advertising ACK "
5384 "and entering rcv_wnd recovery.");
5385
5386 // Prevent any further shenanigans (see above), until we exit this mode.
5387 sock->m_rcv_in_rcv_wnd_recovery = true;
5388 // Mark this down, so that we exit this mode eventually.
5389 sock->m_rcv_wnd_recovery_start_time = Fine_clock::now();
5390
5391 // Record we started the mode.
5392 sock->m_rcv_stats.rcv_wnd_recovery_event_start();
5393
5394 async_rcv_wnd_recovery(sock, rcv_wnd);
5395} // Node::receive_wnd_updated()
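/* The advertise-or-wait decision above reduces to: advertise only once the space freed since the last
 * advertisement reaches a configured percentage of the maximum buffer size. A standalone sketch with
 * made-up numbers (names mirror the code above; the values and the 50% threshold are illustrative):
 *
 *   #include <cstddef>
 *   #include <iostream>
 *
 *   int main()
 *   {
 *     const std::size_t max_rcv_buf_size = 256 * 1024; // Maximum Receive buffer size (bytes).
 *     const unsigned int pct = 50;                     // Percent-of-buffer threshold.
 *     const std::size_t last_rcv_wnd = 10 * 1024;      // Free space advertised last time.
 *     const std::size_t rcv_wnd = 160 * 1024;          // Free space now.
 *
 *     const std::size_t diff = rcv_wnd - last_rcv_wnd;           // 150 KiB freed since last advertisement.
 *     const std::size_t min_inc = max_rcv_buf_size * pct / 100;  // 128 KiB threshold.
 *     std::cout << ((diff >= min_inc) ? "advertise rcv_wnd now\n" : "wait for more freed space\n");
 *     return 0;
 *   }
 */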
5396
5397void Node::async_rcv_wnd_recovery(Peer_socket::Ptr sock, size_t rcv_wnd)
5398{
5399 using boost::chrono::milliseconds;
5400 using boost::chrono::round;
5401 using boost::weak_ptr;
5402
5403 // We are in thread W.
5404
5405 // As discussed in Node::receive_wnd_updated(), send the ACK and then periodically re-send it until canceled.
5406
5407 // Create an ACK with no packets acknowledged (so just a window update) and send it off.
5408 auto ack = Low_lvl_packet::create_uninit_packet<Ack_packet>(get_logger());
5409 ack->m_rcv_wnd = rcv_wnd;
5410 // Record that it was advertised!
5411 sock->m_rcv_last_sent_rcv_wnd = rcv_wnd;
5412
5415 false))
5416 // ^-- defer_delta_check == false: for similar reason as in send_worker_check_state() calling send_worker().
5417 {
5418 return;
5419 }
5420 // else
5421
5422 // Register one ACK packet we will send ASAP (and that it acknowledged no individual packets).
5423 sock->m_rcv_stats.sent_low_lvl_ack_packet(true);
5424
5425 // ACK queued to send soon. Now, as discussed, protect against it being lost by scheduling a timer.
5426
5427 const Fine_duration fire_when_from_now = sock->opt(sock->m_opts.m_dyn_rcv_wnd_recovery_timer_period);
5428
5429 FLOW_LOG_INFO("Setting timer to fire "
5430 "[" << round<milliseconds>(fire_when_from_now) << "] from now.");
5431
5432 /* As usual, when scheduling a thing we can use the much simpler util::schedule_task_*() API; or the
5433 * full-featured boost.asio Timer. We don't need the advanced features; so the only possible reason
5434 * to go with Timer would be the perf considerations (see schedule_task_from_now() doc header for discussion).
5435 * It is emphatically NOT the case that lots of these tasks are scheduled/fired/canceled per unit time;
5436 * e.g., we see it as rare enough to be OK with an INFO log message. Hence no need to reuse a Timer repeatedly,
5437 * so use the simple API. */
5438
5439 sock->m_rcv_wnd_recovery_scheduled_task
5440 = schedule_task_from_now(get_logger(), fire_when_from_now, true, &m_task_engine,
5441 [this, sock_observer = weak_ptr<Peer_socket>(sock)](bool)
5442 {
5443 // We are in thread W.
5444
5445 auto sock = sock_observer.lock();
5446 if (!sock)
5447 {
5448 return; // Possible or not, allow for this possibility for maintainability.
5449 }
5450 // else
5451
5452 const Fine_duration since_recovery_started = Fine_clock::now() - sock->m_rcv_wnd_recovery_start_time;
5453 if (since_recovery_started > sock->opt(sock->m_opts.m_dyn_rcv_wnd_recovery_max_period))
5454 {
5455 // We've kept ACKing for a long time, and still no data. Give up: it's all up to the sender now.
5456
5457 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5458 FLOW_LOG_INFO('[' << sock << "]: still no new DATA arrived since last rcv_wnd advertisement; "
5459 "Time since entering recovery [" << round<milliseconds>(since_recovery_started) << "] expired. "
5460 "Ending rcv_wnd recovery.");
5461 sock->m_rcv_in_rcv_wnd_recovery = false;
5462
5463 // Record we ended in timeout.
5464 sock->m_rcv_stats.rcv_wnd_recovery_event_finish(false);
5465
5466 return;
5467 }
5468 // else
5469
5470 // Still in rcv_wnd recovery. Send another unsolicited ACK (as in receive_wnd_updated()).
5471
5472 // Re-grab available Receive buffer space.
5473 const size_t rcv_wnd = sock_rcv_wnd(sock);
5474
5475 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5476 FLOW_LOG_INFO('[' << sock << "]: still no new DATA arrived since last rcv_wnd advertisement; "
5477 "rcv_wnd = [" << sock->bytes_blocks_str(rcv_wnd) << "]; "
5478 "time since entering recovery [" << round<milliseconds>(since_recovery_started) << "]. "
5479 "Sending unsolicited rcv_wnd-advertising ACK and continuing rcv_wnd recovery.");
5480
5481 async_rcv_wnd_recovery(sock, rcv_wnd);
5482 }); // on-scheduled-task-fired
5483} // Node::async_rcv_wnd_recovery()
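/* The scheduled-task lambda above captures the socket as a weak_ptr and re-lock()s it when the task
 * fires, so a pending task neither keeps the object alive artificially nor risks touching a destroyed
 * one. A minimal, standalone sketch of that capture pattern with plain boost.asio and shared_ptr;
 * `Widget` and its member are illustrative stand-ins.
 *
 *   #include <boost/asio.hpp>
 *   #include <chrono>
 *   #include <iostream>
 *   #include <memory>
 *
 *   struct Widget { int m_value = 42; };
 *
 *   int main()
 *   {
 *     boost::asio::io_context task_engine;
 *     auto widget = std::make_shared<Widget>();
 *
 *     boost::asio::steady_timer timer(task_engine);
 *     timer.expires_after(std::chrono::milliseconds(1));
 *     timer.async_wait([widget_observer = std::weak_ptr<Widget>(widget)]
 *                        (const boost::system::error_code&)
 *     {
 *       if (auto widget = widget_observer.lock()) // Is the object still around?
 *       {
 *         std::cout << "still alive: " << widget->m_value << "\n";
 *       }
 *       // else: it was destroyed meanwhile; the handler simply returns -- no dangling access.
 *     });
 *
 *     task_engine.run();
 *     return 0;
 *   }
 */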
5484
5486{
5487 using boost::chrono::milliseconds;
5488 using boost::chrono::round;
5490
5491 // We are in thread W.
5492
5493 // We got some good DATA. If we were sending unsolicited window update ACKs, we can now stop.
5494
5495 if (!sock->m_rcv_in_rcv_wnd_recovery)
5496 {
5497 // We weren't.
5498 return;
5499 }
5500 // else
5501
5502 // This is ~equally as rare as Receive buffer overflows, so this is worth an INFO message.
5503 FLOW_LOG_INFO('[' << sock << "]: Canceling rcv_wnd recovery; "
5504 "Time since entering recovery "
5505 "[" << round<milliseconds>(Fine_clock::now() - sock->m_rcv_wnd_recovery_start_time) << "].");
5506
5507 sock->m_rcv_in_rcv_wnd_recovery = false;
5508#ifndef NDEBUG
5509 const bool canceled =
5510#endif
5511 scheduled_task_cancel(get_logger(), sock->m_rcv_wnd_recovery_scheduled_task);
5512 assert(canceled);
5513
5514 // Record we ended in success.
5515 sock->m_rcv_stats.rcv_wnd_recovery_event_finish(true);
5516}
5517
5518size_t Node::sock_rcv_wnd(Peer_socket::Const_ptr sock) const
5519{
5520 using std::numeric_limits;
5521
5522 // We are in thread W.
5523
5524 if (!sock->opt(sock->m_opts.m_st_rcv_flow_control_on))
5525 {
5526 /* Flow control disabled, so if we always advertise the same huge value, the other side will
5527 * never stop sending due to rcv_wnd. On this side, we won't activate rcv_wnd recovery, because
5528 * the "last advertised" window will always equal the current window. */
5529 return numeric_limits<size_t>::max();
5530 }
5531 // else
5532
5533 // Grab available Receive buffer space. We have to momentarily lock sock due to access to sock->m_rcv_buf.
5534 size_t rcv_buf_size;
5535 {
5536 Peer_socket::Lock_guard lock(sock->m_mutex);
5537 rcv_buf_size = sock->m_rcv_buf.data_size();
5538 }
5539
5540 // Add the reassembly queue cumulative stored data size. Why? See sock_data_to_reassembly_q_unless_overflow().
5541 if (sock->rexmit_on())
5542 {
5543 rcv_buf_size += sock->m_rcv_reassembly_q_data_size; // (At least one reason we must be in thread W.)
5544 }
5545
5546 const size_t max_rcv_buf_size = sock->max_block_size_multiple(sock->m_opts.m_st_rcv_buf_max_size);
5547
5548 return (max_rcv_buf_size > rcv_buf_size) ? (max_rcv_buf_size - rcv_buf_size) : 0;
5549}
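/* Note the clamped subtraction in the return statement above: with unsigned size_t, a naive
 * `max - used` would wrap to a huge value whenever used exceeds max (possible here, since the
 * reassembly-queue size is added to the buffer size), so the free window must be clamped at zero.
 * A tiny standalone illustration with made-up numbers:
 *
 *   #include <cstddef>
 *   #include <iostream>
 *
 *   int main()
 *   {
 *     const std::size_t max_rcv_buf_size = 64 * 1024;
 *     const std::size_t rcv_buf_size = 70 * 1024; // Momentarily past the nominal maximum.
 *     const std::size_t rcv_wnd
 *       = (max_rcv_buf_size > rcv_buf_size) ? (max_rcv_buf_size - rcv_buf_size) : 0;
 *     std::cout << rcv_wnd << "\n"; // Prints 0; unclamped subtraction would wrap to a huge number.
 *     return 0;
 *   }
 */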
5550
5551void Node::receive_emptied_rcv_buf_while_disconnecting(Peer_socket::Ptr sock)
5552{
5553 // We are in thread W.
5554
5555 /* As always, no need to lock m_state, etc., unless we plan to alter them, since no other thread can alter them.
5556 * ...On the other hand, we are going to be checking m_rcv_buf for emptiness below, and if it's not empty,
5557 * a user thread U != W may be altering it right now by consuming it. So, lock.
5558 *
5559 * Could think about locking later in this function, but this is called so rarely I'd rather not have to
5560 * worry about whether it's OK to do that and just not. */
5561 Peer_socket::Lock_guard lock(sock->m_mutex);
5562
5563 if (sock->m_state == Peer_socket::State::S_CLOSED)
5564 {
5565 /* When we were placed onto thread W, state was S_OPEN+S_DISCONNECTING, but before boost.asio
5566 * could execute us, it executed another handler which already moved us to S_CLOSED for
5567 * whatever reason (there are many valid ones). So just don't do anything, as we no longer
5568 * apply. It's kind of interesting, so log INFO message. */
5569 FLOW_LOG_INFO('[' << sock << "] "
5570 "was completely closed before asynchronous "
5571 "receive_emptied_rcv_buf_while_disconnecting() could proceed.");
5572 return;
5573 }
5574 // else
5575
5576 // Sanity-check (we cannot be called until there's a graceful close underway).
5577 assert((sock->m_state == Peer_socket::State::S_OPEN) &&
5578 (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING));
5579
5580 const Socket_id socket_id = Node::socket_id(sock);
5581
5582 if (sock->m_int_state != Peer_socket::Int_state::S_CLOSED)
5583 {
5584 /* Yes, they emptied Receive buffer. However, we haven't finished the graceful close.
5585 * Therefore -- even though one more barrier to reaching m_state == S_CLOSED has been removed --
5586 * there's nothing further to do at this time. In fact, in certain situations we might even
5587 * get more data onto the Receive buffer! @todo No graceful close yet. */
5588 FLOW_LOG_TRACE('[' << sock << "] "
5589 "is gracefully closing, and Receive buffer is empty, but graceful close itself not yet finished.");
5590 return;
5591 }
5592 // else if (m_int_state == S_CLOSED)
5593
5594 // Ensure Receive buffer is indeed still empty. (Can still get data while gracefully closing.)
5595 if (!sock->m_rcv_buf.empty())
5596 {
5597 /* Some data arrived between the time we were placed on thread W and boost.asio executing us.
5598 * So we can't do anything; user has to receive() the stuff first, which should call us again. */
5599 FLOW_LOG_TRACE('[' << sock << "] "
5600 "is gracefully closing, but Receive buffer has data again.");
5601 return;
5602 }
5603 // else if (m_int_state == S_CLOSED, and m_rcv_buf is empty)
5604
5605 // Yes, the transport layer final handshake is finished. Since Receive buffer now empty, no more barriers remain.
5606 FLOW_LOG_TRACE('[' << sock << "] "
5607 "is gracefully closing, and Receive buffer is now empty. Ready to permanently close.");
5608 close_connection_immediately(socket_id, sock,
5609 Error_code(), /* err_code == success indicates clean close here. */
5610 false);
5611 /* ^-- defer_delta_check == false: for similar reason as when calling send_worker() from
5612 * send_worker_check_state(). */
5613} // Node::receive_emptied_rcv_buf_while_disconnecting()
5614
5615void Node::close_abruptly(Peer_socket::Ptr sock, Error_code* err_code)
5616{
5617 using boost::adopt_lock;
5620
5621 /* We are in user thread U != W.
5622 * It's important to keep that in mind in this method. In particular, it is absolutely unsafe to
5623 * access m_int_state, which belongs solely to thread W and is never locked. */
5624
5625 {
5626 /* WARNING!!! sock->m_mutex is locked, but WE must unlock it before returning! Can't leave that
5627 * to the caller, because we must unlock at a specific point below, right before post()ing
5628 * close_abruptly_worker() onto thread W. Use a Lock_guard that adopts an
5629 * already-locked mutex. */
5630 Peer_socket::Lock_guard lock(sock->m_mutex, adopt_lock);
5631
5632 if (!running())
5633 {
5635 return;
5636 }
5637 // else
5638
5639 // Pre-condition.
5640 assert(sock->m_state == Peer_socket::State::S_OPEN);
5641
5642 /* Put the rest of the work into thread W. For justification, see big comment in listen().
5643 * Addendum regarding performance: close_abruptly() is probably called more frequently than
5644 * listen(), but I doubt the performance impact is serious even so. send() and receive() might be
5645 * a different story. */
5646
5647 // We're done -- must unlock so that thread W can do what it wants to with sock.
5648 } // lock
5649
5650 // Load this onto thread W boost.asio work queue. We don't return until it runs, so [&].
5651 asio_exec_ctx_post(get_logger(), &m_task_engine, Synchronicity::S_ASYNC_AND_AWAIT_CONCURRENT_COMPLETION, [&]()
5652 {
5653 // We are in thread W. Thread U is waiting for us to do our stuff and return.
5654
5655 /* Since we were placed onto thread W, another handler may have been executed before boost.asio
5656 * got to us. Therefore we may already be S_CLOSED. Detect this. */
5657
5658 if (sock->m_state == Peer_socket::State::S_CLOSED) // No need to lock: only W can write to this.
5659 {
5660 // Yep, already closed. sock->m_disconnect_cause is already set to closure reason. Done.
5661 *err_code = sock->m_disconnect_cause;
5662 return;
5663 }
5664 // else
5665
5666 /* Cool, we're not quite closed yet. We could be connecting... or connected... or even in the
5667 * middle of graceful close (@todo that's not yet implemented). Any of those situations allow
5668 * close_abruptly(), just as (indeed because of the fact that) any of those situations allow
5669 * close_connection_immediately() (..., error::...).
5670 *
5671 * Therefore simply do the following. Pre-conditions hold: sock is in m_socks and is S_OPEN
5672 * (because not S_CLOSED); 3rd arg contains failure reason. */
5674 /* ^-- defer_delta_check == false: for similar reason as when calling send_worker() from
5675 * send_worker_check_state(). */
5676
5677 // That set sock->m_disconnect_cause. Closure successful. Done.
5678 err_code->clear(); // Success.
5679 }); // asio_exec_ctx_post()
5680 // If got here, the task has completed in thread W and signaled us to that effect.
5681} // Node::close_abruptly()
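/* The cross-thread pattern above, in sketch form (names as used in this file): user thread U
 * locks sock->m_mutex just long enough to validate state, unlocks, then posts a lambda via
 * asio_exec_ctx_post(..., Synchronicity::S_ASYNC_AND_AWAIT_CONCURRENT_COMPLETION, ...) and
 * blocks until thread W has run it; thread-W-only state (m_int_state, the m_socks map, etc.)
 * is thus touched only from W, while the caller still gets a synchronous result in *err_code.
 * The same post-and-await shape reappears in sock_info() below. */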
5682
5683void Node::close_connection_immediately(const Socket_id& socket_id, Peer_socket::Ptr sock,
5684 const Error_code& err_code, bool defer_delta_check)
5685{
5686 using boost::lexical_cast;
5687 using std::string;
5688
5689 // We are in thread W.
5690
5691 // @todo OK if a graceful close (S_OPEN+S_DISCONNECTING) is already in progress? Below provides for it, but ensure.
5692 assert(sock->m_state == Peer_socket::State::S_OPEN);
5693
5694 if (err_code)
5695 {
5696 FLOW_ERROR_LOG_ERROR(err_code);
5697 FLOW_LOG_INFO("Closing and destroying [" << sock << "] abruptly.");
5698 }
5699 else
5700 {
5701 // m_disconnect_cause has already been set and logged.
5702 FLOW_LOG_INFO("Closing and destroying [" << sock << "] after graceful close.");
5703 }
5704 // Log final state report.
5705 sock_log_detail(sock);
5706
5707 /* Thread safety: we're in thread W, so no need to lock things by default (as most resources can
5708 * also only be accessed from thread W). Exceptions are certain data members in Peer_socket
5709 * sock and Server_socket serv that may have originated it (if it was a passive open). I will
5710 * comment on the locking situation for those data members as they come up in the code. */
5711
5712 // First, set various state in *sock (including emptying Send and Receive buffers and setting m_node = 0).
5713
5714 /* Save the final set of stats for Peer_socket::info(), as the source data will probably get
5715 * purged just below in sock_disconnect_*(). */
5716 sock_load_info_struct(sock, &sock->m_info_on_close);
5717 // We may have to massage it a little more, because some info is set below, by which time it's too late.
5718
5719 if (err_code)
5720 {
5721 // sock->m_disconnect_cause has not yet been set; so sock_load_info_struct() did not copy it properly yet. Do so.
5722 sock->m_info_on_close.m_disconnect_cause = err_code;
5723 // Similarly:
5724 sock->m_info_on_close.m_int_state_str = lexical_cast<string>(Peer_socket::Int_state::S_CLOSED);
5725
5726 /* This is an abrupt close. This can be called in any situation once sock is in m_socks. It's
5727 * our responsibility to move directly to transport layer state S_CLOSED and user state
5728 * S_CLOSED. */
5729 sock_set_int_state(sock, Peer_socket::Int_state::S_CLOSED); // Thread W access only; no need to lock.
5730 // Sets S_CLOSED public state (and related data, including m_disconnect_cause). Locked inside.
5731 sock_disconnect_detected(sock, err_code, true);
5732 }
5733 else
5734 {
5735 /* We are in a graceful close and have reached the final stage of it (connection entirely
5736 * closed without having to abruptly close; buffers emptied gracefully by user and/or Node).
5737 * Therefore m_int_state is already S_CLOSED (method pre-condition), so
5738 * we just complete the user-visible state change. */
5739
5740 assert(sock->m_int_state == Peer_socket::Int_state::S_CLOSED); // Thread W access only; no need to lock.
5741 sock_disconnect_completed(sock); // Sets S_CLOSED public state (and related data). Locked inside.
5742 }
5743
5744 // Next, remove sock from our main socket list.
5745
5746#ifndef NDEBUG
5747 const auto erased = 1 ==
5748#endif
5749 m_socks.erase(socket_id);
5750 assert(erased); // S_OPEN => it's in m_socks. Otherwise there's a serious bug somewhere.
5751
5752 // Next, if this potentially is an unaccepted connection, delete it from the corresponding server socket.
5753 if (!sock->m_active_connect)
5754 {
5755 /* What is that Server_socket though? Well, it's in sock->m_originating_serv... but that data
5756 * member can be accessed from a non-W thread, so we'd have to lock it. But the mutex that
5757 * protects it is in *m_originating_serv itself! So it's a chicken/egg problem. However, we
5758 * can find that Server_socket (if it applies to sock) another way: through the port. Its port
5759 * must be the same as local_port. If such a Server_socket exists, cool; and if sock is
5760 * tracked inside it, cool. Otherwise we needn't do anything. */
5761 Port_to_server_map::const_iterator port_to_server_it = m_servs.find(sock->m_local_port);
5762 if (port_to_server_it != m_servs.end()) // Server at same port number exists. Not necessarily our guy though.
5763 {
5764 // If it is our guy, delete us from him.
5765 Server_socket::Ptr serv = port_to_server_it->second;
5766 serv_peer_socket_closed(serv, sock); // Thread-safe (in particular with respect to simultaneous serv->accept()).
5767 }
5768 }
5769
5770 // sock now should not be (directly or indirectly) referenced in any Node data structures.
5771
5772 // Cancel any timers.
5773 cancel_timers(sock);
5774
5775 /* Return the port -- but only if it is an active open. If it's a passive open the port is
5776 * still reserved for the server socket. */
5777 if (sock->m_active_connect)
5778 {
5779 Error_code return_err_code;
5780 m_ports.return_port(sock->m_local_port, &return_err_code);
5781 assert(!return_err_code);
5782 }
5783
5784 /* sock has changed to CLOSED state. Performing sock->receive() or sock->write() would therefore
5785 * certainly return an error. Returning an error from those methods (as opposed to 0 but no
5786 * error) is considered Readable and Writable, respectively (as we want to alert the user to the
5787 * error, so her wait [if any] wakes up and notices the error). Therefore we should soon inform
5788 * anyone waiting on any Event_sets for sock to become Readable or Writable.
5789 *
5790 * Caveat: Similar to that in Node::handle_syn_ack_ack_to_syn_rcvd() at similar point in the
5791 * code. */
5792
5793 // Accumulate the event into the Node store (note: not any Event_set yet).
5794 const bool inserted_rd = m_sock_events[Event_set::Event_type::S_PEER_SOCKET_READABLE].insert(sock).second;
5795 const bool inserted_wr = m_sock_events[Event_set::Event_type::S_PEER_SOCKET_WRITABLE].insert(sock).second;
5796 if (inserted_rd || inserted_wr) // Must always perform both insert()s, hence the use of the 2 variables.
5797 {
5798 // Possibly inform the user for any applicable Event_sets right now.
5799 event_set_all_check_delta(defer_delta_check);
5800 }
5801} // Node::close_connection_immediately()
5802
5803void Node::rst_and_close_connection_immediately(const Socket_id& socket_id, Peer_socket::Ptr sock,
5804 const Error_code& err_code, bool defer_delta_check)
5805{
5806 // We are in thread W.
5807 async_sock_low_lvl_rst_send(sock);
5808 close_connection_immediately(socket_id, sock, err_code, defer_delta_check);
5809}
5810
5811Syn_packet::Ptr Node::create_syn(Peer_socket::Const_ptr sock)
5812{
5813 using util::Blob;
5814
5815 auto syn = Low_lvl_packet::create_uninit_packet<Syn_packet>(get_logger());
5816 // Initial Sequence Number.
5817 syn->m_init_seq_num = sock->m_snd_init_seq_num;
5818 /* Send serialized version of arbitrary user data, which user can deserialize on the other side
5819 * after accepting connection.
5820 * Add const to express we require a copy, not move. */
5821 syn->m_serialized_metadata = static_cast<const Blob&>(sock->m_serialized_metadata);
5822
5823 return syn;
5824}
5825
5826Syn_ack_packet::Ptr Node::create_syn_ack(Peer_socket::Const_ptr sock)
5827{
5828 auto syn_ack = Low_lvl_packet::create_uninit_packet<Syn_ack_packet>(get_logger());
5829 // Initial Sequence Number (the start of our own series).
5830 syn_ack->m_init_seq_num = sock->m_snd_init_seq_num;
5831 // Random security token.
5832 syn_ack->m_packed.m_security_token = sock->m_security_token;
5833 // Advertise initial rcv_wnd.
5834 syn_ack->m_packed.m_rcv_wnd = sock->m_rcv_last_sent_rcv_wnd;
5835
5836 return syn_ack;
5837}
5838
5839bool Node::async_low_lvl_syn_ack_ack_send_or_close_immediately(const Peer_socket::Ptr& sock,
5840 boost::shared_ptr<const Syn_ack_packet>& syn_ack)
5841{
5842 // Make a packet.
5843 auto syn_ack_ack = Low_lvl_packet::create_uninit_packet<Syn_ack_ack_packet>(get_logger());
5844 // No sequence number (not the initial SYN; not data).
5845 // Security token: give it back to them (they will verify).
5846 syn_ack_ack->m_packed.m_security_token = syn_ack->m_packed.m_security_token;
5847 // Initial receive window is probably the entire, ~empty Receive buffer. Save the advertised rcv_wnd as promised.
5848 syn_ack_ack->m_packed.m_rcv_wnd = sock->m_rcv_last_sent_rcv_wnd = sock_rcv_wnd(sock);
5849
5850 // Fill out common fields and asynchronously send packet.
5851 return async_sock_low_lvl_packet_send_or_close_immediately(sock,
5852 Low_lvl_packet::ptr_cast(syn_ack_ack),
5853 true); // Warns on error.
5854 // ^-- defer_delta_check == true: for similar reason as in handle_syn_ack_ack_to_syn_rcvd().
5855}
5856
5857void Node::async_low_lvl_ack_send(Peer_socket::Ptr sock, bool defer_delta_check, const Error_code& sys_err_code)
5858{
5859 using boost::chrono::milliseconds;
5860 using boost::chrono::duration_cast;
5861 using std::make_pair;
5862 using std::vector;
5863 using std::numeric_limits;
5864
5865 // We are in thread W.
5866
5867 // Handle the timer-related corner cases (if we were invoked by m_rcv_delayed_ack_timer triggering).
5868
5869 // For brevity and speed:
5870 vector<Peer_socket::Individual_ack::Ptr>& pending_acks = sock->m_rcv_pending_acks;
5871
5872 if (sys_err_code == boost::asio::error::operation_aborted)
5873 {
5874 FLOW_LOG_TRACE("Delayed [ACK] timer [" << sock << "] canceled; "
5875 "pending acknowledgment count [" << pending_acks.size() << "].");
5876 return;
5877 }
5878 // else
5879
5880 FLOW_LOG_TRACE("Delayed [ACK] timer [" << sock << "] triggered, or ACK forced; "
5881 "pending acknowledgment count [" << pending_acks.size() << "].");
5882
5883 if (sys_err_code)
5884 {
5885 FLOW_ERROR_SYS_ERROR_LOG_WARNING(); // Log non-portable error.
5886 // Nothing else to do here. We don't know what this means. So just treat it as if timer was triggered.
5887 }
5888
5889 if (sock->m_int_state != Peer_socket::Int_state::S_ESTABLISHED)
5890 {
5891 /* This is unlikely but legitimate. (Can happen if, by the time the handler that advanced state
5892 * from ESTABLISHED to another state started, this timer also was triggered and thus queued the
5893 * current handler inside m_task_engine.) */
5894 FLOW_LOG_TRACE("Delayed [ACK] timer [" << sock << "] triggered, "
5895 "but socket already in inapplicable state [" << sock->m_int_state << "]. Ignoring.");
5896 return;
5897 }
5898 // else
5899
5900 if (pending_acks.empty())
5901 {
5902 /* This is probably a bug if we're here. However, assert() or connection closure seems a bit
5903 * drastic... carry on. */
5904 FLOW_LOG_WARNING("Delayed [ACK] timer [" << sock << "] triggered, "
5905 "but socket has no pending acknowledgments. This is likely an internal bug. Ignoring.");
5906 return;
5907 }
5908 // else
5909
5910 /* OK, let's do it. Basically just shove all the acknowledgments into an ACK packet. Namely, for
5911 * each one, shove the starting sequence number and the amount of time since we first received it
5912 * (so the other side can subtract that to compute RTT, if it wants).
5913 *
5914 * However we may run out of space and need more ACKs. To keep track of how much space we've
5915 * used, compute an estimate for serializing those two pieces of data and keep adding that for
5916 * each acknowledgment handled. The budget is given by max-block-size; a DATA packet is allowed
5917 * that much payload on top of the normal header stuff, so that should be good enough for us too.
5918 * There's probably some constant overhead on top of that, but it's close enough.
5919 *
5920 * ACK is also used as an opportunistic way to send rcv_wnd to the other side, which informs
5921 * them of how much more data we can take at this time. Naively we should just have rcv_wnd =
5922 * the max buffer size minus the buffer space currently taken, and that is the most accurate
5923 * thing. However RFC 793 ("Window Management Suggestions") and probably other literature
5924 * suggest to (when the available space is increasing) advertise the window in larger steps (so
5925 * withhold the higher rcv_wnd value until it increases even further up to some threshold). For
5926 * now I forego such fanciness. See also the rcv_wnd-related comment in
5927 * Node::receive_wnd_increased() for further reasoning on rcv_wnd (namely surrounding the fact
5928 * that sometimes we must send ACKs with no packets acknowledged to ensure a connection does not
5929 * stall due to a zero rcv_wnd). */
5930
5931 // Grab available Receive buffer space. Save it for later comparison.
5932 const size_t& rcv_wnd = sock->m_rcv_last_sent_rcv_wnd = sock_rcv_wnd(sock);
5933
5934 auto ack = Low_lvl_packet::create_uninit_packet<Ack_packet>(get_logger());
5935 ack->m_rcv_wnd = rcv_wnd; // Advertise receive window. @todo Code reuse?
5936
5937 const size_t max_block_size = sock->max_block_size();
5938 size_t size_est_inc
5940 if (sock->rexmit_on())
5941 {
5942 size_est_inc += sizeof(Low_lvl_packet::rexmit_id_t);
5943 }
5944 assert(size_est_inc <= max_block_size); // At least one has to fit.
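 /* Illustrative arithmetic with hypothetical sizes: if max_block_size is 1024 bytes and
  * size_est_inc works out to 12 bytes per acknowledgment (sequence number + delay, plus 1 more
  * byte for rexmit_id when retransmission is on), then after the leading rcv_wnd field roughly
  * (1024 - 4) / 12 = 85 individual acknowledgments fit before the "Too big" branch below
  * flushes the current ACK and starts a fresh one. */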
5945
5946 const Fine_time_pt time_now = Fine_clock::now();
5947 size_t size_est_so_far = sizeof(Low_lvl_packet::rcv_wnd_t); // How many raw bytes we have, approximately, used.
5948 for (Peer_socket::Individual_ack::Const_ptr ind_ack : pending_acks)
5949 {
5950 if (size_est_so_far + size_est_inc > max_block_size)
5951 {
5952 // Too big. Send off what we have.
5953 if (!async_sock_low_lvl_packet_send_or_close_immediately(sock,
5954 Low_lvl_packet::ptr_cast(ack),
5955 defer_delta_check))
5956 {
5957 return;
5958 }
5959 // else
5960
5961 // Register one ACK packet we will send ASAP.
5962 sock->m_rcv_stats.sent_low_lvl_ack_packet(false);
5963
5964 // As async_sock_low_lvl_packet_send_paced() says, we cannot reuse ack's pointed-to-object. Make new one.
5965 ack = Low_lvl_packet::create_uninit_packet<Ack_packet>(get_logger());
5966 ack->m_rcv_wnd = rcv_wnd; // Advertise receive window. @todo Code reuse?
5967
5968 size_est_so_far = sizeof(Low_lvl_packet::rcv_wnd_t);
5969 }
5970
5971 // Add the acknowledgment to the current ACK.
5972
5973 // First sequence number in packet.
5974 const Sequence_number& seq_num = ind_ack->m_seq_num;
5975
5976 // ACK delay for this individual acknowledgment. Compute it; then validate it.
5977
5978 /* @todo In low_lvl_io, we perform packet pacing but currently choose to assign a value of
5979 * 0 bytes to an ACK. That is, while we do preserve the order of DATA and ACK packets -- if
5980 * both happen to be in the outgoing stream -- we do not delay the sending of the ACK once it is
5981 * the next packet to be sent out. However, even so, an ACK's sending may be delayed by the
5982 * pacing applied to DATA packets intermixed with it. Therefore the ACK delay measurement we
5983 * take here may be incorrect (too low) in that case. This can cause overestimated RTTs on the
5984 * sender's side. The to-do is to correct the ACK delay value in a given ACK by adding the
5985 * pacing delay (if any) of the ACK to the individual ACK delays within it. Conceptually this
5986 * is similar to the sent_when value being set when choosing to send a DATA packet and then
5987 * corrected in the pacing module later.
5988 *
5989 * This to-do is not important until we in practice start mixing sending and receiving at the
5990 * application layer... but still -- it's worth knowing that there is a design bug here. */
5991
5992 // Shouldn't be negative.
5993 Fine_duration delay = time_now - ind_ack->m_received_when;
5994 if (delay.count() < 0)
5995 {
5996 /* This is pretty crazy and should not happen according to the documented properties of
5997 * Fine_clock. No need to crash or disconnect though, so do our best.... */
5998 FLOW_LOG_WARNING("Delayed [ACK] timer [" << sock << "] triggered; "
5999 "delay for packet [" << seq_num << ", ...) is "
6000 "negative: [" << delay << "]; using zero.");
6001 delay = Fine_duration::zero();
6002 }
6003
6004 /* Convert whatever resolution Fine_clock uses to milliseconds because we want to keep that
6005 * field of the ACK sized according to how the low-level packet handling code prefers it for
6006 * efficiency. Overflow is possible. Use duration_cast (truncation) instead of rounding,
6007 * because in very low-latency situations the extra microseconds rounding up can cause a
6008 * negative RTT calculation on the other side (when this ACK is received). The ACK handling
6009 * code will just clamp the value at zero on the other side, but let's try to avoid it anyway
6010 * on this side.
6011 *
6012 * @todo This comment appears to be outdated, as Ack_delay_time_unit is just Fine_duration.
6013 * Look into this. */
6014 Ack_packet::Ack_delay_time_unit pkt_delay = duration_cast<Ack_packet::Ack_delay_time_unit>(delay);
6015 const Ack_packet::ack_delay_t MAX_DELAY_VALUE = numeric_limits<Ack_packet::ack_delay_t>::max();
6016 if (uint64_t(pkt_delay.count()) > uint64_t(MAX_DELAY_VALUE))
6017 {
6018 /* This is pretty crazy though not 100% impossible if the CPU is really loaded, or some other
6019 * shenanigans. So do our best.... */
6020 FLOW_LOG_WARNING("Delayed [ACK] timer [" << sock << "] triggered; "
6021 "delay for packet [" << seq_num << ", ...) is [" << pkt_delay << "]; overflow; "
6022 "using max value [" << MAX_DELAY_VALUE << "] units.");
6023 // @todo Maybe there's a more sane ceiling value than the absolute maximum?
6024 pkt_delay = Ack_packet::Ack_delay_time_unit(MAX_DELAY_VALUE);
6025 }
6026
6027 // Finally write the individual acknowledgment.
6028 if (sock->rexmit_on())
6029 {
6030 ack->m_rcv_acked_packets_rexmit_on_out.push_back
6032 ind_ack->m_rexmit_id,
6033 Ack_packet::ack_delay_t(pkt_delay.count())));
6034 }
6035 else
6036 {
6037 ack->m_rcv_acked_packets_rexmit_off_out.push_back
6039 Ack_packet::ack_delay_t(pkt_delay.count())));
6040 }
6041 size_est_so_far += size_est_inc;
6042
6043 // Register one packet of unknown size that we've packaged into an ACK and will send ASAP.
6044 sock->m_rcv_stats.sent_individual_ack();
6045 } // for (ind_ack : pending_acks)
6046
6047 // Don't forget the last non-full ACK, if any.
6048 if ((size_est_so_far != 0)
6049 && (!async_sock_low_lvl_packet_send_or_close_immediately(sock,
6050 Low_lvl_packet::ptr_cast(ack),
6051 defer_delta_check)))
6052 {
6053 return;
6054 }
6055
6056 // Register one ACK packet we will send ASAP.
6057 sock->m_rcv_stats.sent_low_lvl_ack_packet(false);
6058
6059 // All serialized to be sent; the timer can start again when a packet must be acknowledged.
6060 pending_acks.clear();
6061
6062 // Register that now there are 0 pending individual acks.
6063 sock->m_rcv_stats.current_pending_to_ack_packets(0);
6064
6065 // Note that all the ACKs are sent off outside this handler and only once UDP is ready.
6066} // Node::async_low_lvl_ack_send()
6067
6068Socket_id Node::socket_id(Peer_socket::Const_ptr sock) // Static.
6069{
6070 // We are in thread W.
6071 return Socket_id{ sock->remote_endpoint(), sock->local_port() };
6072}
6073
6074bool Node::snd_deqable(Peer_socket::Const_ptr sock) const
6075{
6076 // There is stuff to send if there is anything to retransmit or at least new user data.
6077 return !(sock->m_snd_rexmit_q.empty() && sock->m_snd_buf.empty());
6078}
6079
6080bool Node::snd_buf_enqable(Peer_socket::Const_ptr sock) const
6081{
6082 // See doc comment for rationale for keeping this in a function.
6083
6084 /* Since 1 block can be at most max-block-size, if that much space is free, then definitely one
6085 * can enqueue onto m_snd_buf. Note that if less than max-block-size space is free, it would
6086 * still be possible to enqueue a smaller block; yet we still return false. We are intentionally
6087 * conservative, because we are guaranteeing ANY one enqueueing will work. More importantly, this
6088 * guarantees our Socket_buffer scheme (see class doc header) to guarantee constant-time
6089 * dequeueing will work.
6090 *
6091 * We're not overly conservative, either; i.e., no one is likely to complain this policy is too
6092 * stingy. */
6093 return sock->m_snd_buf.data_size() + sock->max_block_size()
6094 <= sock->opt(sock->m_opts.m_st_snd_buf_max_size);
6095}
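/* Worked example (hypothetical values): with m_st_snd_buf_max_size = 65536 and
 * max_block_size() = 1024, the test above allows enqueueing while data_size() <= 64512;
 * a Send buffer holding 64512 bytes still accepts one more (possibly full) block, whereas
 * at 64513 bytes it refuses even a 1-byte block, per the conservative policy described. */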
6096
6097bool Node::rcv_buf_deqable(Peer_socket::Const_ptr sock) const
6098{
6099 // See doc comment for rationale for keeping this in a function.
6100 return !sock->m_rcv_buf.empty();
6101}
6102
6103void Node::sock_set_int_state(Peer_socket::Ptr sock, Peer_socket::Int_state new_state)
6104{
6105 // We are in thread W.
6106
6107 FLOW_LOG_TRACE('[' << sock << "] changing state from [" <<
6108 sock->m_int_state << "] to [" << new_state << "].");
6109 sock->m_int_state = new_state;
6110}
6111
6112void Node::sock_set_state(Peer_socket::Ptr sock, Peer_socket::State state, Peer_socket::Open_sub_state open_sub_state)
6113{
6114 Peer_socket::Lock_guard lock(sock->m_mutex);
6115
6116 // @todo Add TRACE logging.
6117
6118 sock->m_state = state;
6119 if (state == Peer_socket::State::S_OPEN)
6120 {
6121 sock->m_open_sub_state = open_sub_state;
6122 }
6123 else // (state == Peer_socket::State::S_CLOSED)
6124 {
6125 /* Important convention: S_CLOSED means socket is permanently incapable of sending or
6126 * receiving more data. At this point the originating Node removes the socket from its internal
6127 * structures. Therefore, the Node itself may even go away -- while this Peer_socket still
6128 * exists. Since we use shared_ptr when giving our socket objects, that's fine -- but we want to
6129 * avoid returning an invalid Node* in node(). So, when S_CLOSED, sock->m_node = 0. */
6130 sock->m_node = 0;
6131 }
6132}
6133
6134void Node::sock_disconnect_detected(Peer_socket::Ptr sock, const Error_code& disconnect_cause, bool close)
6135{
6136 Peer_socket::Lock_guard lock(sock->m_mutex);
6137
6138 sock->m_disconnect_cause = disconnect_cause;
6139
6140 if (close)
6141 {
6142 // DONE.
6143 sock_set_state(sock, Peer_socket::State::S_CLOSED); // Reentrant mutex => OK.
6144 sock_free_memory(sock);
6145 }
6146 else
6147 {
6148 // This socket is screwed, but let user get any remaining buffer data out.
6149
6150 // Reentrant mutex => OK:
6151 sock_set_state(sock, Peer_socket::State::S_OPEN, Peer_socket::Open_sub_state::S_DISCONNECTING);
6152 }
6153}
6154
6155void Node::sock_disconnect_completed(Peer_socket::Ptr sock)
6156{
6157 Peer_socket::Lock_guard lock(sock->m_mutex);
6158
6159 // Sanity-check pre-conditions. (Basically ensure disconnect_detected(err_code, false) was previously called.)
6160 assert(sock->m_disconnect_cause);
6161 assert((sock->m_state == Peer_socket::State::S_OPEN)
6162 && (sock->m_open_sub_state == Peer_socket::Open_sub_state::S_DISCONNECTING));
6163
6164 sock_set_state(sock, Peer_socket::State::S_CLOSED); // Reentrant mutex => OK.
6165 sock_free_memory(sock);
6166}
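/* Call-sequence sketch, as implied by the pre-condition assertions above: a non-fatal break
 * first goes through sock_disconnect_detected(sock, err, false), leaving the socket
 * S_OPEN+S_DISCONNECTING with its Receive buffer intact for the user to drain; once that
 * buffer empties, receive_emptied_rcv_buf_while_disconnecting() ultimately leads here to
 * sock_disconnect_completed(), which flips the public state to S_CLOSED and frees the bulk
 * storage. A fatal break instead calls sock_disconnect_detected(sock, err, true), which
 * performs both steps at once. */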
6167
6168void Node::sock_free_memory(Peer_socket::Ptr sock)
6169{
6170 sock->m_rcv_buf.clear();
6171 sock->m_snd_buf.clear();
6172 sock->m_rcv_packets_with_gaps.clear();
6173 sock->m_rcv_reassembly_q_data_size = 0;
6174 sock->m_snd_flying_pkts_by_sent_when.clear();
6175 sock->m_snd_flying_pkts_by_seq_num.clear();
6176 sock->m_snd_rexmit_q.clear();
6177 sock->m_serialized_metadata.make_zero(); // clear() does not deallocate, but this does.
6178 sock->m_rcv_syn_rcvd_data_q.clear();
6179 sock->m_rcv_pending_acks.clear();
6180 sock->m_rcv_acked_packets.clear();
6181 sock->m_snd_pacing_data.m_packet_q.clear();
6182
6183 /* Destroy memory stored in m_snd_cong_ctl which may be non-O(1). This is a little questionable;
6184 * maybe should leave it to destructor? However since we store it as a pointer and are to free
6185 * any "significant" memory, and this may be significant, we may as well just delete it. */
6186 sock->m_snd_cong_ctl.reset();
6187 // Same deal.
6188 sock->m_snd_bandwidth_estimator.reset();
6189}
6190
6191bool Node::sock_set_options(Peer_socket::Ptr sock, const Peer_socket_options& opts, Error_code* err_code)
6192{
6193 // We are in thread U != W.
6194
6195 if (!running())
6196 {
6198 return false;
6199 }
6200 // else
6201
6202 /* We just want to replace m_opts with a copy of opts. First validate opts (including with
6203 * respect to m_opts, and also check for invalid values and such), then copy it over. */
6204
6205 // Log new options values. A bit computationally expensive so just use TRACE for now. @todo Reconsider?
6206 FLOW_LOG_TRACE("For [" << sock << "]:\n\n" << opts);
6207
6208 // Will be writing sock->m_opts if all goes well, so must acquire exclusive ownership of m_opts.
6209 Peer_socket::Options_lock lock(sock->m_opts_mutex);
6210
6211 /* Validate the new option set (including ensuring they're not changing static options' values).
6212 * Note that an explicit pre-condition of this method is that m_opts_mutex is locked if needed,
6213 * hence the above locking statement is not below this call. */
6214 if (!sock_validate_options(opts, &sock->m_opts, err_code))
6215 {
6216 return false;
6217 }
6218 // else
6219
6220 // Boo-ya.
6221 sock->m_opts = opts;
6222 return true;
6223} // Node::sock_set_options()
6224
6225/// @cond
6226/* -^- Doxygen, please ignore the following. (Don't want docs generated for temp macro; this is more maintainable
6227 * than specifying the macro name to omit it, in Doxygen-config EXCLUDE_SYMBOLS.) */
6228
6229/* Normally I try to avoid macro cleverness, but in this case to get a nice printout we need the
6230 * # technique, and also this eliminates quite a bit of repetition. So let's.... */
6231#define VALIDATE_STATIC_OPTION(ARG_opt) \
6232 validate_static_option(opts.ARG_opt, prev_opts->ARG_opt, #ARG_opt, err_code)
6233#define VALIDATE_CHECK(ARG_check) \
6234 validate_option_check(ARG_check, #ARG_check, err_code)
6235
6236// -v- Doxygen, please stop ignoring.
6237/// @endcond
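/* For reference, a call such as VALIDATE_STATIC_OPTION(m_st_max_block_size) expands to:
 *   validate_static_option(opts.m_st_max_block_size, prev_opts->m_st_max_block_size,
 *                          "m_st_max_block_size", err_code)
 * and VALIDATE_CHECK(opts.m_st_max_block_size >= 512) expands to:
 *   validate_option_check(opts.m_st_max_block_size >= 512,
 *                         "opts.m_st_max_block_size >= 512", err_code)
 * i.e., the # stringization supplies the human-readable option name or condition used in the
 * resulting error report. */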
6238
6239bool Node::sock_validate_options(const Peer_socket_options& opts,
6240 const Peer_socket_options* prev_opts,
6241 Error_code* err_code) const
6242{
6243 /* We are to validate the given set of per-socket option values. If prev_opts, then the context
6244 * is that an already-existing socket (with already-set options) is being called with
6245 * set_options(), i.e. user is modifying options for an existing socket. In that case we must
6246 * ensure that no static (unchangeable) option's value would be changed by this.
6247 *
6248 * If not prev_opts, then the per-socket options within the global per-Node Node_options object
6249 * are being changed. Per-socket options in that context are always dynamic, since if they were
6250 * static, there'd be no point in making the per-socket in the first place. So in that case that
6251 * static option check is to be skipped.
6252 *
6253 * Finally, we must check for individual integrity of the specified values (including consistency
6254 * with other option values). */
6255
6256 using boost::chrono::seconds;
6257 using std::numeric_limits;
6258
6259 // We are in thread U != W or in thread W.
6260
6261 if (prev_opts)
6262 {
6263 /* As explained above, they're trying to change an existing socket's option values. Ensure
6264 * all the static options' values are the same in opts and prev_opts. */
6265
6266 // Explicitly documented pre-condition is that *prev_opts is already locked if necessary. So don't lock.
6267
6268 const bool static_ok
6269 = VALIDATE_STATIC_OPTION(m_st_max_block_size) &&
6270 VALIDATE_STATIC_OPTION(m_st_connect_retransmit_period) &&
6271 VALIDATE_STATIC_OPTION(m_st_connect_retransmit_timeout) &&
6272 VALIDATE_STATIC_OPTION(m_st_snd_buf_max_size) &&
6273 VALIDATE_STATIC_OPTION(m_st_rcv_buf_max_size) &&
6274 VALIDATE_STATIC_OPTION(m_st_rcv_flow_control_on) &&
6275 VALIDATE_STATIC_OPTION(m_st_rcv_buf_max_size_slack_percent) &&
6276 VALIDATE_STATIC_OPTION(m_st_rcv_buf_max_size_to_advertise_percent) &&
6277 VALIDATE_STATIC_OPTION(m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent) &&
6278 VALIDATE_STATIC_OPTION(m_st_delayed_ack_timer_period) &&
6279 VALIDATE_STATIC_OPTION(m_st_max_full_blocks_before_ack_send) &&
6280 VALIDATE_STATIC_OPTION(m_st_rexmit_on) &&
6281 VALIDATE_STATIC_OPTION(m_st_max_rexmissions_per_packet) &&
6282 VALIDATE_STATIC_OPTION(m_st_init_drop_timeout) &&
6283 VALIDATE_STATIC_OPTION(m_st_snd_pacing_enabled) &&
6284 VALIDATE_STATIC_OPTION(m_st_snd_bandwidth_est_sample_period_floor) &&
6285 VALIDATE_STATIC_OPTION(m_st_cong_ctl_strategy) &&
6286 VALIDATE_STATIC_OPTION(m_st_cong_ctl_init_cong_wnd_blocks) &&
6287 VALIDATE_STATIC_OPTION(m_st_cong_ctl_max_cong_wnd_blocks) &&
6288 VALIDATE_STATIC_OPTION(m_st_cong_ctl_cong_wnd_on_drop_timeout_blocks) &&
6289 VALIDATE_STATIC_OPTION(m_st_cong_ctl_classic_wnd_decay_percent) &&
6290 VALIDATE_STATIC_OPTION(m_st_drop_packet_exactly_after_drop_timeout) &&
6291 VALIDATE_STATIC_OPTION(m_st_drop_all_on_drop_timeout) &&
6292 VALIDATE_STATIC_OPTION(m_st_out_of_order_ack_restarts_drop_timer);
6293
6294 if (!static_ok)
6295 {
6296 // validate_static_option() has set *err_code.
6297 return false;
6298 }
6299 // else
6300 } // if (prev_opts)
6301
6302 // Now sanity-check the values themselves. @todo Comment and reconsider these?
6303 const bool checks_ok
6304 = VALIDATE_CHECK(opts.m_st_max_block_size >= 512) &&
6305 VALIDATE_CHECK(opts.m_st_connect_retransmit_period.count() > 0) &&
6306 VALIDATE_CHECK(opts.m_st_connect_retransmit_timeout.count() > 0) &&
6307 VALIDATE_CHECK(opts.m_st_snd_buf_max_size >= 4 * opts.m_st_max_block_size) &&
6308 VALIDATE_CHECK(opts.m_st_rcv_buf_max_size >= 4 * opts.m_st_max_block_size) &&
6310 VALIDATE_CHECK(opts.m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent >= 100) &&
6311 VALIDATE_CHECK(opts.m_st_delayed_ack_timer_period <= seconds(1)) &&
6312 VALIDATE_CHECK(util::in_closed_range(Fine_duration::zero(),
6314 Fine_duration(seconds(1)))) &&
6315 VALIDATE_CHECK(opts.m_st_max_full_blocks_before_ack_send >= 1) &&
6316 VALIDATE_CHECK(opts.m_st_max_rexmissions_per_packet >= 1) &&
6317 VALIDATE_CHECK(opts.m_st_max_rexmissions_per_packet <= numeric_limits<Low_lvl_packet::rexmit_id_t>::max()) &&
6318 VALIDATE_CHECK(opts.m_st_init_drop_timeout.count() > 0) &&
6319 VALIDATE_CHECK(opts.m_st_snd_bandwidth_est_sample_period_floor.count() > 0) &&
6321 VALIDATE_CHECK
6323 VALIDATE_CHECK(opts.m_st_cong_ctl_cong_avoidance_increment_blocks < 20) &&
6324 VALIDATE_CHECK(opts.m_st_cong_ctl_classic_wnd_decay_percent <= 100) &&
6325 VALIDATE_CHECK(util::in_closed_range<size_t>(1, opts.m_st_cong_ctl_cong_wnd_on_drop_timeout_blocks, 10)) &&
6326 VALIDATE_CHECK(opts.m_dyn_drop_timeout_ceiling > 4 * opts.m_st_init_drop_timeout) &&
6327 VALIDATE_CHECK(opts.m_dyn_drop_timeout_backoff_factor >= 1) &&
6328 VALIDATE_CHECK(opts.m_dyn_rcv_wnd_recovery_timer_period.count() > 0);
6329
6330 // On error, validate_option_check() has set *err_code.
6331
6332 return checks_ok;
6333
6334#undef VALIDATE_CHECK
6335#undef VALIDATE_STATIC_OPTION
6336} // Node::sock_validate_options()
6337
6338Peer_socket_info Node::sock_info(Peer_socket::Const_ptr sock)
6339{
6342 using boost::adopt_lock;
6343
6344 // We are in thread U != W.
6345
6346 Peer_socket_info stats;
6347 {
6348 /* WARNING!!! sock->m_mutex is locked, but WE must unlock it before returning! Can't leave that
6349 * to the caller, because we must unlock at a specific point below, right before post()ing
6350 * sock_info_worker() onto thread W. Use a Lock_guard that adopts an already-locked mutex. */
6351 Peer_socket::Lock_guard lock(sock->m_mutex, adopt_lock);
6352
6353 if (!running())
6354 {
6355 /* This is kind of a weird case, in that sock's Node having stopped running is a problem, but
6356 * in this case they just want the socket stats. The only reason we're in this method --
6357 * calling sock->info() did not simply return the stats itself -- is that there was a danger
6358 * thread W might change the stats, while we'd be copying them. Well, if !running() there is no
6359 * danger of that. So we can just: */
6360 sock_load_info_struct(sock, &stats);
6361 return stats;
6362 }
6363 // else
6364
6365 /* Okay -- Node is running and may change stats's source info at any time. Therefore, since we
6366 * do not have a mutex for all that source info, we place a task on W and set up a future as a
6367 * way for it to inform us it's done. This has a certain performance penalty, but that's better
6368 * than having to lock each time we need to modify this source data throughout W's operations.
6369 * Moreover we warned about the performance penalty in the doc header for Peer_socket::info(). */
6370
6371 // We're done -- must unlock so that thread W can do what it wants to with sock.
6372 } // lock
6373
6374 // Load this onto thread W boost.asio work queue. We don't return until it's done, so [&] is OK.
6375 asio_exec_ctx_post(get_logger(), &m_task_engine, Synchronicity::S_ASYNC_AND_AWAIT_CONCURRENT_COMPLETION,
6376 [&]() { sock_load_info_struct(sock, &stats); });
6377 // If got here, the task has completed in thread W and signaled us to that effect.
6378
6379 return stats;
6380} // Node::sock_info()
6381
6382void Node::sock_load_info_struct(Peer_socket::Const_ptr sock, Peer_socket_info* stats) const
6383{
6384 using boost::lexical_cast;
6385 using std::string;
6386
6387 // We are in thread W.
6388
6389 stats->m_rcv = sock->m_rcv_stats.stats();
6390 stats->m_snd = sock->m_snd_stats.stats();
6391
6392 // @todo This is more suitable for the non-existent Node_info and Node::load_info_struct(). (It's not per-socket.)
6393 stats->m_low_lvl_max_buf_size = m_low_lvl_max_buf_size;
6394
6395 stats->m_int_state_str = lexical_cast<string>(sock->m_int_state);
6396 stats->m_is_active_connect = sock->m_active_connect;
6397 // No need to lock: no thread but W can write to it.
6398 stats->m_disconnect_cause = sock->m_disconnect_cause;
6399
6400 {
6401 // Gotta lock, as Receive and Send buffers can be modified at any time by thread U at least.
6402 Peer_socket::Lock_guard lock(sock->m_mutex);
6403 stats->m_rcv_buf_size = sock->m_rcv_buf.data_size();
6404 stats->m_snd_buf_size = sock->m_snd_buf.data_size();
6405 }
6406
6407 stats->m_rcv_wnd = sock_rcv_wnd(sock);
6408 stats->m_rcv_wnd_last_advertised = sock->m_rcv_last_sent_rcv_wnd;
6409 stats->m_rcv_reassembly_q_data_size = sock->m_rcv_reassembly_q_data_size;
6410 stats->m_rcv_packets_with_gaps = sock->m_rcv_packets_with_gaps.size();
6411 stats->m_rcv_syn_rcvd_data_cumulative_size
6412 = sock->m_rcv_syn_rcvd_data_q.empty() ? 0 : sock->m_rcv_syn_rcvd_data_cumulative_size;
6413 stats->m_rcv_syn_rcvd_data_q_size = sock->m_rcv_syn_rcvd_data_q.size();
6414
6415 stats->m_snd_rcv_wnd = sock->m_snd_remote_rcv_wnd;
6416 stats->m_snd_cong_ctl_in_flight_bytes = sock->m_snd_flying_bytes;
6417 stats->m_snd_cong_ctl_in_flight_count = sock->m_snd_flying_pkts_by_sent_when.size();
6418 stats->m_snd_cong_ctl_wnd_bytes = sock->m_snd_cong_ctl->congestion_window_bytes();
6419 stats->m_snd_cong_ctl_wnd_count_approx = stats->m_snd_cong_ctl_wnd_bytes / sock->max_block_size();
6420 stats->m_snd_smoothed_round_trip_time = sock->m_snd_smoothed_round_trip_time;
6421 stats->m_snd_round_trip_time_variance = sock->m_round_trip_time_variance;
6422 stats->m_snd_drop_timeout = sock->m_snd_drop_timeout;
6423 stats->m_snd_pacing_packet_q_size = sock->m_snd_pacing_data.m_packet_q.size();
6424 stats->m_snd_pacing_bytes_allowed_this_slice = sock->m_snd_pacing_data.m_bytes_allowed_this_slice;
6425 stats->m_snd_pacing_slice_start = sock->m_snd_pacing_data.m_slice_start;
6426 stats->m_snd_pacing_slice_period = sock->m_snd_pacing_data.m_slice_period;
6428 = util::to_mbit_per_sec<Send_bandwidth_estimator::Time_unit>
6429 (sock->m_snd_bandwidth_estimator->bandwidth_bytes_per_time());
6430
6431 stats->m_sock_opts = sock->opt(sock->m_opts); // Lock and copy... probably not the fastest thing ever....
6432 stats->m_node_opts = opt(m_opts); // Ditto.
6433}
6434
6435void Node::sock_log_detail(Peer_socket::Const_ptr sock) const
6436{
6437 // We are in thread W.
6438
6439 /* We are to log details about the given socket. Since the idea is that this would be called on
6440 * the order of at most once or twice a second, we can be as verbose as we think is useful without
6441 * (too much) concern for performance. */
6442
6443 Peer_socket_info stats;
6444 sock_load_info_struct(sock, &stats); // This involves some copying, but, again, we are not too concerned with speed.
6445
6446 FLOW_LOG_INFO("[=== Socket state for [" << sock << "]. ===\n" << stats);
6447
6448 // Log receive and send windows details. Force the logging of the most verbose possible amount of info.
6449 log_snd_window(sock, true);
6450 log_rcv_window(sock, true);
6451 // @todo Should this be inside Peer_socket_info also?
6452
6453 FLOW_LOG_INFO("=== Socket state for [" << sock << "]. ===]");
6454} // Node::sock_log_detail()
6455
6456void Node::advance_seq_num(Sequence_number* seq_num, boost::shared_ptr<const Data_packet> data) // Static.
6457{
6458 /* We just need to increment *seq_num, which points to the start of the data in `data`,
6459 * to a value that points to the data just past the end of the data in `data`. Why is this in a
6460 * separate method? Answer: We may want to change the mapping from sequence number to byte of data. In
6461 * particular the mapping can be one-to-one, as in TCP. Or it can be one sequence number to all bytes in a
6462 * particular packet, which I've seen in certain lesser known custom protocols. This allows us to
6463 * (hopefully) change the code in one place. */
6464
6465 advance_seq_num(seq_num, data->m_data.size());
6466} // Node::advance_seq_num()
6467
6468void Node::advance_seq_num(Sequence_number* seq_num, size_t data_size)
6469{
6470 /* For now go with TCP's convention (one byte to one sequence number, no gaps). While we deal
6471 * with blocks, instead of streams, this may complicate the math a bit and use more sequence
6472 * number space (faster wrapping). However, it would make it easier to adapt the algorithms
6473 * when we move to byte streams; and we currently use a sequence number so large that wrapping
6474 * is impossible. Update: we have moved to streams. */
6475 *seq_num += data_size;
6476}
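/* Illustrative example of the convention above: if *seq_num currently denotes 1000 and the
 * packet carries 500 bytes of payload, advance_seq_num() leaves *seq_num at 1500 -- one
 * sequence number per byte, so the next in-order datum is expected to begin at 1500. */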
6477
6478template<typename Packet_map_iter>
6479void Node::get_seq_num_range(const Packet_map_iter& packet_it,
6480 Sequence_number* seq_num_start, Sequence_number* seq_num_end) // Static.
6481{
6482 const Sequence_number& seq_num_start_cref = packet_it->first;
6483 if (seq_num_start)
6484 {
6485 *seq_num_start = seq_num_start_cref;
6486 }
6487 if (seq_num_end)
6488 {
6489 *seq_num_end = seq_num_start_cref;
6490 advance_seq_num(seq_num_end, packet_it->second->m_size);
6491 }
6492}
6493
6494Peer_socket::order_num_t Node::sock_get_new_snd_order_num(Peer_socket::Ptr sock) // Static.
6495{
6496 // Since m_snd_last_order_num starts at 0, this ensures 0 is reserved, as advertised.
6497 return ++sock->m_snd_last_order_num;
6498}
6499
6500Peer_socket* Node::sock_create(const Peer_socket_options& opts)
6501{
6502 // Just make a regular net_flow::Peer_socket.
6503 return sock_create_forward_plus_ctor_args<Peer_socket>(opts);
6504}
6505
6506// Free implementations.
6507
6508std::ostream& operator<<(std::ostream& os, const Peer_socket* sock)
6509{
6510 return
6511 sock
6512 ? (os
6513 << "NetFlow_socket "
6514 << "[" << sock->remote_endpoint() << "]<=>[NetFlow [:" << sock->local_port() << "]] "
6515 "@" << static_cast<const void*>(sock))
6516 : (os << "NetFlow_socket@null");
6517}
6518
6519/// @cond
6520/* -^- Doxygen, please ignore the following. (Don't want docs generated for temp macro; this is more maintainable
6521 * than specifying the macro name to omit it, in Doxygen-config EXCLUDE_SYMBOLS.) */
6522
6523// That's right, I did this. Wanna fight about it?
6524#define STATE_TO_CASE_STATEMENT(ARG_state) \
6525 case Peer_socket::Int_state::S_##ARG_state: \
6526 return os << #ARG_state
6527
6528// -v- Doxygen, please stop ignoring.
6529/// @endcond
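/* For reference, STATE_TO_CASE_STATEMENT(ESTABLISHED) in the switch below expands to:
 *   case Peer_socket::Int_state::S_ESTABLISHED: return os << "ESTABLISHED";
 * so each enumerator prints under its name minus the S_ prefix. */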
6530
6531std::ostream& operator<<(std::ostream& os, Peer_socket::Int_state state)
6532{
6533 switch (state)
6534 {
6535 STATE_TO_CASE_STATEMENT(CLOSED);
6536 STATE_TO_CASE_STATEMENT(SYN_SENT);
6537 STATE_TO_CASE_STATEMENT(SYN_RCVD);
6538 STATE_TO_CASE_STATEMENT(ESTABLISHED);
6539 }
6540 return os;
6541#undef STATE_TO_CASE_STATEMENT
6542}
6543
6544} // namespace flow::net_flow
const Component & get_log_component() const
Returns reference to the stored Component object, particularly as many FLOW_LOG_*() macros expect.
Definition: log.cpp:229
Logger * get_logger() const
Returns the stored Logger pointer, particularly as many FLOW_LOG_*() macros expect.
Definition: log.cpp:224
Interface that the user should implement, passing the implementing Logger into logging classes (Flow'...
Definition: log.hpp:1291
static Congestion_control_strategy * create_strategy(Strategy_choice strategy_choice, log::Logger *logger_ptr, Peer_socket::Const_ptr sock)
Factory method that, given an enum identifying the desired strategy, allocates the appropriate Conges...
Definition: cong_ctl.cpp:101
static Ptr create_drop_timer(log::Logger *logger_ptr, util::Task_engine *node_task_engine, Fine_duration *sock_drop_timeout, Peer_socket::Const_ptr &&sock, const Function< void(const Error_code &err_code)> &timer_failure, const Function< void(bool drop_all_packets)> &timer_fired)
Constructs Drop_timer and returns a ref-counted pointer wrapping it.
Definition: drop_timer.cpp:28
@ S_PEER_SOCKET_WRITABLE
Event type specifying the condition of interest wherein a target Peer_socket sock is such that callin...
@ S_PEER_SOCKET_READABLE
Event type specifying the condition of interest wherein a target Peer_socket sock is such that callin...
An object of this class is a single Flow-protocol networking node, in the sense that: (1) it has a di...
Definition: node.hpp:937
void snd_flying_pkts_updated(Peer_socket::Ptr sock, Peer_socket::Sent_pkt_ordered_by_when_const_iter pkt_begin, const Peer_socket::Sent_pkt_ordered_by_when_const_iter &pkt_end, bool added)
Updates Peer_socket::m_snd_flying_bytes according to an operation (add packets, remove packets) calle...
bool categorize_individual_ack(const Socket_id &socket_id, Peer_socket::Ptr sock, Ack_packet::Individual_ack::Const_ptr ack, bool *dupe_or_late, Peer_socket::Sent_pkt_ordered_by_when_iter *acked_pkt_it)
Helper of perform_accumulated_on_recv_tasks() that categorizes the given accumulated individual ackno...
void handle_data_to_established(const Socket_id &socket_id, Peer_socket::Ptr sock, boost::shared_ptr< Data_packet > packet, bool syn_rcvd_qd_packet)
Handles a just-deserialized, just-demultiplexed, low-level DATA packet delivered to the given peer so...
bool sock_is_writable(const boost::any &sock_as_any) const
Returns true if and only if calling sock->send() with at least some arguments would return either non...
Peer_socket_info sock_info(Peer_socket::Const_ptr sock)
Implementation of sock->info() for socket sock in all cases except when sock->state() == Peer_socket:...
void receive_wnd_updated(Peer_socket::Ptr sock)
Placed by receive() onto W if it has dequeued data from Receive buffer and given it to the user,...
void sock_track_new_data_after_gap_rexmit_off(Peer_socket::Ptr sock, boost::shared_ptr< const Data_packet > packet, size_t data_size, bool *slide, size_t *slide_size)
Helper for handle_data_to_established() that aims to register the given DATA packet as an out-of-orde...
bool sock_data_to_reassembly_q_unless_overflow(Peer_socket::Ptr sock, boost::shared_ptr< Data_packet > packet)
Helper for handle_data_to_established() that aims to register the given DATA packet as an out-of-orde...
static bool ensure_sock_open(Socket_ptr sock, Error_code *err_code)
Helper method that checks whether the given Peer_socket or Server_socket is CLOSED; if so,...
Definition: node.hpp:4141
void send_worker(Peer_socket::Ptr sock, bool defer_delta_check)
Thread W implemention of send(): synchronously or asynchronously send the contents of sock->m_snd_buf...
void handle_accumulated_acks(const Socket_id &socket_id, Peer_socket::Ptr sock)
Helper of perform_accumulated_on_recv_tasks() that handles any incoming acknowledgments and rcv_wnd u...
void async_rcv_wnd_recovery(Peer_socket::Ptr sock, size_t rcv_wnd)
receive_wnd_updated() helper that continues rcv_wnd recovery: that is, sends unsolicited ACK with a r...
void log_accumulated_acks(Peer_socket::Const_ptr sock) const
Helper of handle_accumulated_acks() that logs the about-to-be-handled accumulated individual acknowle...
void sock_free_memory(Peer_socket::Ptr sock)
Helper that clears all non-O(1)-space data structures stored inside sock.
void sock_load_info_struct(Peer_socket::Const_ptr sock, Peer_socket_info *stats) const
Given a Peer_socket, copies all stats info (as available via Peer_socket::info()) from various struct...
void log_snd_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging=false) const
Logs TRACE or DATA messages that show the detailed state of the sending sequence number space.
void send_worker_check_state(Peer_socket::Ptr sock)
Helper placed by send() onto W to invoke send_worker() but ensures that the socket has not entered so...
size_t m_low_lvl_max_buf_size
OS-reported m_low_lvl_sock UDP receive buffer maximum size, obtained right after we OS-set that setti...
Definition: node.hpp:3771
Non_blocking_func_ret_type sync_op(typename Socket::Ptr sock, const Function< Non_blocking_func_ret_type()> &non_blocking_func, Non_blocking_func_ret_type would_block_ret_val, Event_set::Event_type ev_type, const Fine_time_pt &wait_until, Error_code *err_code)
Implementation of core blocking transfer methods, namely Peer_socket::sync_send(),...
Definition: node.hpp:3977
size_t sock_max_packets_after_unrecvd_packet(Peer_socket::Const_ptr sock) const
Computes and returns the max size for Peer_socket::m_rcv_packets_with_gaps for sock.
Peer_socket::Sent_pkt_ordered_by_when_iter categorize_pkts_as_dropped_on_acks(Peer_socket::Ptr sock, const boost::unordered_set< Peer_socket::order_num_t > &flying_now_acked_pkts)
Helper of perform_accumulated_on_recv_tasks() that determines the range of In-flight packets that sho...
void rcv_get_first_gap_info(Peer_socket::Const_ptr sock, bool *first_gap_exists, Sequence_number *seq_num_after_first_gap)
Helper for handle_data_to_established() that gets simple info about Peer_socket::m_rcv_packets_with_g...
bool snd_deqable(Peer_socket::Const_ptr sock) const
Return true if and only if there are enough data either in Peer_socket::m_snd_rexmit_q of sock (if re...
void cancel_timers(Peer_socket::Ptr sock)
Cancel any timers and scheduled tasks active in the given socket.
void sock_rcv_buf_now_readable(Peer_socket::Ptr sock, bool syn_rcvd_qd_packet)
Helper for handle_data_to_established() that assumes the given's socket Receive buffer is currently r...
void snd_flying_pkts_erase_one(Peer_socket::Ptr sock, Peer_socket::Sent_pkt_ordered_by_when_iter pkt_it)
Erases (for example if considered Acknowledged or Dropped) a packet struct from the "scoreboard" (Pee...
Opt_type opt(const Opt_type &opt_val_ref) const
Obtain a copy of the value of a given option in a thread-safe manner.
Definition: node.hpp:4180
bool sock_validate_options(const Peer_socket_options &opts, const Peer_socket_options *prev_opts, Error_code *err_code) const
Analogous to validate_options() but checks per-socket options instead of per-Node options.
void handle_accumulated_pending_acks(const Socket_id &socket_id, Peer_socket::Ptr sock)
Helper of perform_accumulated_on_recv_tasks() that handles any additional individual outgoing acknowl...
void receive_wnd_recovery_data_received(Peer_socket::Ptr sock)
Pertaining to the async_rcv_wnd_recovery() mechanism, this handles the event that we have received an...
static Peer_socket::order_num_t sock_get_new_snd_order_num(Peer_socket::Ptr sock)
Returns the "order number" to use for Peer_socket::Sent_packet::Sent_when structure corresponding to ...
Peer_socket::Ptr sync_connect_impl(const Remote_endpoint &to, const Fine_duration &max_wait, const boost::asio::const_buffer &serialized_metadata, Error_code *err_code, const Peer_socket_options *opts)
Implementation core of sync_connect*() that gets rid of templated or missing arguments thereof.
size_t max_block_size() const
The maximum number of bytes of user data per received or sent block on connections generated from thi...
Definition: node.cpp:1112
void snd_flying_pkts_push_one(Peer_socket::Ptr sock, const Sequence_number &seq_num, Peer_socket::Sent_packet::Ptr sent_pkt)
Adds a new packet struct (presumably representing packet to be sent shortly) to the "scoreboard" (Pee...
Syn_packet::Ptr create_syn(Peer_socket::Const_ptr sock)
Helper that creates a new SYN packet object to the extent that is suitable for immediately passing to...
void close_abruptly(Peer_socket::Ptr sock, Error_code *err_code)
Implementation of non-blocking sock->close_abruptly() for socket sock in all cases except when sock->...
static void get_seq_num_range(const Packet_map_iter &packet_it, Sequence_number *seq_num_start, Sequence_number *seq_num_end)
Given an iterator into a Peer_socket::Sent_pkt_by_sent_when_map or Peer_socket::Recv_pkt_map,...
Peer_socket::Ptr sync_connect_with_metadata(const Remote_endpoint &to, const boost::chrono::duration< Rep, Period > &max_wait, const boost::asio::const_buffer &serialized_metadata, Error_code *err_code=0, const Peer_socket_options *opts=0)
A combination of sync_connect() and connect_with_metadata() (blocking connect, with supplied metadata...
Definition: node.hpp:3956
Syn_ack_packet::Ptr create_syn_ack(Peer_socket::Const_ptr sock)
Like create_syn() but for SYN_ACK.
virtual Peer_socket * sock_create(const Peer_socket_options &opts)
Internal factory used for ALL Peer_socket objects created by this Node (including subclasses).
bool snd_buf_enqable(Peer_socket::Const_ptr sock) const
Return true if and only if there is enough free space in Peer_socket::m_snd_buf of sock to enqueue an...
bool can_send(Peer_socket::Const_ptr sock) const
Answers the perennial question of congestion and flow control: assuming there is a DATA packet to sen...
void sock_slide_rcv_next_seq_num(Peer_socket::Ptr sock, size_t slide_size, bool reassembly_in_progress)
Helper for handle_data_to_established() that aims to register a set of received DATA packet data as i...
void sock_log_detail(Peer_socket::Const_ptr sock) const
Logs a verbose state report for the given socket.
static void advance_seq_num(Sequence_number *seq_num, boost::shared_ptr< const Data_packet > data)
Assuming *seq_num points to the start of data.m_data, increments *seq_num to point to the datum just ...
void async_low_lvl_ack_send(Peer_socket::Ptr sock, bool defer_delta_check, const Error_code &sys_err_code=Error_code())
Sends a low-level ACK packet, with all accumulated in Peer_socket::m_rcv_pending_acks of sock individ...
static Sequence_number snd_past_last_flying_datum_seq_num(Peer_socket::Const_ptr sock)
Obtain the sequence number for the datum just past the last (latest) In-flight (i....
Peer_socket::Ptr connect(const Remote_endpoint &to, Error_code *err_code=0, const Peer_socket_options *opts=0)
Initiates an active connect to the specified remote Flow server.
void event_set_all_check_delta(bool defer_delta_check)
For each WAITING Event_set within the Node: checks for any events that hold, and if any do hold,...
Definition: event_set.cpp:1129
void serv_peer_socket_closed(Server_socket::Ptr serv, Peer_socket::Ptr sock)
Records that a Server_socket-contained (i.e., currently un-established, or established but not yet ac...
bool rcv_buf_deqable(Peer_socket::Const_ptr sock) const
Return true if and only if there are enough data in Peer_socket::m_rcv_buf of sock to give the user s...
void async_acknowledge_packet(Peer_socket::Ptr sock, const Sequence_number &seq_num, unsigned int rexmit_id, size_t data_size)
Causes an acknowledgment of the given received packet to be included in a future Ack_packet sent to t...
Socket_id_to_socket_map m_socks
The peer-to-peer connections this Node is currently tracking.
Definition: node.hpp:3792
Peer_socket::Options_lock Options_lock
Short-hand for lock that acquires exclusive access to an Options_mutex.
Definition: node.hpp:1439
static Socket_id socket_id(Peer_socket::Const_ptr sock)
Constructs the socket pair (connection ID) for the given socket.
void handle_syn_ack_to_syn_sent(const Socket_id &socket_id, Peer_socket::Ptr sock, boost::shared_ptr< const Syn_ack_packet > syn_ack)
Handles a just-deserialized, just-demultiplexed low-level SYN_ACK packet delivered to the given peer ...
size_t send(Peer_socket::Ptr sock, const Function< size_t(size_t max_data_size)> &snd_buf_feed_func, Error_code *err_code)
Implementation of non-blocking sock->send() for socket sock in all cases except when sock->state() ==...
void sock_set_int_state(Peer_socket::Ptr sock, Peer_socket::Int_state new_state)
Sets internal state of given socket to the given state and logs a TRACE message about it.
bool sock_is_readable(const boost::any &sock_as_any) const
Returns true if and only if calling sock->receive() with at least some arguments would return either ...
bool async_sock_low_lvl_packet_send_or_close_immediately(const Peer_socket::Ptr &sock, Low_lvl_packet::Ptr &&packet, bool defer_delta_check)
Similar to async_sock_low_lvl_packet_send_paced() except it also calls close_connection_immediately(s...
bool sock_data_to_rcv_buf_unless_overflow(Peer_socket::Ptr sock, boost::shared_ptr< Data_packet > packet)
Helper for handle_data_to_established() that aims to pass the payload of the given DATA packet to the...
bool sock_set_options(Peer_socket::Ptr sock, const Peer_socket_options &opts, Error_code *err_code)
Thread W implementation of sock->set_options().
bool running() const
Returns true if and only if the Node is operating.
Definition: node.cpp:420
Port_to_server_map m_servs
The server sockets this Node is currently tracking.
Definition: node.hpp:3798
Event_set::Ev_type_to_socks_map m_sock_events
All sockets that have been detected to be "ready" (by the Event_set doc header definition) at any poi...
Definition: node.hpp:3830
static const uint8_t S_DEFAULT_CONN_METADATA
Type and value to supply as user-supplied metadata in SYN, if user chooses to use [[a]sync_]connect()...
Definition: node.hpp:1403
void setup_drop_timer(const Socket_id &socket_id, Peer_socket::Ptr sock)
Creates a new Drop Timer and saves it to sock->m_snd_drop_timer.
void handle_ack_to_established(Peer_socket::Ptr sock, boost::shared_ptr< const Ack_packet > ack)
Handles a just-deserialized, just-demultiplexed, low-level ACK packet delivered to the given peer soc...
Peer_socket::Ptr sync_connect(const Remote_endpoint &to, const boost::chrono::duration< Rep, Period > &max_wait, Error_code *err_code=0, const Peer_socket_options *opts=0)
The blocking (synchronous) version of connect().
Definition: node.hpp:3967
void handle_syn_ack_to_established(Peer_socket::Ptr sock, boost::shared_ptr< const Syn_ack_packet > syn_ack)
Handles a just-deserialized, just-demultiplexed, duplicate (equal to already-received SYN_ACK) low-le...
void setup_connection_timers(const Socket_id &socket_id, Peer_socket::Ptr sock, bool initial)
Assuming we've just sent SYN or SYN_ACK, sets up an asynchronous scheduled task to fire within some a...
void log_rcv_window(Peer_socket::Const_ptr sock, bool force_verbose_info_logging=false) const
Logs TRACE or DATA messages that show the detailed state of the receiving sequence number space.
size_t sock_rcv_wnd(Peer_socket::Const_ptr sock) const
Computes and returns the currently correct rcv_wnd value; that is the amount of space free in Receive...
void connect_worker(const Remote_endpoint &to, const boost::asio::const_buffer &serialized_metadata, const Peer_socket_options *opts, Peer_socket::Ptr *sock)
Thread W implementation of connect().
bool drop_pkts_on_acks(Peer_socket::Ptr sock, const Peer_socket::Sent_pkt_ordered_by_when_iter &last_dropped_pkt_it, size_t *cong_ctl_dropped_pkts, size_t *cong_ctl_dropped_bytes, size_t *dropped_pkts, size_t *dropped_bytes, std::vector< Peer_socket::order_num_t > *pkts_marked_to_drop)
Helper of perform_accumulated_on_recv_tasks() that acts on the determination made by categorize_pkts_...
static const Peer_socket::Sent_packet::ack_count_t S_MAX_LATER_ACKS_BEFORE_CONSIDERING_DROPPED
For a given unacknowledged sent packet P, the maximum number of times any individual packet with high...
Definition: node.hpp:3686
bool async_low_lvl_syn_ack_ack_send_or_close_immediately(const Peer_socket::Ptr &sock, boost::shared_ptr< const Syn_ack_packet > &syn_ack)
Helper to create, fully fill out, and asynchronously send via async_sock_low_lvl_packet_send_or_close...
Error_code sock_categorize_data_to_established(Peer_socket::Ptr sock, boost::shared_ptr< const Data_packet > packet, bool *dupe, bool *slide, size_t *slide_size)
Helper for handle_data_to_established() that categorizes the DATA packet received as either illegal; ...
void async_sock_low_lvl_rst_send(Peer_socket::Ptr sock)
Sends an RST to the other side of the given socket asynchronously when possible.
void sock_set_state(Peer_socket::Ptr sock, Peer_socket::State state, Peer_socket::Open_sub_state open_sub_state=Peer_socket::Open_sub_state::S_CONNECTED)
Sets Peer_socket::m_state and Peer_socket::m_open_sub_state.
void receive_emptied_rcv_buf_while_disconnecting(Peer_socket::Ptr sock)
Placed by receive() onto W during a graceful close, after the Receive buffer had been emptied by the ...
void sock_disconnect_detected(Peer_socket::Ptr sock, const Error_code &disconnect_cause, bool close)
Records that thread W shows underlying connection is broken (graceful termination,...
size_t receive(Peer_socket::Ptr sock, const Function< size_t()> &rcv_buf_consume_func, Error_code *err_code)
Implementation of non-blocking sock->receive() for socket sock in all cases except when sock->state()...
void handle_connection_rexmit_timer_event(const Socket_id &socket_id, Peer_socket::Ptr sock)
Handles the triggering of the retransmit timer wait set up by setup_connection_timers(); it will re-s...
Node_options m_opts
This Node's global set of options.
Definition: node.hpp:3704
void close_connection_immediately(const Socket_id &socket_id, Peer_socket::Ptr sock, const Error_code &err_code, bool defer_delta_check)
A thread W method that handles the transition of the given socket from OPEN (any sub-state) to CLOSED...
void sock_disconnect_completed(Peer_socket::Ptr sock)
While in S_OPEN+S_DISCONNECTING state (i.e., after beginning a graceful close with sock_disconnect_de...
Fine_duration compute_rtt_on_ack(Peer_socket::Sent_packet::Const_ptr flying_pkt, const Fine_time_pt &time_now, Ack_packet::Individual_ack::Const_ptr ack, const Peer_socket::Sent_packet::Sent_when **sent_when) const
Helper of perform_accumulated_on_recv_tasks() that computes the RTT implied by a given individual ack...
Peer_socket::Ptr connect_with_metadata(const Remote_endpoint &to, const boost::asio::const_buffer &serialized_metadata, Error_code *err_code=0, const Peer_socket_options *opts=0)
Same as connect() but sends, as part of the connection handshake, the user-supplied metadata,...
void new_round_trip_time_sample(Peer_socket::Ptr sock, Fine_duration round_trip_time)
Handles a just-computed new RTT (round trip time) measurement for an individual packet earlier sent: ...
bool ok_to_rexmit_or_close(Peer_socket::Ptr sock, const Peer_socket::Sent_pkt_ordered_by_when_iter &pkt_it, bool defer_delta_check)
Checks whether the given sent packet has been retransmitted the maximum number of allowed times; if s...
util::Task_engine m_task_engine
The main loop engine, functioning in the single-threaded-but-asynchronous callback-based "reactor" st...
Definition: node.hpp:3739
Port_space m_ports
Flow port space for both client and server sockets. All threads may access this.
Definition: node.hpp:3777
void rst_and_close_connection_immediately(const Socket_id &socket_id, Peer_socket::Ptr sock, const Error_code &err_code, bool defer_delta_check)
Asynchronously send RST to the other side of the given socket and close_connection_immediately().
void drop_timer_action(Peer_socket::Ptr sock, bool drop_all_packets)
Handles a Drop_timer (Peer_socket::m_snd_drop_timer) event in ESTABLISHED state by dropping the speci...
A class that keeps a Peer_socket_receive_stats data store, includes methods to conveniently accumulat...
void good_data_accepted_packet(size_t data)
Indicates good_data_packet(), and these data are not dropped (so either delivered into Receive buffer...
void good_data_dropped_reassembly_q_overflow_packet(size_t data)
Indicates good_data_packet(), but these data are dropped due to insufficient Receive reassembly queue...
void presumed_dropped_data(size_t data)
Indicates that one or more unreceived data packets have been considered Dropped due to the number of ...
void good_data_delivered_packet(size_t data)
Indicates good_data_accepted_packet(), and these data are delivered into Receive buffer (either immed...
void late_or_dupe_to_send_ack_packet(size_t data)
Indicates that late_or_dupe_data_packet() and therefore an individual acknowledgment for this packet ...
void total_data_packet(size_t data)
Indicates one DATA packet has been received on socket.
void good_to_send_ack_packet(size_t data)
Indicates that good_data_delivered_packet() and therefore an individual acknowledgment for this packe...
void good_data_packet(size_t data)
Indicates total_data_packet(), and these data are new and acceptable into Receive buffer assuming the...
void error_data_packet(size_t data)
Indicates total_data_packet(), but there is some error about the sequence numbers so that they are no...
void buffer_fed(size_t size)
Indicates the Receive buffer was enqueued with data from network (so its data_size() increased).
void good_data_first_qd_packet(size_t data)
Indicates good_data_accepted_packet(), and these data are, upon receipt, queued for reassembly (not i...
void good_data_dropped_buf_overflow_packet(size_t data)
Indicates good_data_packet(), but these data are dropped due to insufficient Receive buffer space.
void late_or_dupe_data_packet(size_t data)
Indicates total_data_packet(), but the arrived data have either already been received before or (more...
A peer (non-server) socket operating over the Flow network protocol, with optional stream-of-bytes an...
size_t get_connect_metadata(const boost::asio::mutable_buffer &buffer, Error_code *err_code=0) const
Obtains the serialized connect metadata, as supplied by the user during the connection handshake.
size_t max_block_size_multiple(const size_t &opt_val_ref, const unsigned int *inflate_pct_val_ptr=0) const
Returns the smallest multiple of max_block_size() that is >= the given option value,...
bool sync_send_reactor_pattern_impl(const Fine_time_pt &wait_until, Error_code *err_code)
Helper similar to sync_send_impl() but for the null_buffers versions of sync_send().
std::map< Sequence_number, Sent_pkt_ordered_by_when_iter > Sent_pkt_by_seq_num_map
Short-hand for m_snd_flying_pkts_by_seq_num type; see that data member.
bool sync_receive_reactor_pattern_impl(const Fine_time_pt &wait_until, Error_code *err_code)
Helper similar to sync_receive_impl() but for the null_buffers versions of sync_receive().
Remote_endpoint m_remote_endpoint
See remote_endpoint(). Should be set before user gets access to *this and not changed afterwards.
size_t sync_receive(const Mutable_buffer_sequence &target, const boost::chrono::duration< Rep, Period > &max_wait, Error_code *err_code=0)
Blocking (synchronous) version of receive().
util::Blob m_serialized_metadata
If !m_active_connect, this contains the serialized metadata that the user supplied on the other side ...
size_t node_sync_send(const Function< size_t(size_t max_data_size)> &snd_buf_feed_func_or_empty, const Fine_time_pt &wait_until, Error_code *err_code)
This is to sync_send() as node_send() is to send().
Error_code m_disconnect_cause
The Error_code causing disconnection (if one has occurred or is occurring) on this socket; otherwise ...
Peer_socket(log::Logger *logger_ptr, util::Task_engine *task_engine, const Peer_socket_options &opts)
Constructs object; initializes most values to well-defined (0, empty, etc.) but not necessarily meani...
Definition: peer_socket.cpp:37
Sequence_number m_rcv_init_seq_num
The Initial Sequence Number (ISN) contained in the original Syn_packet or Syn_ack_packet we received.
const Remote_endpoint & remote_endpoint() const
Intended other side of the connection (regardless of success, failure, or current State).
State
State of a Peer_socket.
@ S_OPEN
Future reads or writes may be possible. A socket in this state may be Writable or Readable.
@ S_CLOSED
Neither future reads nor writes are possible, AND Node has disowned the Peer_socket.
Open_sub_state
The sub-state of a Peer_socket when state is State::S_OPEN.
@ S_CONNECTED
This Peer_socket was created through a passive connect (Node::accept() and the like) or an active con...
@ S_CONNECTING
This Peer_socket was created through an active connect (Node::connect() and the like),...
@ S_DISCONNECTING
This Peer_socket was created through a passive connect (Node::accept() and the like) or an active con...
size_t sync_send(const Const_buffer_sequence &data, const boost::chrono::duration< Rep, Period > &max_wait, Error_code *err_code=0)
Blocking (synchronous) version of send().
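Together with the sync_receive() entry above, a minimal sketch of the blocking data path, assuming an established Peer_socket::Ptr sock and using single-element vectors of boost::asio buffers as the buffer sequences:
  const char msg[] = "hello";
  flow::Error_code err;
  const std::vector<boost::asio::const_buffer> snd_bufs{boost::asio::buffer(msg, sizeof msg - 1)};
  const size_t n_sent = sock->sync_send(snd_bufs, boost::chrono::milliseconds(500), &err);
  char reply[512];
  const std::vector<boost::asio::mutable_buffer> rcv_bufs{boost::asio::buffer(reply)};
  const size_t n_rcvd = sock->sync_receive(rcv_bufs, boost::chrono::milliseconds(500), &err);
  // A zero return with err == error::Code::S_WAIT_USER_TIMEOUT indicates the user-supplied wait ran out
  // (see that code's entry further below).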
~Peer_socket() override
Boring virtual destructor. Note that deletion is to be handled exclusively via shared_ptr,...
Definition: peer_socket.cpp:77
Error_code disconnect_cause() const
The error code that previously caused state() to become State::S_CLOSED, or success code if state is ...
Sequence_number::seq_num_t order_num_t
Short-hand for order number type. 0 is reserved. Caution: Keep in sync with Drop_timer::packet_id_t.
flow_port_t local_port() const
The local Flow-protocol port chosen by the Node (if active or passive open) or user (if passive open)...
flow_port_t m_local_port
See local_port(). Should be set before user gets access to *this and not changed afterwards.
friend class Send_bandwidth_estimator
Stats modules have const access to all socket internals.
bool set_options(const Peer_socket_options &opts, Error_code *err_code=0)
Dynamically replaces the current options set (options()) with the given options set.
size_t node_send(const Function< size_t(size_t max_data_size)> &snd_buf_feed_func, Error_code *err_code)
Non-template helper for template send() that forwards the send() logic to Node::send().
bool rexmit_on() const
Whether retransmission is enabled on this connection.
size_t node_sync_receive(const Function< size_t()> &rcv_buf_consume_func_or_empty, const Fine_time_pt &wait_until, Error_code *err_code)
This is to sync_receive() as node_receive() is to receive().
util::Lock_guard< Mutex > Lock_guard
Short-hand for RAII lock guard of Mutex.
Int_state
The state of the socket (and the connection from this end's point of view) for the internal state mac...
@ S_ESTABLISHED
Public state is OPEN+CONNECTED; in our opinion the connection is established.
@ S_SYN_SENT
Public state is OPEN+CONNECTING; user requested active connect; we sent SYN and are awaiting response...
@ S_CLOSED
Closed (dead or new) socket.
@ S_SYN_RCVD
Public state is OPEN+CONNECTING; other side requested passive connect via SYN; we sent SYN_ACK and ar...
util::Lock_guard< Options_mutex > Options_lock
Short-hand for lock that acquires exclusive access to an Options_mutex.
void close_abruptly(Error_code *err_code=0)
Acts as if fatal error error::Code::S_USER_CLOSED_ABRUPTLY has been discovered on the connection.
size_t max_block_size() const
The maximum number of bytes of user data per received or sent packet on this connection.
Node * node() const
Node that produced this Peer_socket.
Definition: peer_socket.cpp:95
Peer_socket_info info() const
Returns a structure containing the most up-to-date stats about this connection.
Recvd_pkt_map::iterator Recvd_pkt_iter
Short-hand for m_rcv_packets_with_gaps iterator type.
Mutex m_mutex
This object's mutex.
Sent_pkt_by_sent_when_map::iterator Sent_pkt_ordered_by_when_iter
Short-hand for m_snd_flying_pkts_by_sent_when iterator type.
Sent_pkt_by_seq_num_map::const_iterator Sent_pkt_ordered_by_seq_const_iter
Short-hand for m_snd_flying_pkts_by_seq_num const iterator type.
Peer_socket_info m_info_on_close
This is the final set of stats collected at the time the socket was moved to S_CLOSED m_state.
bool ensure_open(Error_code *err_code) const
Helper that is equivalent to Node::ensure_sock_open(this, err_code).
Sent_pkt_by_sent_when_map::const_iterator Sent_pkt_ordered_by_when_const_iter
Short-hand for m_snd_flying_pkts_by_sent_when const iterator type.
Opt_type opt(const Opt_type &opt_val_ref) const
Analogous to Node::opt() but for per-socket options.
std::string bytes_blocks_str(size_t bytes) const
Helper that, given a byte count, returns a string with that byte count and the number of max_block_si...
Peer_socket_options m_opts
This socket's per-socket set of options.
Peer_socket_options options() const
Copies this socket's option set and returns that copy.
Options_mutex m_opts_mutex
The mutex protecting m_opts.
std::map< Sequence_number, boost::shared_ptr< Received_packet > > Recvd_pkt_map
Short-hand for m_rcv_packets_with_gaps type; see that data member.
Recvd_pkt_map::const_iterator Recvd_pkt_const_iter
Short-hand for m_rcv_packets_with_gaps const iterator type.
Open_sub_state m_open_sub_state
See state().
size_t node_receive(const Function< size_t()> &rcv_buf_consume_func, Error_code *err_code)
Non-template helper for template receive() that forwards the receive() logic to Node::receive().
State state(Open_sub_state *open_sub_state=0) const
Current State of the socket.
Definition: peer_socket.cpp:85
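A short sketch of how this two-level state query composes with the State and Open_sub_state enumerators listed earlier in this index (sock being an existing Peer_socket::Ptr; illustrative only):
  flow::net_flow::Peer_socket::Open_sub_state sub_state;
  if ((sock->state(&sub_state) == flow::net_flow::Peer_socket::State::S_OPEN)
      && (sub_state == flow::net_flow::Peer_socket::Open_sub_state::S_CONNECTED))
  {
    // Fully connected; reads and writes may succeed.
  }
  else
  {
    // Either S_CLOSED, or S_OPEN while still S_CONNECTING or already S_DISCONNECTING.
  }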
void return_port(flow_port_t port, Error_code *err_code)
Return a previously reserved port (of any type).
Definition: port_space.cpp:175
An internal net_flow sequence number identifying a piece of data.
Definition: seq_num.hpp:126
void set_metadata(char num_line_id=0, const Sequence_number &zero_point=Sequence_number(), seq_num_delta_t multiple_size=0)
Updates the full set of metadata (used at least for convenient convention-based logging but not actua...
Definition: seq_num.cpp:268
uint64_t seq_num_t
Raw sequence number type.
Definition: seq_num.hpp:138
Internal net_flow class that implements a socket buffer, as used by Peer_socket for Send and Receive ...
void consume_buf_move(util::Blob *target_buf, size_t max_data_size)
Consumes (removes from the front of the internal byte buffer and returns them to the caller) a byte s...
size_t data_size() const
The total number of bytes of application-layer data stored in this object.
Properties of various container types.
Definition: traits.hpp:43
typename Value_list::const_reverse_iterator Const_reverse_iterator
Type for reverse iterator pointing into an immutable structure of this type.
typename Value_list::reverse_iterator Reverse_iterator
Type for reverse iterator pointing into a mutable structure of this type.
std::pair< Iterator, bool > insert(Value const &key_and_mapped)
Attempts to insert the given key/mapped-value pair into the map.
static Ptr ptr_cast(const From_ptr &ptr_to_cast)
Provides syntactic-sugary way to perform a static_pointer_cast<> from a compatible smart pointer type...
boost::shared_ptr< Peer_socket > Ptr
Short-hand for ref-counted pointer to mutable values of type Target_type::element_type (a-la T*).
Const_target_ptr Const_ptr
Short-hand for ref-counted pointer to immutable values of type Target_type::element_type (a-la T cons...
Similar to ostringstream but allows fast read-only access directly into the std::string being written...
#define FLOW_ERROR_SYS_ERROR_LOG_WARNING()
Logs a warning about the (often errno-based or from a library) error code in sys_err_code.
Definition: error.hpp:269
#define FLOW_ERROR_EXEC_AND_THROW_ON_ERROR(ARG_ret_type, ARG_method_name,...)
Narrow-use macro that implements the error code/exception semantics expected of most public-facing Fl...
Definition: error.hpp:357
#define FLOW_ERROR_LOG_ERROR(ARG_val)
Logs a warning about the given error code using FLOW_LOG_WARNING().
Definition: error.hpp:233
#define FLOW_ERROR_EMIT_ERROR(ARG_val)
Sets *err_code to ARG_val and logs a warning about the error using FLOW_LOG_WARNING().
Definition: error.hpp:202
#define FLOW_ERROR_EMIT_ERROR_LOG_INFO(ARG_val)
Identical to FLOW_ERROR_EMIT_ERROR(), but the message logged has flow::log::Sev::S_INFO severity inst...
Definition: error.hpp:218
#define FLOW_LOG_DATA(ARG_stream_fragment)
Logs a DATA message into flow::log::Logger *get_logger() with flow::log::Component get_log_component(...
Definition: log.hpp:242
#define FLOW_LOG_INFO(ARG_stream_fragment)
Logs an INFO message into flow::log::Logger *get_logger() with flow::log::Component get_log_component...
Definition: log.hpp:197
#define FLOW_LOG_WITHOUT_CHECKING(ARG_sev, ARG_stream_fragment)
Identical to FLOW_LOG_WITH_CHECKING() but foregoes the filter (Logger::should_log()) check.
Definition: log.hpp:532
#define FLOW_LOG_WARNING(ARG_stream_fragment)
Logs a WARNING message into flow::log::Logger *get_logger() with flow::log::Component get_log_compone...
Definition: log.hpp:152
#define FLOW_LOG_WITH_CHECKING(ARG_sev, ARG_stream_fragment)
Logs a message of the specified severity into flow::log::Logger *get_logger() with flow::log::Compone...
Definition: log.hpp:489
#define FLOW_LOG_TRACE_WITHOUT_CHECKING(ARG_stream_fragment)
Logs a TRACE message into flow::log::Logger *get_logger() with flow::log::Component get_log_component...
Definition: log.hpp:354
#define FLOW_LOG_DATA_WITHOUT_CHECKING(ARG_stream_fragment)
Logs a DATA message into flow::log::Logger *get_logger() with flow::log::Component get_log_component(...
Definition: log.hpp:372
#define FLOW_LOG_TRACE(ARG_stream_fragment)
Logs a TRACE message into flow::log::Logger *get_logger() with flow::log::Component get_log_component...
Definition: log.hpp:227
Synchronicity
Enumeration indicating the manner in which asio_exec_ctx_post(), and various boost....
Definition: async_fwd.hpp:223
void asio_exec_ctx_post(log::Logger *logger_ptr, Execution_context *exec_ctx, Synchronicity synchronicity, Task &&task)
An extension of boost.asio's post() and dispatch() free function templates, this free function templa...
Definition: util.hpp:31
bool exec_void_and_throw_on_error(const Func &func, Error_code *err_code, util::String_view context)
Equivalent of exec_and_throw_on_error() for operations with void return type.
Definition: error.hpp:168
@ S_DATA
Message satisfies Sev::S_TRACE description AND contains variable-length structure (like packet,...
@ S_TRACE
Message indicates any condition that may occur with great frequency (thus verbose if logged).
@ S_INFO
Message indicates a not-"bad" condition that is not frequent enough to be of severity Sev::S_TRACE.
@ S_CONN_TIMEOUT
Other side did not complete connection handshake within the allowed time; perhaps no one is listening...
@ S_USER_CLOSED_ABRUPTLY
User code on this side abruptly closed connection; other side may be informed of this.
@ S_CONN_RESET_TOO_MANY_REXMITS
Connection reset because a packet has been retransmitted too many times.
@ S_SEQ_NUM_IMPLIES_CONNECTION_COLLISION
Other side has sent packet with sequence number that implies a port collision between two connections...
@ S_SEQ_NUM_ARITHMETIC_FAILURE
Other side has sent packets with inconsistent sequence numbers.
@ S_CONN_METADATA_TOO_LARGE
During connection user supplied metadata that is too large.
@ S_CANNOT_CONNECT_TO_IP_ANY
Cannot ask to connect to "any" IP address. Use specific IP address.
@ S_WAIT_USER_TIMEOUT
A blocking (sync_) or background-blocking (async_) operation timed out versus user-supplied time limi...
@ S_WAIT_INTERRUPTED
A blocking (sync_) or background-blocking (async_) operation was interrupted, such as by a signal.
@ S_INTERNAL_ERROR_SYSTEM_ERROR_ASIO_TIMER
Internal error: System error: Something went wrong with boost.asio timer subsystem.
@ S_EVENT_SET_CLOSED
Attempted operation on an event set, when that event set was closed.
@ S_INTERNAL_ERROR_PORT_COLLISION
Internal error: Ephemeral port double reservation allowed.
@ S_NODE_NOT_RUNNING
Node not running.
Flow module containing the API and implementation of the Flow network protocol, a TCP-inspired stream...
Definition: node.cpp:25
uint16_t flow_port_t
Logical Flow port type (analogous to a UDP/TCP port in spirit but in no way relevant to UDP/TCP).
const flow_port_t S_PORT_ANY
Special Flow port value used to indicate "invalid port" or "please pick a random available ephemeral ...
Definition: port_space.cpp:33
std::ostream & operator<<(std::ostream &os, const Congestion_control_selector::Strategy_choice &strategy_choice)
Serializes a Peer_socket_options::Congestion_control_strategy_choice enum to a standard ostream – the...
Definition: cong_ctl.cpp:146
bool key_exists(const Container &container, const typename Container::key_type &key)
Returns true if and only if the given key is present at least once in the given associative container...
Definition: util.hpp:276
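By its description, key_exists() packages the usual find()-versus-end() idiom as a predicate; a tiny illustrative fragment (flow::util namespace assumed from the util.hpp location):
  std::map<int, int> m{{1, 10}};
  const bool has_1 = flow::util::key_exists(m, 1); // true; same result as m.find(1) != m.end().
  const bool has_2 = flow::util::key_exists(m, 2); // false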
Auto_cleanup setup_auto_cleanup(const Cleanup_func &func)
Provides a way to execute arbitrary (cleanup) code at the exit of the current block.
Definition: util.hpp:282
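Since Auto_cleanup is a shared_ptr<void> (see its entry further below), the intended pattern is presumably scope-exit cleanup; a hedged sketch, assuming Cleanup_func accepts a no-argument callable:
  {
    const flow::util::Auto_cleanup cleanup
      = flow::util::setup_auto_cleanup([&]() { /* e.g., release a port or undo partial work. */ });
    // ... code that may return early or throw ...
  } // The cleanup callable runs when `cleanup` is destroyed at block exit.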
std::string buffers_dump_string(const Const_buffer_sequence &data, const std::string &indentation, size_t bytes_per_line)
Identical to buffers_to_ostream() but returns an std::string instead of writing to a given ostream.
Definition: util.hpp:481
bool subtract_with_floor(Minuend *minuend, const Subtrahend &subtrahend, const Minuend &floor)
Performs *minuend -= subtrahend, subject to a floor of floor.
Definition: util.hpp:299
Integer ceil_div(Integer dividend, Integer divisor)
Returns the result of the given non-negative integer divided by a positive integer,...
Definition: util.hpp:233
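The two arithmetic helpers directly above pair naturally for unsigned byte/block math; a hedged illustration of their documented semantics (flow::util assumed; the bool returned by subtract_with_floor() is not relied upon here):
  // ceil_div: smallest integer >= dividend/divisor; e.g., 10 bytes in 4-byte blocks -> 3 blocks.
  const size_t n_blocks = flow::util::ceil_div<size_t>(10, 4); // == 3
  // subtract_with_floor: *minuend -= subtrahend but never below the floor, avoiding unsigned wrap-around.
  size_t wnd = 5;
  flow::util::subtract_with_floor(&wnd, size_t(9), size_t(0)); // wnd presumably clamps to 0 instead of underflowing.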
bool in_open_open_range(T const &min_val, T const &val, T const &max_val)
Returns true if and only if the given value is within the given range, given as a (low,...
Definition: util.hpp:270
Scheduled_task_handle schedule_task_from_now(log::Logger *logger_ptr, const Fine_duration &from_now, bool single_threaded, Task_engine *task_engine, Scheduled_task_handler &&task_body_moved)
Schedule the given function to execute in a certain amount of time: A handy wrapper around Timer (asi...
Definition: sched_task.hpp:34
bool scheduled_task_fired(log::Logger *logger_ptr, Scheduled_task_const_handle task)
Returns whether a previously scheduled (by schedule_task_from_now() or similar) task has already fire...
Definition: sched_task.cpp:238
bool in_open_closed_range(T const &min_val, T const &val, T const &max_val)
Returns true if and only if the given value is within the given range, given as a (low,...
Definition: util.hpp:254
void ostream_op_to_string(std::string *target_str, T const &... ostream_args)
Writes to the specified string, as if the given arguments were each passed, via << in sequence,...
Definition: util.hpp:342
Fine_duration scheduled_task_fires_from_now_or_canceled(log::Logger *logger_ptr, Scheduled_task_const_handle task)
Returns how long remains until a previously scheduled (by schedule_task_from_now() or similar) task f...
Definition: sched_task.cpp:200
boost::shared_ptr< Scheduled_task_handle_state > Scheduled_task_handle
Black-box type that represents a handle to a scheduled task as scheduled by schedule_task_at() or sch...
bool in_closed_range(T const &min_val, T const &val, T const &max_val)
Returns true if and only if the given value is within the given range, inclusive.
Definition: util.hpp:246
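The three range predicates in this index (in_open_open_range(), in_open_closed_range(), in_closed_range()) differ only in which endpoints they admit; a hedged summary of the naming convention, where "open" excludes that endpoint and "closed" includes it:
  // in_open_open_range(lo, x, hi)   : lo <  x && x <  hi   -- (lo, hi)
  // in_open_closed_range(lo, x, hi) : lo <  x && x <= hi   -- (lo, hi]
  // in_closed_range(lo, x, hi)      : lo <= x && x <= hi   -- [lo, hi]
  const bool a = flow::util::in_closed_range(1, 1, 5);      // true
  const bool b = flow::util::in_open_open_range(1, 1, 5);   // false
  const bool c = flow::util::in_open_closed_range(1, 5, 5); // true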
boost::shared_ptr< void > Auto_cleanup
Helper type for setup_auto_cleanup().
Definition: util_fwd.hpp:205
bool scheduled_task_cancel(log::Logger *logger_ptr, Scheduled_task_handle task)
Attempts to prevent the execution of a previously scheduled (by schedule_task_from_now() or similar) ...
Definition: sched_task.cpp:26
boost::asio::io_service Task_engine
Short-hand for boost.asio event service, the central class of boost.asio.
Definition: util_fwd.hpp:135
Blob_with_log_context<> Blob
A concrete Blob_with_log_context that compile-time-disables Basic_blob::share() and the sharing API d...
Definition: blob_fwd.hpp:60
boost::system::error_code Error_code
Short-hand for a boost.system error code (which basically encapsulates an integer/enum error code and...
Definition: common.hpp:502
Flow_log_component
The flow::log::Component payload enumeration comprising various log components used by Flow's own int...
Definition: common.hpp:632
Fine_clock::duration Fine_duration
A high-res time duration as computed from two Fine_time_pts.
Definition: common.hpp:410
Fine_clock::time_point Fine_time_pt
A high-res time point as returned by Fine_clock::now() and suitable for precise time math in general.
Definition: common.hpp:407
unsigned char uint8_t
Byte. Best way to represent a byte of binary data. This is 8 bits on all modern systems.
Definition: common.hpp:385
Specifies the outgoing (pre-serialization) acknowledgment of a single received Data_packet,...
Equivalent of Individual_ack_rexmit_off but for sockets with retransmission enabled.
Specifies the incoming (post-deserialization) acknowledgment of a single received Data_packet.
boost::shared_ptr< const Individual_ack > Const_ptr
Short-hand for ref-counted pointer to immutable objects of this class.
uint64_t ack_delay_t
Type used to store the ACK delay for a given individual acknowledged packet.
Fine_duration Ack_delay_time_unit
Ack_delay_time_unit(1) is the duration corresponding to the ack_delay_t value 1; and proportionally f...
uint32_t rcv_wnd_t
Type used to store the size of m_rcv_wnd member in a couple of different packet types.
uint8_t rexmit_id_t
Type used to store the retransmission count in DATA and ACK packets.
The data nugget uniquely identifying a peer-to-peer connection from a remote endpoint to a port in th...
Definition: node.hpp:3904
Metadata describing the data sent in the acknowledgment of an individual received packet.
boost::shared_ptr< const Individual_ack > Const_ptr
Short-hand for ref-counted pointer to immutable objects of this class.
boost::shared_ptr< Individual_ack > Ptr
Short-hand for ref-counted pointer to mutable objects of this class.
Metadata (and data, if retransmission is on) for a packet that has been received (and,...
const size_t m_size
Number of bytes in the Data_packet::m_data field of that packet.
Received_packet(log::Logger *logger_ptr, size_t size, util::Blob *src_data)
Constructs object by storing size of data and, if so instructed, the data themselves.
util::Blob m_data
Byte sequence equal to that of Data_packet::m_data of the packet.
Data store to keep timing related info when a packet is sent out.
const order_num_t m_order_num
Order number of the packet.
size_t m_sent_cwnd_bytes
The congestion window size (in bytes) that is used when the packet is sent out.
Fine_time_pt m_sent_time
The timestamp when the packet is sent out.
Metadata (and data, if retransmission is on) for a packet that has been sent one (if retransmission i...
Sent_packet(bool rexmit_on, boost::shared_ptr< Data_packet > packet, const Sent_when &sent_when)
Constructs object with the given values and m_acks_after_me at zero.
std::vector< Sent_when > m_sent_when
Time stamps, order numbers, and other info at the times when the different attempts (including origin...
const size_t m_size
Number of bytes in the Data_packet::m_data field of the sent packet.
const boost::shared_ptr< Data_packet > m_packet
If retransmission is on, this is the DATA packet itself that was sent; otherwise null.
uint16_t ack_count_t
Type used for m_acks_after_me.
ack_count_t m_acks_after_me
The number of times any packet with m_sent_when.back().m_order_num > this->m_sent_when....
A data store that keeps stats about a Peer_socket connection.
Definition: info.hpp:456
Peer_socket_send_stats m_snd
Stats for outgoing direction of traffic. As opposed to the other m_snd_* members, this typically accu...
Definition: info.hpp:511
Node_options m_node_opts
Per-node options currently set on the socket's Node.
Definition: info.hpp:651
size_t m_low_lvl_max_buf_size
The UDP receive buffer maximum size, as reported by an appropriate call to the appropriate getsockopt...
Definition: info.hpp:526
size_t m_rcv_buf_size
The number of bytes in the internal Receive buffer.
Definition: info.hpp:549
size_t m_rcv_wnd_last_advertised
The last rcv_wnd (receive window) size sent to sender (not necessarily received; packets can be lost)...
Definition: info.hpp:555
Fine_duration m_snd_pacing_slice_period
In pacing, the duration of the current pacing time slice.
Definition: info.hpp:629
size_t m_rcv_reassembly_q_data_size
If rexmit_on is false then 0; otherwise the total DATA payload in the reassembly queue of the socket.
Definition: info.hpp:558
size_t m_snd_pacing_bytes_allowed_this_slice
This many bytes worth of DATA packets may still be sent, at this time, within the time slice defined ...
Definition: info.hpp:635
Peer_socket_options m_sock_opts
Per-socket options currently set on the socket.
Definition: info.hpp:644
size_t m_snd_buf_size
The number of bytes in the internal Send buffer.
Definition: info.hpp:590
size_t m_rcv_syn_rcvd_data_cumulative_size
Total size of DATA payload queued while waiting for SYN_ACK_ACK in SYN_RCVD state.
Definition: info.hpp:573
size_t m_rcv_syn_rcvd_data_q_size
Number of DATA packets queued while waiting for SYN_ACK_ACK in SYN_RCVD state.
Definition: info.hpp:576
std::string m_int_state_str
The internal state of the socket, rendered into string (e.g., "SYN_RECEIVED" or "ESTABLISHED").
Definition: info.hpp:533
Fine_time_pt m_snd_pacing_slice_start
In pacing, the time point marking the beginning of the current pacing time slice.
Definition: info.hpp:626
size_t m_snd_cong_ctl_in_flight_count
In congestion control, the current sent data packets that have been neither acknowledged nor consider...
Definition: info.hpp:611
size_t m_snd_cong_ctl_in_flight_bytes
In congestion control, the current sent data bytes that have been neither acknowledged nor considered...
Definition: info.hpp:608
double m_snd_est_bandwidth_mbit_per_sec
Estimate of the currently available (to this connection) outgoing bandwidth, in megabits per second.
Definition: info.hpp:641
size_t m_rcv_wnd
Receive window size = max Receive buffer space minus space taken. Infinity if flow control disabled.
Definition: info.hpp:552
size_t m_rcv_packets_with_gaps
Number of DATA packets tracked in structure tracking all valid received packets such that at least one pac...
Definition: info.hpp:570
size_t m_snd_cong_ctl_wnd_bytes
In congestion control, the current congestion window (number of outgoing data bytes allowed In-flight...
Definition: info.hpp:599
Fine_duration m_snd_smoothed_round_trip_time
Estimated current round trip time of packets, computed as a smooth value over the past individual RTT...
Definition: info.hpp:614
Error_code m_disconnect_cause
If the socket is closing or closed, this is the reason for the closure; otherwise the default-constru...
Definition: info.hpp:539
size_t m_snd_cong_ctl_wnd_count_approx
In congestion control, the approximate equivalent of m_snd_cong_ctl_in_flight_bytes as a full packet ...
Definition: info.hpp:602
size_t m_snd_rcv_wnd
The receive window (rcv_wnd a/k/a free Receive buffer space) value of the peer socket on the other si...
Definition: info.hpp:596
bool m_is_active_connect
true if this is the "client" socket (connect()ed); false otherwise (accept()ed).
Definition: info.hpp:536
size_t m_snd_pacing_packet_q_size
In pacing, number of packets currently queued to be sent out by the pacing module.
Definition: info.hpp:623
Fine_duration m_snd_round_trip_time_variance
RTTVAR used for m_snd_smoothed_round_trip_time calculation; it is the current RTT variance.
Definition: info.hpp:617
Peer_socket_receive_stats m_rcv
Stats for incoming direction of traffic. As opposed to the other m_rcv_* members, this typically accu...
Definition: info.hpp:508
Fine_duration m_snd_drop_timeout
Drop Timeout: how long a given packet must remain unacknowledged to be considered dropped due to Drop...
Definition: info.hpp:620
A set of low-level options affecting a single Peer_socket.
Definition: options.hpp:36
Fine_duration m_st_init_drop_timeout
Once socket enters ESTABLISHED state, this is the value for Peer_socket::m_snd_drop_timeout until the...
Definition: options.hpp:226
unsigned int m_st_max_rexmissions_per_packet
If retransmission is enabled and a given packet is retransmitted this many times and has to be retran...
Definition: options.hpp:220
size_t m_st_rcv_buf_max_size
Maximum number of bytes that the Receive buffer can hold.
Definition: options.hpp:141
size_t m_st_cong_ctl_max_cong_wnd_blocks
The constant that determines the CWND limit in Congestion_control_classic_data::congestion_window_at_...
Definition: options.hpp:296
Fine_duration m_st_snd_bandwidth_est_sample_period_floor
When estimating the available send bandwidth, each sample must be compiled over at least this long of...
Definition: options.hpp:267
unsigned int m_st_cong_ctl_cong_avoidance_increment_blocks
The multiple of max-block-size by which to increment CWND in congestion avoidance mode after receivin...
Definition: options.hpp:306
size_t m_st_cong_ctl_cong_wnd_on_drop_timeout_blocks
On Drop Timeout, set congestion window to this value times max-block-size.
Definition: options.hpp:299
size_t m_st_cong_ctl_init_cong_wnd_blocks
The initial size of the congestion window, given in units of max-block-size-sized blocks.
Definition: options.hpp:277
bool m_st_rexmit_on
Whether to enable reliability via retransmission.
Definition: options.hpp:214
size_t m_st_snd_buf_max_size
Maximum number of bytes that the Send buffer can hold.
Definition: options.hpp:134
Fine_duration m_st_connect_retransmit_period
How often to resend SYN or SYN_ACK while SYN_ACK or SYN_ACK_ACK, respectively, has not been received.
Definition: options.hpp:121
Fine_duration m_dyn_rcv_wnd_recovery_timer_period
When the mode triggered by rcv-buf-max-size-to-advertise-percent being exceeded is in effect,...
Definition: options.hpp:333
Fine_duration m_st_connect_retransmit_timeout
How long from the first SYN or SYN_ACK to allow for connection handshake before aborting connection.
Definition: options.hpp:124
size_t m_st_max_full_blocks_before_ack_send
If there are at least this many TIMES max-block-size bytes' worth of individual acknowledgments to be...
Definition: options.hpp:198
Fine_duration m_st_delayed_ack_timer_period
The maximum amount of time to delay sending ACK with individual packet's acknowledgment since receivi...
Definition: options.hpp:191
unsigned int m_dyn_drop_timeout_backoff_factor
Whenever the Drop Timer fires, upon the requisite Dropping of packet(s), the DTO (Drop Timeout) is se...
Definition: options.hpp:325
size_t m_st_max_block_size
The size of block that we will strive to (and will, assuming at least that many bytes are available i...
Definition: options.hpp:114
unsigned int m_st_cong_ctl_classic_wnd_decay_percent
In classic congestion control, RFC 5681 specifies the window should be halved on loss; this option al...
Definition: options.hpp:314
unsigned int m_st_rcv_buf_max_size_to_advertise_percent
% of rcv-buf-max-size that has to be freed, since the last receive window advertisement,...
Definition: options.hpp:171
unsigned int m_st_rcv_max_packets_after_unrecvd_packet_ratio_percent
The limit on the size of Peer_socket::m_rcv_packets_with_gaps, expressed as what percentage the maxim...
Definition: options.hpp:183
Fine_duration m_dyn_drop_timeout_ceiling
Ceiling to impose on the Drop Timeout.
Definition: options.hpp:317
Represents the remote endpoint of a Flow-protocol connection; identifies the UDP endpoint of the remo...
Definition: endpoint.hpp:93
util::Udp_endpoint m_udp_endpoint
UDP address (IP address/UDP port) where the Node identified by this endpoint bound its low-level UDP ...
Definition: endpoint.hpp:97
#define FLOW_UTIL_WHERE_AM_I_STR()
Same as FLOW_UTIL_WHERE_AM_I() but evaluates to an std::string.
Definition: util_fwd.hpp:971