Porting PicoTCP WIP

This commit is contained in:
2025-10-29 14:29:06 +01:00
parent 6722f42e68
commit 815c2239fe
464 changed files with 235009 additions and 24 deletions

View File

@ -12,12 +12,21 @@ CFLAGS += -I. \
-I./std \
-I./flanterm/src \
-I$(ROOT)/share \
-I./picotcp/include \
-I./picotcp/modules \
-I./port_picotcp \
-DPRINTF_INCLUDE_CONFIG_H=1 \
-DLFS_NO_ASSERT \
-DLFS_NO_DEBUG \
-DLFS_NO_WARN \
-DLFS_NO_ERROR \
-DUACPI_BAREBONES_MODE
-DPICO_MOP2 \
-DPICO_SUPPORT_TCP \
-DPICO_SUPPORT_UDP \
-DPICO_SUPPORT_ETH \
-DPICO_SUPPORT_ICMP4 \
-DPICO_SUPPORT_IPV4 \
-DPICO_SUPPORT_MUTEX \
ifeq ($(PUTCHAR_),fb)
CFLAGS += -DPUTCHAR_=PUTCHAR_FB
@ -58,10 +67,15 @@ SRCFILES += $(call GRABSRC, \
path \
rbuf \
ipc/pipe \
ipc/netsock \
dev \
randcrypto \
time \
diskpart \
netdev \
port_picotcp \
port_picotcp/modules \
picotcp/stack \
)
CFILES := $(call GET_CFILES, $(SRCFILES))

View File

@ -9,7 +9,7 @@
//Regular text
#define BLK "\e[0;30m"
#define RED "\e[0;31m"
#define RED1 "\e[0;31m"
#define GRN "\e[0;32m"
#define YEL "\e[0;33m"
#define BLU "\e[0;34m"
@ -78,7 +78,7 @@
#define BHWHT "\e[1;97m"
//Reset
#define reset "\e[0m"
#define RESET "\e[0m"
#define CRESET "\e[0m"
#define COLOR_RESET "\e[0m"

View File

@ -0,0 +1,112 @@
#include <stdint.h>
#include <stddef.h>
#include "hal/hal.h"
#include "spinlock/spinlock.h"
#include "netsock.h"
#include "errors.h"
#include "pico_socket.h"
#include "dlmalloc/malloc.h"
#include "util/util.h"
#include "sysdefs/ipcnetsock.h"
#include "kprintf.h"
IpcNetSocks IPC_NETSOCKS;
void ipc_netsockinit(void) {
hal_memset(&IPC_NETSOCKS, 0, sizeof(IPC_NETSOCKS));
spinlock_init(&IPC_NETSOCKS.spinlock);
}
/*
 * Socket event callback; ipc_netsockmake() hands it to pico_socket_open().
 *
 * Looks up the IpcNetSock whose picosock is `sock1` and pushes the raw event
 * mask `ev` onto that socket's event ring buffer. Events for sockets that are
 * not present in IPC_NETSOCKS are dropped.
 */
void ipc_netsock_event(uint16_t ev, struct pico_socket *sock1) {
    IpcNetSock *cur, *nxt;

    /* Search the registry under its lock. The check below relies on the
     * iterator being left NULL when the list ends without a match. */
    spinlock_acquire(&IPC_NETSOCKS.spinlock);
    LL_FOREACH_SAFE(IPC_NETSOCKS.netsocks, cur, nxt) {
        if (cur->picosock == sock1) {
            break;
        }
    }
    spinlock_release(&IPC_NETSOCKS.spinlock);

    if (cur != NULL) {
        spinlock_acquire(&cur->spinlock);

        /*
         * Disabled draft of in-kernel event handling, kept for reference:
         *
         * if (ev & PICO_SOCK_EV_RD) {
         *     uint8_t *buf = dlmalloc(IPC_PIPE_MAX);
         *     int32_t len = 0;
         *     int32_t read;
         *     do {
         *         read = pico_socket_read(netsock->picosock, buf + len, IPC_PIPE_MAX - len);
         *         if (read > 0) {
         *             len += read;
         *         }
         *     } while(read > 0);
         *     ipc_pipewrite(netsock->datapipe, buf, len);
         *     dlfree(buf);
         * }
         * if (ev & PICO_SOCK_EV_WR) {
         *     uint8_t *buf = dlmalloc(IPC_PIPE_MAX);
         *     int32_t read = ipc_piperead(netsock->datapipe, buf, IPC_PIPE_MAX);
         *     if (read > 0) {
         *         pico_socket_write(netsock->picosock, buf, read);
         *     }
         *     dlfree(buf);
         * }
         * if (ev & PICO_SOCK_EV_FIN) {
         *     // normal close
         * }
         * if (ev & PICO_SOCK_EV_CLOSE) {
         *     pico_socket_shutdown(netsock->picosock, PICO_SHUT_WR);
         * }
         */

        /* For now the event mask is only queued for a later consumer. */
        rbuft_push(&cur->eventbuffer, &ev);

        spinlock_release(&cur->spinlock);
    }
}
/*
 * Creates a socket wrapper, binds it to `port` on the all-zero IPv4 address
 * and registers it in IPC_NETSOCKS.
 *
 * net   - network protocol id, forwarded unchanged to pico_socket_open()
 * proto - transport protocol id, forwarded unchanged to pico_socket_open()
 * port  - local port; converted with short_be() before binding
 *
 * Returns the new IpcNetSock, or NULL when an allocation, the socket open or
 * the bind fails. On failure everything acquired so far is released and the
 * socket is not added to the registry.
 */
IpcNetSock *ipc_netsockmake(uint16_t net, uint16_t proto, uint16_t port) {
    uint8_t *eventbuffer = NULL;
    uint16_t port_be = short_be(port);
    /* NOTE(review): the bind address is always a struct pico_ip4, so this
     * only suits IPv4 sockets - confirm before passing an IPv6 `net`. */
    struct pico_ip4 inaddr_any = {0};

    IpcNetSock *netsock = dlmalloc(sizeof(*netsock));
    if (netsock == NULL) {
        return NULL;
    }

    /* Do every plain allocation before anything is initialized, so the error
     * paths below never have to undo ipc_pipeinit() or rbuft_init(). */
    netsock->datapipe = dlmalloc(sizeof(*netsock->datapipe));
    if (netsock->datapipe == NULL) {
        goto err_datapipe;
    }

    eventbuffer = dlmalloc(sizeof(IpcNetSockEventBuffer) * IPC_NETSOCK_EVENTBUFFER_MAX);
    if (eventbuffer == NULL) {
        goto err_eventbuffer;
    }

    netsock->picosock = pico_socket_open(net, proto, &ipc_netsock_event);
    if (netsock->picosock == NULL) {
        goto err_sock_open;
    }

    /* A socket that could not be bound is useless to the caller: close it
     * instead of registering it. */
    if (pico_socket_bind(netsock->picosock, &inaddr_any, &port_be) != 0) {
        goto err_sock_bind;
    }

    /* All fallible steps are done; initialize the remaining state. The
     * wrapper is not in the registry yet, so ipc_netsock_event() cannot
     * find it before it is fully set up. */
    spinlock_init(&netsock->spinlock);
    ipc_pipeinit(netsock->datapipe, (uint64_t)-1);
    rbuft_init(&netsock->eventbuffer, eventbuffer, sizeof(IpcNetSockEventBuffer), IPC_NETSOCK_EVENTBUFFER_MAX);

    spinlock_acquire(&IPC_NETSOCKS.spinlock);
    LL_APPEND(IPC_NETSOCKS.netsocks, netsock);
    spinlock_release(&IPC_NETSOCKS.spinlock);

    return netsock;

err_sock_bind:
    pico_socket_close(netsock->picosock);
err_sock_open:
    dlfree(eventbuffer);
err_eventbuffer:
    dlfree(netsock->datapipe);
err_datapipe:
    dlfree(netsock);
    return NULL;
}
/*
 * Puts the wrapped socket into listening state.
 *
 * netsock      - socket wrapper; its spinlock is held around the pico call
 * maxlisteners - backlog, converted to int for pico_socket_listen()
 *
 * Returns E_OK when pico_socket_listen() reports 0, E_NETSOCKLISTEN otherwise.
 */
int32_t ipc_netsocklisten(IpcNetSock *netsock, size_t maxlisteners) {
    int32_t status;

    spinlock_acquire(&netsock->spinlock);
    status = pico_socket_listen(netsock->picosock, (int)maxlisteners);
    spinlock_release(&netsock->spinlock);

    if (status != 0) {
        return E_NETSOCKLISTEN;
    }
    return E_OK;
}

View File

@ -0,0 +1,32 @@
#ifndef NETSOCK_NETSOCK_H_
#define NETSOCK_NETSOCK_H_
#include <stdbool.h>
#include "spinlock/spinlock.h"
#include "pico_socket.h"
#include "ipc/pipe/pipe.h"
#include "rbuf/rbuf.h"
typedef uint16_t IpcNetSockEventBuffer;
#define IPC_NETSOCK_EVENTBUFFER_MAX 512
/* Kernel-side wrapper around one picoTCP socket; created by ipc_netsockmake(). */
typedef struct IpcNetSock {
/* Next wrapper in the IPC_NETSOCKS list (presumably the link used by the LL_* list macros). */
struct IpcNetSock *next;
/* Handle returned by pico_socket_open(); ipc_netsock_event() matches on it. */
struct pico_socket *picosock;
/* Heap-allocated pipe set up with ipc_pipeinit(); currently only referenced by disabled draft code. */
IpcPipe *datapipe;
/* Ring buffer of IpcNetSockEventBuffer entries; ipc_netsock_event() pushes event masks into it. */
RBufT eventbuffer;
/* Held while pushing to eventbuffer and around pico_socket_listen(). */
SpinLock spinlock;
} IpcNetSock;
/* Global registry of socket wrappers (see IPC_NETSOCKS). */
typedef struct {
/* List head; ipc_netsockmake() appends to it and ipc_netsock_event() searches it. */
IpcNetSock *netsocks;
/* Held around those list appends and searches. */
SpinLock spinlock;
} IpcNetSocks;
extern IpcNetSocks IPC_NETSOCKS;
void ipc_netsockinit(void);
IpcNetSock *ipc_netsockmake(uint16_t net, uint16_t proto, uint16_t port);
int32_t ipc_netsocklisten(IpcNetSock *netsock, size_t maxlisteners);
#endif // NETSOCK_NETSOCK_H_

View File

@ -15,6 +15,9 @@
#include "randcrypto/randcrypto.h"
#include "time/time.h"
#include "diskpart/diskpart.h"
#include "netdev/netdev.h"
#include "ipc/netsock/netsock.h"
#include "pico_stack.h"
void log_bootinfo(void) {
char buf[100];
@ -50,6 +53,9 @@ void kmain(void) {
diskpart_init();
baseimg_init();
vfs_init();
pico_stack_init();
netdev_init();
ipc_netsockinit();
proc_init();
for(;;);

View File

@ -27,7 +27,7 @@
#ifdef KPRINTF_COLORS
# include "ansi_colors.h"
# define ERR(component, fmt, ...) kprintf(CRESET "[" RED component CRESET "]: " fmt, ##__VA_ARGS__)
# define ERR(component, fmt, ...) kprintf(CRESET "[" RED1 component CRESET "]: " fmt, ##__VA_ARGS__)
#else
# define ERR(component, fmt, ...) kprintf("["component"]: "fmt, ##__VA_ARGS__)
#endif

56
kernel/netdev/netdev.c Normal file
View File

@ -0,0 +1,56 @@
#include <stdint.h>
#include <stddef.h>
#include "netdev.h"
#include "spinlock/spinlock.h"
#include "kprintf.h"
#include "dlmalloc/malloc.h"
#include "errors.h"
#include "util/util.h"
#include "hal/hal.h"
#include "pico_device.h"
#include "pico_dev_loop.h"
#include "pico_ipv4.h"
NetDevList NETDEV_LIST;
/*
 * Initializes the global device list (lock and empty head) and then creates
 * the loopback device with address 127.0.0.1 and netmask 255.255.255.0.
 */
void netdev_init(void) {
    spinlock_init(&NETDEV_LIST.spinlock);
    NETDEV_LIST.head = NULL;
    LOG("netdev", "init\n");

    /* netdev_create() returns NULL on failure; report that instead of
     * silently continuing without a loopback device. */
    if (netdev_create(NETDEV_LOOPBACK, "127.0.0.1", "255.255.255.0") == NULL) {
        LOG("netdev", "failed to create loopback device\n");
    }
}
/*
 * Creates a network device of type `ndtype`, assigns it an IPv4 link and
 * appends it to NETDEV_LIST.
 *
 * ndtype        - one of the NETDEV_* device types (only NETDEV_LOOPBACK is handled)
 * ipaddrstring  - IPv4 address string, parsed with pico_string_to_ipv4()
 * netmaskstring - IPv4 netmask string, parsed with pico_string_to_ipv4()
 *
 * Returns the new NetDev, or NULL when an argument is NULL, the type is
 * unknown, an allocation or the device creation fails, a string does not
 * parse, or the link cannot be added. Nothing is left allocated or listed on
 * failure.
 *
 * NETDEV_LIST.spinlock is held for the whole setup, as in the original code.
 */
NetDev *netdev_create(int32_t ndtype, const char *ipaddrstring, const char *netmaskstring) {
    if (ipaddrstring == NULL || netmaskstring == NULL) {
        return NULL;
    }

    NetDev *nd = dlmalloc(sizeof(*nd));
    if (nd == NULL) {
        return NULL;
    }

    spinlock_acquire(&NETDEV_LIST.spinlock);

    nd->_magic = NETDEV_MAGIC;
    nd->picodev = NULL;
    spinlock_init(&nd->spinlock);

    /* Parse both strings before creating the device, so bad input never
     * leaves a half-configured device behind. */
    if (pico_string_to_ipv4(ipaddrstring, &nd->ipaddr4.addr) != 0) {
        goto err_unlock;
    }
    if (pico_string_to_ipv4(netmaskstring, &nd->netmask4.addr) != 0) {
        goto err_unlock;
    }

    switch (ndtype) {
    case NETDEV_LOOPBACK:
        nd->picodev = pico_loop_create();
        break;
    default:
        goto err_unlock;
    }
    if (nd->picodev == NULL) {
        goto err_unlock;
    }

    if (pico_ipv4_link_add(nd->picodev, nd->ipaddr4, nd->netmask4) != 0) {
        goto err_device;
    }

    LL_APPEND(NETDEV_LIST.head, nd);
    spinlock_release(&NETDEV_LIST.spinlock);
    return nd;

err_device:
    pico_device_destroy(nd->picodev);
err_unlock:
    spinlock_release(&NETDEV_LIST.spinlock);
    dlfree(nd);
    return NULL;
}
// TODO: delete

36
kernel/netdev/netdev.h Normal file
View File

@ -0,0 +1,36 @@
#ifndef NETDEV_NETDEV_H_
#define NETDEV_NETDEV_H_
#include <stdint.h>
#include <stddef.h>
#include "spinlock/spinlock.h"
#include "compiler/attr.h"
#include "pico_device.h"
#include "pico_ipv4.h"
/* Device types accepted by netdev_create(). */
enum {
/* Loopback device, created through pico_loop_create(). */
NETDEV_LOOPBACK,
};
#define NETDEV_MAGIC 0xB00B
/* One network device known to the kernel; created by netdev_create(). */
typedef struct NetDev {
/* Underlying picoTCP device (pico_loop_create() for NETDEV_LOOPBACK). */
struct pico_device *picodev;
/* Set to NETDEV_MAGIC by netdev_create(). */
uint32_t _magic;
/* Next device in NETDEV_LIST (presumably the link used by the LL_* list macros). */
struct NetDev *next;
/* Per-device lock; initialized in netdev_create(). */
SpinLock spinlock;
/* IPv4 address and netmask parsed from the strings given to netdev_create(). */
struct pico_ip4 ipaddr4, netmask4;
} NetDev;
/* Global list of network devices (see NETDEV_LIST). */
typedef struct {
/* List head; netdev_init() resets it to NULL and netdev_create() appends to it. */
NetDev *head;
/* Held by netdev_create() while a device is set up and appended. */
SpinLock spinlock;
} NetDevList;
extern NetDevList NETDEV_LIST;
void netdev_init(void);
NetDev *netdev_create(int32_t ndtype, const char *ipaddrstring, const char *netmaskstring);
#endif // NETDEV_NETDEV_H_

28
kernel/picotcp/.gitignore vendored Normal file
View File

@ -0,0 +1,28 @@
*.d
*.o
*.a
*.out
*.swp
tags
build
UNIT_*
core
core.*
.DS_Store
cscope.files
cscope.out
*.so
*.aux
*.pdf
*.toc
*.gz
*.log
*.pyc
*.elf
*.gcov
*.gcda
*.gcno
*.expand
*.pcap
.ycm_extra_conf.py
.clang_complete

View File

@ -0,0 +1,13 @@
before_install:
- sudo apt-get update -qq
- sudo apt-get install -y vde2 check libvdeplug2-dev libpcap-dev psmisc
- sudo pip install cpp-coveralls
- make clean
- rm -f *.gc*
install: make GCOV=1 && make units ARCH=faulty GCOV=1 && make test GCOV=1
language: c
script:
- ./test/coverage.sh
after_success:
- coveralls --exclude test/ --exclude modules/ptsocket --exclude build --exclude modules/pico_dev_mock.c --exclude modules/pico_dev_null.c --exclude modules/pico_dev_pcap.c --exclude modules/pico_dev_tap_windows.c --exclude modules/pico_dev_tun.c --gcov-options='\-lp'

8
kernel/picotcp/CONTRIBUTING.md Executable file
View File

@ -0,0 +1,8 @@
External contributions to picoTCP are very welcome. We do, however, ask that you sign the Contributor License Agreement.
We don't ask you to sign away your copyright. The CLA simply grants us an additional license on the code you wrote. This allows us to also use picoTCP in commercial projects, which enables us to keep investing time and money in creating a better TCP/IP stack.
Please read the [Agreement](https://docs.google.com/forms/d/1-z6lsT75l6ZIrgHGEWrWdHylJ6xxpjc7FwGfL2ilDFU/viewform), and if you agree with it, fill in your information.
You will receive a mail with a timestamp. Please modify our [CLA confirmation page](https://github.com/tass-belgium/picotcp/wiki/picoTCP-CLA-Confirmation-Page), adding the timestamp and your github username. This way we can be sure that nobody else filled in your info in the form.
Pull requests by people who haven't signed the CLA will, unfortunately, have to be rejected.

8
kernel/picotcp/COPYING Normal file
View File

@ -0,0 +1,8 @@
PicoTCP. Copyright (c) 2012-2017 Altran Intelligent Systems.
Released under the GNU General Public License, version 2, or (at your option)
version 3.
See LICENSE.GPLv2 and LICENSE.GPLv3 for details.
Different licensing models may exist, at the sole discretion of
the Copyright holders.

View File

@ -0,0 +1,339 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.

View File

@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

27
kernel/picotcp/MODTREE Normal file
View File

@ -0,0 +1,27 @@
RTOS:
IPV4: ETH
IPV6:
DEVLOOP:
CRC:
ETH:
TCP: IPV4
UDP: IPV4
IPV4FRAG: IPV4
NAT: IPV4 UDP
ICMP4: IPV4
MCAST: UDP
PING: ICMP4
DHCP_CLIENT: UDP
DHCP_SERVER: UDP
DNS_CLIENT: UDP
IPFILTER: IPV4
OLSR: MCAST
SLAACV4: IPV4
SNTP_CLIENT: DNS_CLIENT
TFTP: UDP
MDNS: MCAST
DNS_SD: MDNS
AODV: IPV4 UDP
PPP: IPV4
6LOWPAN: IPV6
IEEE802154: 6LOWPAN

508
kernel/picotcp/Makefile Normal file
View File

@ -0,0 +1,508 @@
-include ../../config.mk
-include ../../tools/kconfig/.config
# Host OS name, used further down to pick the right size-reporting tools.
OS:=$(shell uname)

# Toolchain. Set CROSS_COMPILE (e.g. arm-none-eabi-) to use a prefixed toolchain.
CC:=$(CROSS_COMPILE)gcc
LD:=$(CROSS_COMPILE)ld
AR:=$(CROSS_COMPILE)ar
RANLIB:=$(CROSS_COMPILE)ranlib
SIZE:=$(CROSS_COMPILE)size
STRIP_BIN:=$(CROSS_COMPILE)strip

# Link/compile flags for the smoke tests and unit tests. These are recursively
# expanded (=), so $(PREFIX) and $(CFLAGS) are resolved when they are used.
TEST_LDFLAGS=-pthread $(PREFIX)/modules/*.o $(PREFIX)/lib/*.o -lvdeplug
UNIT_LDFLAGS=-lcheck -lm -pthread -lrt -lsubunit
UNIT_CFLAGS= $(CFLAGS) -Wno-missing-braces

# Archive name. No quotes: make would keep them as part of the value.
LIBNAME:=libpicotcp.a

# Output directory. $(CURDIR) is maintained by make itself, so the default is
# also correct under `make -C <dir>`, where the inherited shell PWD is not.
PREFIX?=$(CURDIR)/build

# Build knobs; every one can be overridden on the command line.
DEBUG?=1
PROFILE?=0
PERF?=0
ENDIAN?=little
STRIP?=0
RTOS?=0
GENERIC?=0
PTHREAD?=0
ADDRESS_SANITIZER?=1
GCOV?=0
# Default compiled-in protocols and modules.
#
# Each switch is assigned with ?=, so it is only a default: a value given on
# the command line or in the environment wins. Set a switch to 0 to leave the
# corresponding module out of the build.
TCP?=1
UDP?=1
ETH?=1
IPV4?=1
IPV4FRAG?=1
IPV6FRAG?=0
NAT?=1
ICMP4?=1
MCAST?=1
DEVLOOP?=1
PING?=1
DHCP_CLIENT?=1
DHCP_SERVER?=1
DNS_CLIENT?=1
MDNS?=1
DNS_SD?=1
SNTP_CLIENT?=1
IPFILTER?=1
CRC?=1
OLSR?=0
SLAACV4?=1
TFTP?=1
AODV?=1
MEMORY_MANAGER?=0
MEMORY_MANAGER_PROFILING?=0
TUN?=0
TAP?=0
PCAP?=0
PPP?=1
6LOWPAN?=0
IEEE802154?=0
IPC?=0
CYASSL?=0
WOLFSSL?=0
POLARSSL?=0
# IPv6 support
IPV6?=1
# TEST=1 (smoke tests) forces 6LoWPAN and IEEE 802.15.4 on.
TEST?=0
ifeq ($(TEST),1)
6LOWPAN=1
IEEE802154=1
endif
# UNITS=1 (unit tests) forces the same modules on and selects ARCH=faulty.
UNITS?=0
ifeq ($(UNITS),1)
6LOWPAN=1
IEEE802154=1
ARCH=faulty
endif
# UNITS_MM=1 (memory-manager unit tests) additionally forces MEMORY_MANAGER=1.
UNITS_MM?=0
ifeq ($(UNITS_MM),1)
6LOWPAN=1
IEEE802154=1
MEMORY_MANAGER=1
endif
# PICO_COMPILE_TIME is defined as the output of `date +%s`. make leaves the
# backticks alone, so the shell evaluates them when a compile command runs.
EXTRA_CFLAGS+=-DPICO_COMPILE_TIME=`date +%s`
# Caller-supplied platform flags are appended unchanged.
EXTRA_CFLAGS+=$(PLATFORM_CFLAGS)
# Base flags: generated headers in $(PREFIX)/include first, then the source tree.
CFLAGS=-I$(PREFIX)/include -Iinclude -Imodules $(EXTRA_CFLAGS)
# options for adding warnings
CFLAGS+= -Wall -W -Wextra -Wshadow -Wcast-qual -Wwrite-strings -Wundef -Wdeclaration-after-statement
CFLAGS+= -Wconversion -Wcast-align -Wmissing-prototypes
# options for suppressing warnings
CFLAGS+= -Wno-missing-field-initializers
# Extra warnings that are only added when CC is exactly "clang".
ifeq ($(CC),clang)
CFLAGS+= -Wunreachable-code-break -Wpointer-bool-conversion -Wmissing-variable-declarations
endif
# Commands used by the lib target to report the archive size.
# On Darwin, stat -f%z replaces du -b, and -t is not passed to size when
# SIZE is the plain "size" tool.
ifeq ($(OS),Darwin)
LIBSIZE=stat -f%z
ifeq ($(SIZE),size)
SUMSIZE=$(SIZE)
else
SUMSIZE=$(SIZE) -t
endif
else
LIBSIZE=du -b
SUMSIZE=$(SIZE) -t
endif
# DEBUG=1 adds debug info and no optimisation flag; otherwise PERF=1 selects
# -O3 and anything else selects -Os.
ifeq ($(DEBUG),1)
CFLAGS+=-ggdb
else
ifeq ($(PERF), 1)
CFLAGS+=-O3
else
CFLAGS+=-Os
endif
endif
# PROFILE=1 adds -pg.
ifeq ($(PROFILE),1)
CFLAGS+=-pg
endif
ifeq ($(TFTP),1)
MOD_OBJ+=$(LIBBASE)modules/pico_strings.o $(LIBBASE)modules/pico_tftp.o
OPTIONS+=-DPICO_SUPPORT_TFTP
endif
ifeq ($(AODV),1)
MOD_OBJ+=$(LIBBASE)modules/pico_aodv.o
OPTIONS+=-DPICO_SUPPORT_AODV
endif
ifeq ($(GENERIC),1)
CFLAGS+=-DGENERIC
endif
ifeq ($(PTHREAD),1)
CFLAGS+=-DPICO_SUPPORT_PTHREAD
endif
ifneq ($(ENDIAN),little)
CFLAGS+=-DPICO_BIGENDIAN
endif
ifneq ($(RTOS),0)
OPTIONS+=-DPICO_SUPPORT_RTOS
endif
ifeq ($(ARCH),cortexm4-hardfloat)
CFLAGS+=-DCORTEX_M4_HARDFLOAT -mcpu=cortex-m4 -mthumb -mlittle-endian -mfpu=fpv4-sp-d16 -mfloat-abi=hard -mthumb-interwork -fsingle-precision-constant
endif
ifeq ($(ARCH),cortexm4-softfloat)
CFLAGS+=-DCORTEX_M4_SOFTFLOAT -mcpu=cortex-m4 -mthumb -mlittle-endian -mfloat-abi=soft -mthumb-interwork
endif
ifeq ($(ARCH),cortexm3)
CFLAGS+=-DCORTEX_M3 -mcpu=cortex-m3 -mthumb -mlittle-endian -mthumb-interwork
endif
ifeq ($(ARCH),cortexm0plus)
CFLAGS+=-DCORTEX_M0PLUS -mcpu=cortex-m0plus -mthumb -mlittle-endian -mthumb-interwork
endif
ifeq ($(ARCH),arm9)
CFLAGS+=-DARM9 -mcpu=arm9e -march=armv5te -gdwarf-2 -Wall -marm -mthumb-interwork -fpack-struct
endif
ifeq ($(ADDRESS_SANITIZER),1)
TEST_LDFLAGS+=-fsanitize=address -fno-omit-frame-pointer
endif
ifeq ($(GCOV),1)
TEST_LDFLAGS+=-lgcov --coverage
CFLAGS+=-fprofile-arcs -ftest-coverage
endif
ifeq ($(ARCH),faulty)
CFLAGS+=-DFAULTY -DUNIT_TEST
ifeq ($(ADDRESS_SANITIZER),1)
CFLAGS+=-fsanitize=address
endif
CFLAGS+=-fno-omit-frame-pointer
UNITS_OBJ+=test/pico_faulty.o
TEST_OBJ+=test/pico_faulty.o
DUMMY_EXTRA+=test/pico_faulty.o
endif
ifeq ($(ARCH),msp430)
CFLAGS+=-DMSP430
endif
ifeq ($(ARCH),esp8266)
CFLAGS+=-DESP8266 -Wl,-EL -fno-inline-functions -nostdlib -mlongcalls -mtext-section-literals
endif
ifeq ($(ARCH),mt7681)
CFLAGS+=-DMT7681 -fno-builtin -ffunction-sections -fno-strict-aliasing -m16bit -mabi=2 -mbaseline=V2 -mcpu=n9 -mno-div -mel -mmw-count=8 -mno-ext-mac -mno-dx-regs
endif
ifeq ($(ARCH),pic24)
CFLAGS+=-DPIC24 -c -mcpu=24FJ256GA106 -MMD -MF -g -omf=elf \
-mlarge-code -mlarge-data -msmart-io=1 -msfr-warn=off
endif
ifeq ($(ARCH),pic32)
CFLAGS+=-DPIC32
endif
ifeq ($(ARCH),atmega128)
CFLAGS+=-Wall -mmcu=atmega128 -DAVR
endif
ifeq ($(ARCH),none)
CFLAGS+=-DARCHNONE
endif
ifeq ($(ARCH),shared)
CFLAGS+=-fPIC
endif
%.o:%.c deps
$(CC) -c $(CFLAGS) -o $@ $<
CORE_OBJ= stack/pico_stack.o \
stack/pico_frame.o \
stack/pico_device.o \
stack/pico_protocol.o \
stack/pico_socket.o \
stack/pico_socket_multicast.o \
stack/pico_tree.o \
stack/pico_md5.o
POSIX_OBJ+= modules/pico_dev_vde.o \
modules/pico_dev_tun.o \
modules/pico_dev_ipc.o \
modules/pico_dev_tap.o \
modules/pico_dev_mock.o
include rules/debug.mk
ifneq ($(ETH),0)
include rules/eth.mk
endif
ifneq ($(IPV4),0)
include rules/ipv4.mk
endif
ifneq ($(IPV4FRAG),0)
include rules/ipv4frag.mk
endif
ifneq ($(ICMP4),0)
include rules/icmp4.mk
endif
ifneq ($(TCP),0)
include rules/tcp.mk
endif
ifneq ($(UDP),0)
include rules/udp.mk
endif
ifneq ($(MCAST),0)
include rules/mcast.mk
include rules/igmp.mk
include rules/mld.mk
endif
ifneq ($(NAT),0)
include rules/nat.mk
endif
ifneq ($(DEVLOOP),0)
include rules/devloop.mk
endif
ifneq ($(DHCP_CLIENT),0)
include rules/dhcp_client.mk
endif
ifneq ($(DHCP_SERVER),0)
include rules/dhcp_server.mk
endif
ifneq ($(DNS_CLIENT),0)
include rules/dns_client.mk
endif
ifneq ($(MDNS),0)
include rules/mdns.mk
endif
ifneq ($(DNS_SD),0)
include rules/dns_sd.mk
endif
ifneq ($(IPFILTER),0)
include rules/ipfilter.mk
endif
ifneq ($(CRC),0)
include rules/crc.mk
endif
ifneq ($(OLSR),0)
include rules/olsr.mk
endif
ifneq ($(SLAACV4),0)
include rules/slaacv4.mk
endif
ifneq ($(IPV6),0)
include rules/ipv6.mk
endif
ifneq ($(MEMORY_MANAGER),0)
include rules/memory_manager.mk
endif
ifneq ($(MEMORY_MANAGER_PROFILING),0)
OPTIONS+=-DPICO_SUPPORT_MM_PROFILING
endif
ifneq ($(SNTP_CLIENT),0)
include rules/sntp_client.mk
endif
ifneq ($(TUN),0)
include rules/tun.mk
endif
ifneq ($(TAP),0)
include rules/tap.mk
endif
ifneq ($(PCAP),0)
include rules/pcap.mk
endif
ifneq ($(PPP),0)
include rules/ppp.mk
endif
ifneq ($(6LOWPAN), 0)
include rules/6lowpan.mk
endif
ifneq ($(IPC),0)
include rules/ipc.mk
endif
ifneq ($(CYASSL),0)
include rules/cyassl.mk
endif
ifneq ($(WOLFSSL),0)
include rules/wolfssl.mk
endif
ifneq ($(POLARSSL),0)
include rules/polarssl.mk
endif
all: mod core lib
core: $(CORE_OBJ)
@mkdir -p $(PREFIX)/lib
@mv stack/*.o $(PREFIX)/lib
mod: $(MOD_OBJ)
@mkdir -p $(PREFIX)/modules
@mv modules/*.o $(PREFIX)/modules || echo
posix: all $(POSIX_OBJ)
@mv modules/*.o $(PREFIX)/modules || echo
TEST_ELF= test/picoapp.elf
TEST6_ELF= test/picoapp6.elf
# Build the smoke-test application. Requires TEST=1.
#
# - Declared phony because a directory named test/ exists in the tree.
# - Uses $(MAKE) for the sub-make so flags and the jobserver are inherited.
# - Uses printf for messages: `echo -e` and backslash escapes in echo are not
#   portable across the shells that may be installed as /bin/sh.
.PHONY: test tst
test: posix
	@if [ $(TEST) -eq 0 ]; then \
		printf '\n\nsmoke tests should be compiled with TEST=1 from now on!\n'; \
		exit 1; \
	fi
	@mkdir -p $(PREFIX)/test/
	@$(MAKE) -C test/examples PREFIX=$(PREFIX)
	@printf '\t[CC] picoapp.o\n'
	@$(CC) -c -o $(PREFIX)/examples/picoapp.o test/picoapp.c $(CFLAGS) -Itest/examples
	@printf '\t[LD] %s\n' '$@'
	@$(CC) -g -o $(TEST_ELF) -I include -I modules -I $(PREFIX)/include -Wl,--start-group $(TEST_LDFLAGS) $(TEST_OBJ) $(PREFIX)/examples/*.o -Wl,--end-group
	@mv test/*.elf $(PREFIX)/test
	@install $(PREFIX)/$(TEST_ELF) $(PREFIX)/$(TEST6_ELF)

# Short alias for the test target.
tst: test
$(PREFIX)/include/pico_defines.h:
@mkdir -p $(PREFIX)/lib
@mkdir -p $(PREFIX)/include
@bash ./mkdeps.sh $(PREFIX) $(OPTIONS)
deps: $(PREFIX)/include/pico_defines.h
# Install the public headers into $(PREFIX)/include and archive all objects
# into $(PREFIX)/lib/$(LIBNAME).
#
# The archive step falls back to the core objects only when the first ar
# invocation fails. Stripping is an explicit if/else so that a failing strip
# fails the build instead of being reported as "[KEEP SYMBOLS]".
lib: mod core
	@cp -f include/*.h $(PREFIX)/include
	@cp -fa include/arch $(PREFIX)/include
	@cp -f modules/*.h $(PREFIX)/include
	@printf '\t[AR] %s\n' "$(PREFIX)/lib/$(LIBNAME)"
	@$(AR) cru $(PREFIX)/lib/$(LIBNAME) $(PREFIX)/modules/*.o $(PREFIX)/lib/*.o \
		|| $(AR) cru $(PREFIX)/lib/$(LIBNAME) $(PREFIX)/lib/*.o
	@printf '\t[RANLIB] %s\n' "$(PREFIX)/lib/$(LIBNAME)"
	@$(RANLIB) $(PREFIX)/lib/$(LIBNAME)
	@if [ $(STRIP) -eq 1 ]; then \
		printf '\t[STRIP] %s\n' "$(PREFIX)/lib/$(LIBNAME)"; \
		$(STRIP_BIN) $(PREFIX)/lib/$(LIBNAME); \
	else \
		printf '\t[KEEP SYMBOLS] %s\n' "$(PREFIX)/lib/$(LIBNAME)"; \
	fi
	@printf '\t[LIBSIZE] %s\n' "`$(LIBSIZE) $(PREFIX)/lib/$(LIBNAME)`"
	@printf '%s\n' "`$(SUMSIZE) $(PREFIX)/lib/$(LIBNAME)`"
loop: mod core
mkdir -p $(PREFIX)/test
@$(CC) -c -o $(PREFIX)/modules/pico_dev_loop.o modules/pico_dev_loop.c $(CFLAGS)
@$(CC) -c -o $(PREFIX)/loop_ping.o test/loop_ping.c $(CFLAGS) -ggdb
# Build the unit-test runner and every per-module unit-test binary into
# $(PREFIX)/test. Requires UNITS=1.
#
# The header search path is `include`; the previous `-I includes` named a
# directory this Makefile never uses anywhere else. Messages use printf
# because `echo -e` is not portable across /bin/sh implementations.
units: mod core lib $(UNITS_OBJ) $(MOD_OBJ)
	@if [ $(UNITS) -eq 0 ]; then \
		printf '\n\nunit tests should be compiled with UNITS=1 from now on!\n'; \
		exit 1; \
	fi
	@printf '\n\t[UNIT TESTS SUITE]\n'
	@mkdir -p $(PREFIX)/test
	@printf '\t[CC] units.o\n'
	@$(CC) -g -c -o $(PREFIX)/test/units.o test/units.c $(UNIT_CFLAGS) -I stack -I modules -I include -I test/unit -DUNIT_TEST
	@printf '\t[LD] %s\n' "$(PREFIX)/test/units"
	@$(CC) -o $(PREFIX)/test/units $(UNIT_CFLAGS) $(PREFIX)/test/units.o $(UNIT_LDFLAGS) \
		$(UNITS_OBJ) $(PREFIX)/modules/pico_aodv.o \
		$(PREFIX)/modules/pico_fragments.o
	@$(CC) -o $(PREFIX)/test/modunit_pico_protocol.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_protocol.c stack/pico_tree.c $(UNIT_LDFLAGS) $(UNITS_OBJ)
	@$(CC) -o $(PREFIX)/test/modunit_pico_frame.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_frame.c stack/pico_tree.c $(UNIT_LDFLAGS) $(UNITS_OBJ)
	@$(CC) -o $(PREFIX)/test/modunit_seq.elf $(UNIT_CFLAGS) -I. test/unit/modunit_seq.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_tcp.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_tcp.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_dns_client.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_dns_client.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_dns_common.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_dns_common.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_mdns.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_mdns.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_dns_sd.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_dns_sd.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_dev_loop.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_dev_loop.c $(UNIT_LDFLAGS) $(UNITS_OBJ)
	@$(CC) -o $(PREFIX)/test/modunit_ipv6_nd.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_ipv6_nd.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_ethernet.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_ethernet.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_pico_stack.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_stack.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_tftp.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_tftp.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_sntp_client.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_sntp_client.c $(UNIT_LDFLAGS) $(UNITS_OBJ)
	@$(CC) -o $(PREFIX)/test/modunit_ipfilter.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_ipfilter.c stack/pico_tree.c $(UNIT_LDFLAGS) $(UNITS_OBJ)
	@$(CC) -o $(PREFIX)/test/modunit_aodv.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_aodv.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_fragments.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_fragments.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_queue.elf $(UNIT_CFLAGS) -I. test/unit/modunit_queue.c $(UNIT_LDFLAGS) $(UNITS_OBJ)
	@$(CC) -o $(PREFIX)/test/modunit_dev_ppp.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_dev_ppp.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_mld.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_mld.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_igmp.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_igmp.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_hotplug_detection.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_hotplug_detection.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_802154.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_802154.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_6lowpan.elf $(UNIT_CFLAGS) -I. -I test/examples test/unit/modunit_pico_6lowpan.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
	@$(CC) -o $(PREFIX)/test/modunit_strings.elf $(UNIT_CFLAGS) -I. test/unit/modunit_pico_strings.c $(UNIT_LDFLAGS) $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
# Build the device-driver unit tests into $(PREFIX)/test/devunits.
# The header search path is `include` (was the misspelled `includes`), and
# messages use printf because `echo -e` is not portable across /bin/sh shells.
devunits: mod core lib
	@printf '\n\t[UNIT TESTS SUITE: device drivers]\n'
	@mkdir -p $(PREFIX)/test/unit/device/
	@printf '\t[CC] picotcp_mock.o\n'
	@$(CC) -c -o $(PREFIX)/test/unit/device/picotcp_mock.o $(CFLAGS) -I stack -I modules -I include -I test/unit test/unit/device/picotcp_mock.c
	@$(CC) -c -o $(PREFIX)/test/unit/device/unit_dev_vde.o $(CFLAGS) -I stack -I modules -I include -I test/unit test/unit/device/unit_dev_vde.c
	@printf '\t[LD] %s\n' "$(PREFIX)/test/devunits"
	@$(CC) -o $(PREFIX)/test/devunits $(CFLAGS) -I stack $(PREFIX)/test/unit/device/*.o -lcheck -lm -pthread -lrt
# Build the memory-manager unit tests into $(PREFIX)/test/units_mm.
# Requires UNITS_MM=1.
#
# The header search path is `include` (was the misspelled `includes`), the
# [LD] message now names the file that is actually linked, and messages use
# printf because `echo -e` is not portable across /bin/sh shells.
units_mm: mod core lib
	@if [ $(UNITS_MM) -eq 0 ]; then \
		printf '\n\nMM unit tests should be compiled with UNITS_MM=1 from now on!\n'; \
		exit 1; \
	fi
	@printf '\n\t[UNIT TESTS SUITE]\n'
	@mkdir -p $(PREFIX)/test
	@printf '\t[CC] units_mm.o\n'
	@$(CC) -c -o $(PREFIX)/test/units_mm.o test/unit/unit_mem_manager.c $(CFLAGS) -I stack -I modules -I include -I test/unit
	@printf '\t[LD] %s\n' "$(PREFIX)/test/units_mm"
	@$(CC) -o $(PREFIX)/test/units_mm $(CFLAGS) $(PREFIX)/test/units_mm.o -lcheck -lm -pthread -lrt
# Remove the whole output directory and the tags file.
# Phony so that a stray file named "clean" cannot disable the target.
.PHONY: clean
clean:
	@printf '\t[CLEAN] %s\n' "$(PREFIX)/"
	@rm -rf $(PREFIX) tags
# Create PicoTCP.zip for mbed: the sources are zipped with `#define MBED`
# temporarily prepended to include/pico_socket.h.
#
# The header is patched, zipped and restored within one shell invocation so
# that the original pico_socket.h is put back even when cat or zip fails; the
# recipe still exits with the failing status. printf writes the define plus a
# blank line regardless of how the shell's echo treats backslash escapes.
mbed:
	@printf '\t[Creating PicoTCP.zip]\n'
	@rm -f PicoTCP.zip
	@cp include/pico_socket.h include/socket.tmp
	@printf '#define MBED\n\n' > include/mbed.tmp
	@cat include/mbed.tmp include/socket.tmp > include/pico_socket.h \
		&& zip -0 PicoTCP.zip -r include modules stack -x include/arch/ include/arch/* include/pico_config.h include/*.tmp modules/pico_dev_*; \
		status=$$?; \
		rm -f include/mbed.tmp; \
		mv -f include/socket.tmp include/pico_socket.h; \
		exit $$status
# Reformat all C sources and headers with uncrustify, then delete its backups.
#
# The glob is *.[ch]: inside a bracket expression `|` is a literal character,
# so the former *.[c|h] also matched names ending in ".|". find -exec ... {} +
# batches the file names like xargs did, but copes with spaces in names and
# does not run the command at all when nothing matches.
.PHONY: style
style:
	@find . -iname "*.[ch]" -exec uncrustify --replace -l C -c uncrustify.cfg {} + || true
	@find . -iname "*unc-backup*" -exec rm -f {} + || true
dummy: mod core lib $(DUMMY_EXTRA)
@echo testing configuration...
@$(CC) -c -o test/dummy.o test/dummy.c $(CFLAGS)
@$(CC) -o dummy test/dummy.o $(DUMMY_EXTRA) $(PREFIX)/lib/libpicotcp.a $(LDFLAGS) $(CFLAGS)
@echo done.
@rm -f test/dummy.o dummy
# Build the PPP test program against the freshly built library.
# Uses $(CC) so the program is compiled by the same (possibly cross) toolchain
# that produced libpicotcp.a, instead of a hard-coded host gcc.
ppptest: test/ppp.c lib
	$(CC) -ggdb -c -o ppp.o test/ppp.c -I $(PREFIX)/include/ -I $(PREFIX)/modules/ $(CFLAGS)
	$(CC) -o ppp ppp.o $(PREFIX)/lib/libpicotcp.a $(LDFLAGS) $(CFLAGS)
	rm -f ppp.o
# Produce a Coverity Scan upload: clean, rebuild under cov-build, then pack
# the captured data as $(PREFIX)/coverity.tgz.
# Sub-makes go through $(MAKE) so they use the same make program and inherit
# the command-line flags and variables of this invocation.
.PHONY: coverity
coverity:
	@$(MAKE) clean
	@cov-build --dir $(PREFIX)/cov-int $(MAKE)
	@tar czvf $(PREFIX)/coverity.tgz -C $(PREFIX) cov-int
FORCE:

View File

@ -0,0 +1,403 @@
-include ../../config.mk
-include ../../tools/kconfig/.config
# Open Watcom installation root. ?= keeps /opt/watcom as the default but lets
# the environment or the command line point at another installation.
WATCOM_PATH?=/opt/watcom

# Toolchain binaries taken from the Watcom installation.
CC:=$(WATCOM_PATH)/binl/$(CROSS_COMPILE)wcc386
LD:=$(WATCOM_PATH)/binl/$(CROSS_COMPILE)wcl386
AR:=$(WATCOM_PATH)/binl/$(CROSS_COMPILE)wlib
RANLIB:=$(WATCOM_PATH)/binl/$(CROSS_COMPILE)ranlib
SIZE:=$(CROSS_COMPILE)size
STRIP_BIN:=$(CROSS_COMPILE)strip

# Recursively expanded (=), so $(PREFIX) is resolved when the flags are used.
TEST_LDFLAGS=-pthread $(PREFIX)/modules/*.o $(PREFIX)/lib/*.o -lvdeplug
LIBNAME:=libpicotcp.a

# Output directory. $(CURDIR) is maintained by make itself, so the default is
# also correct under `make -C <dir>`, where the inherited shell PWD is not.
PREFIX?=$(CURDIR)/build

# Build knobs; every one can be overridden on the command line.
DEBUG?=1
PROFILE?=0
PERF?=0
ENDIAN?=little
STRIP?=0
RTOS?=0
GENERIC?=0
PTHREAD?=0
ADDRESS_SANITIZER?=1
# Default compiled-in protocols
#
TCP?=1
UDP?=1
ETH?=1
IPV4?=1
IPV4FRAG?=1
IPV6FRAG?=0
NAT?=1
ICMP4?=1
MCAST?=1
DEVLOOP?=1
PING?=1
DHCP_CLIENT?=1
DHCP_SERVER?=1
DNS_CLIENT?=1
MDNS?=1
DNS_SD?=1
SNTP_CLIENT?=1
IPFILTER?=1
CRC?=1
OLSR?=0
SLAACV4?=1
TFTP?=1
AODV?=1
MEMORY_MANAGER?=0
MEMORY_MANAGER_PROFILING?=0
TUN?=0
TAP?=0
PCAP?=0
PPP?=0
CYASSL?=0
WOLFSSL?=0
POLARSSL?=0
#IPv6 related
IPV6?=1
EXTRA_CFLAGS+=-dPICO_COMPILE_TIME=`date +%s`
EXTRA_CFLAGS+=$(PLATFORM_CFLAGS)
CFLAGS=-i=$(WATCOM_PATH)/h -i=$(PREFIX)/include -i=include -i=modules $(EXTRA_CFLAGS) -q
ifeq ($(DEBUG),1)
CFLAGS+=-od -of -d9
else
ifeq ($(PERF), 1)
CFLAGS+=
else
CFLAGS+=
endif
endif
ifeq ($(TFTP),1)
MOD_OBJ+=$(LIBBASE)modules/pico_strings.o $(LIBBASE)modules/pico_tftp.o
OPTIONS+=-dPICO_SUPPORT_TFTP
endif
ifeq ($(AODV),1)
MOD_OBJ+=$(LIBBASE)modules/pico_aodv.o
OPTIONS+=-dPICO_SUPPORT_AODV
endif
ifeq ($(GENERIC),1)
CFLAGS+=-dGENERIC
endif
ifeq ($(PTHREAD),1)
CFLAGS+=-dPICO_SUPPORT_PTHREAD
endif
ifneq ($(ENDIAN),little)
CFLAGS+=-dPICO_BIGENDIAN
endif
ifneq ($(RTOS),0)
OPTIONS+=-dPICO_SUPPORT_RTOS
endif
ifeq ($(ARCH),cortexm4-hardfloat)
CFLAGS+=-dCORTEX_M4_HARDFLOAT -mcpu=cortex-m4 -mthumb -mlittle-endian -mfpu=fpv4-sp-d16 -mfloat-abi=hard -mthumb-interwork -fsingle-precision-constant
endif
ifeq ($(ARCH),cortexm4-softfloat)
CFLAGS+=-dCORTEX_M4_SOFTFLOAT -mcpu=cortex-m4 -mthumb -mlittle-endian -mfloat-abi=soft -mthumb-interwork
endif
ifeq ($(ARCH),cortexm3)
CFLAGS+=-dCORTEX_M3 -mcpu=cortex-m3 -mthumb -mlittle-endian -mthumb-interwork
endif
ifeq ($(ARCH),arm9)
CFLAGS+=-dARM9 -mcpu=arm9e -march=armv5te -gdwarf-2 -Wall -marm -mthumb-interwork -fpack-struct
endif
ifeq ($(ADDRESS_SANITIZER),1)
TEST_LDFLAGS+=-fsanitize=address -fno-omit-frame-pointer
endif
ifeq ($(ARCH),faulty)
CFLAGS+=-dFAULTY -dUNIT_TEST
CFLAGS+=-fsanitize=address -fno-omit-frame-pointer
UNITS_OBJ+=test/pico_faulty.o
TEST_OBJ+=test/pico_faulty.o
DUMMY_EXTRA+=test/pico_faulty.o
endif
ifeq ($(ARCH),msp430)
CFLAGS+=-dMSP430
endif
ifeq ($(ARCH),esp8266)
CFLAGS+=-dESP8266 -Wl,-EL -fno-inline-functions -nostdlib -mlongcalls -mtext-section-literals
endif
ifeq ($(ARCH),mt7681)
CFLAGS+=-dMT7681 -fno-builtin -ffunction-sections -fno-strict-aliasing -m16bit -mabi=2 -mbaseline=V2 -mcpu=n9 -mno-div -mel -mmw-count=8 -mno-ext-mac -mno-dx-regs
endif
ifeq ($(ARCH),pic24)
CFLAGS+=-dPIC24 -mcpu=24FJ256GA106 -MMD -MF -g -omf=elf \
-mlarge-code -mlarge-data -msmart-io=1 -msfr-warn=off
endif
ifeq ($(ARCH),atmega128)
CFLAGS+=-Wall -mmcu=atmega128 -dAVR
endif
ifeq ($(ARCH),none)
CFLAGS+=-dARCHNONE
endif
ifeq ($(ARCH),shared)
CFLAGS+=-fPIC
endif
%.o:%.c deps
$(CC) $(CFLAGS) -fo=$@ $<
CORE_OBJ= stack/pico_stack.o \
stack/pico_frame.o \
stack/pico_device.o \
stack/pico_protocol.o \
stack/pico_socket.o \
stack/pico_socket_multicast.o \
stack/pico_tree.o \
stack/pico_md5.o
POSIX_OBJ+= modules/pico_dev_vde.o \
modules/pico_dev_tun.o \
modules/pico_dev_tap.o \
modules/pico_dev_mock.o
ifneq ($(ETH),0)
include rules/eth.mk
endif
ifneq ($(IPV4),0)
include rules/ipv4.mk
endif
ifneq ($(IPV4FRAG),0)
include rules/ipv4frag.mk
endif
ifneq ($(ICMP4),0)
include rules/icmp4.mk
endif
ifneq ($(TCP),0)
include rules/tcp.mk
endif
ifneq ($(UDP),0)
include rules/udp.mk
endif
ifneq ($(MCAST),0)
include rules/mcast.mk
include rules/igmp.mk
endif
ifneq ($(NAT),0)
include rules/nat.mk
endif
ifneq ($(DEVLOOP),0)
include rules/devloop.mk
endif
ifneq ($(DHCP_CLIENT),0)
include rules/dhcp_client.mk
endif
ifneq ($(DHCP_SERVER),0)
include rules/dhcp_server.mk
endif
ifneq ($(DNS_CLIENT),0)
include rules/dns_client.mk
endif
ifneq ($(MDNS),0)
include rules/mdns.mk
endif
ifneq ($(DNS_SD),0)
include rules/dns_sd.mk
endif
ifneq ($(IPFILTER),0)
include rules/ipfilter.mk
endif
ifneq ($(CRC),0)
include rules/crc.mk
endif
ifneq ($(OLSR),0)
include rules/olsr.mk
endif
ifneq ($(SLAACV4),0)
include rules/slaacv4.mk
endif
ifneq ($(IPV6),0)
include rules/ipv6.mk
endif
ifneq ($(MEMORY_MANAGER),0)
include rules/memory_manager.mk
endif
ifneq ($(MEMORY_MANAGER_PROFILING),0)
OPTIONS+=-dPICO_SUPPORT_MM_PROFILING
endif
ifneq ($(SNTP_CLIENT),0)
include rules/sntp_client.mk
endif
ifneq ($(TUN),0)
include rules/tun.mk
endif
ifneq ($(TAP),0)
include rules/tap.mk
endif
ifneq ($(PCAP),0)
include rules/pcap.mk
endif
ifneq ($(PPP),0)
include rules/ppp.mk
endif
ifneq ($(CYASSL),0)
include rules/cyassl.mk
endif
ifneq ($(WOLFSSL),0)
include rules/wolfssl.mk
endif
ifneq ($(POLARSSL),0)
include rules/polarssl.mk
endif
all: mod core lib
core: $(CORE_OBJ)
@mkdir -p $(PREFIX)/lib
@mv stack/*.o $(PREFIX)/lib
mod: $(MOD_OBJ)
@mkdir -p $(PREFIX)/modules
@mv modules/*.o $(PREFIX)/modules || echo
posix: all $(POSIX_OBJ)
@mv modules/*.o $(PREFIX)/modules || echo
TEST_ELF= test/picoapp.elf
TEST6_ELF= test/picoapp6.elf
# Build the smoke-test application.
#
# - Declared phony because a directory named test/ exists in the tree.
# - Uses $(MAKE) for the sub-make so flags and the jobserver are inherited.
# - Uses printf for messages: `echo -e` is not portable across the shells
#   that may be installed as /bin/sh.
# NOTE(review): the compile and link lines pass gcc-style options to $(CC),
# which this file sets to wcc386; confirm this target works with Watcom.
.PHONY: test tst
test: posix
	@mkdir -p $(PREFIX)/test/
	@$(MAKE) -C test/examples PREFIX=$(PREFIX)
	@printf '\t[CC] picoapp.o\n'
	@$(CC) -c -o $(PREFIX)/examples/picoapp.o test/picoapp.c $(CFLAGS) -Itest/examples
	@printf '\t[LD] %s\n' '$@'
	@$(CC) -g -o $(TEST_ELF) -I include -I modules -I $(PREFIX)/include -Wl,--start-group $(TEST_LDFLAGS) $(TEST_OBJ) $(PREFIX)/examples/*.o -Wl,--end-group
	@mv test/*.elf $(PREFIX)/test
	@install $(PREFIX)/$(TEST_ELF) $(PREFIX)/$(TEST6_ELF)

# Short alias for the test target.
tst: test
$(PREFIX)/include/pico_defines.h:
@mkdir -p $(PREFIX)/lib
@mkdir -p $(PREFIX)/include
@bash ./mkdeps.sh $(PREFIX) $(OPTIONS)
deps: $(PREFIX)/include/pico_defines.h
lib: mod core
@cp -f include/*.h $(PREFIX)/include
@cp -fa include/arch $(PREFIX)/include
@cp -f modules/*.h $(PREFIX)/include
@echo -e "\t[AR] $(PREFIX)/lib/$(LIBNAME)"
$(AR) -q -b -n -fag -o=$(PREFIX)/lib/$(LIBNAME) $(PREFIX)/modules/*.o $(PREFIX)/lib/*.o
@echo || $(AR) cru $(PREFIX)/lib/$(LIBNAME) $(PREFIX)/lib/*.o
@echo -e "\t[RANLIB] $(PREFIX)/lib/$(LIBNAME)"
@$(RANLIB) $(PREFIX)/lib/$(LIBNAME)
@echo -e "\t[LIBSIZE] `du -b $(PREFIX)/lib/$(LIBNAME)`"
loop: mod core
mkdir -p $(PREFIX)/test
@$(CC) -c -o $(PREFIX)/modules/pico_dev_loop.o modules/pico_dev_loop.c $(CFLAGS)
@$(CC) -c -o $(PREFIX)/loop_ping.o test/loop_ping.c $(CFLAGS) -ggdb
units: mod core lib $(UNITS_OBJ) $(MOD_OBJ)
@echo -e "\n\t[UNIT TESTS SUITE]"
@mkdir -p $(PREFIX)/test
@echo -e "\t[CC] units.o"
@$(CC) -c -o $(PREFIX)/test/units.o test/units.c $(CFLAGS) -I stack -I modules -I includes -I test/unit -dUNIT_TEST
@echo -e "\t[LD] $(PREFIX)/test/units"
@$(CC) -o $(PREFIX)/test/units $(CFLAGS) $(PREFIX)/test/units.o -lcheck -lm -pthread -lrt \
$(UNITS_OBJ) $(PREFIX)/modules/pico_aodv.o \
$(PREFIX)/modules/pico_fragments.o
@$(CC) -o $(PREFIX)/test/modunit_pico_protocol.elf $(CFLAGS) -I. test/unit/modunit_pico_protocol.c stack/pico_tree.c -lcheck -lm -pthread -lrt $(UNITS_OBJ)
@$(CC) -o $(PREFIX)/test/modunit_pico_frame.elf $(CFLAGS) -I. test/unit/modunit_pico_frame.c stack/pico_tree.c -lcheck -lm -pthread -lrt $(UNITS_OBJ)
@$(CC) -o $(PREFIX)/test/modunit_seq.elf $(CFLAGS) -I. test/unit/modunit_seq.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_tcp.elf $(CFLAGS) -I. test/unit/modunit_pico_tcp.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_dns_client.elf $(CFLAGS) -I. test/unit/modunit_pico_dns_client.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_dns_common.elf $(CFLAGS) -I. test/unit/modunit_pico_dns_common.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_mdns.elf $(CFLAGS) -I. test/unit/modunit_pico_mdns.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_dns_sd.elf $(CFLAGS) -I. test/unit/modunit_pico_dns_sd.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_dev_loop.elf $(CFLAGS) -I. test/unit/modunit_pico_dev_loop.c -lcheck -lm -pthread -lrt $(UNITS_OBJ)
@$(CC) -o $(PREFIX)/test/modunit_ipv6_nd.elf $(CFLAGS) -I. test/unit/modunit_pico_ipv6_nd.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_pico_stack.elf $(CFLAGS) -I. test/unit/modunit_pico_stack.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_tftp.elf $(CFLAGS) -I. test/unit/modunit_pico_tftp.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_sntp_client.elf $(CFLAGS) -I. test/unit/modunit_pico_sntp_client.c -lcheck -lm -pthread -lrt $(UNITS_OBJ)
@$(CC) -o $(PREFIX)/test/modunit_ipfilter.elf $(CFLAGS) -I. test/unit/modunit_pico_ipfilter.c stack/pico_tree.c -lcheck -lm -pthread -lrt $(UNITS_OBJ)
@$(CC) -o $(PREFIX)/test/modunit_aodv.elf $(CFLAGS) -I. test/unit/modunit_pico_aodv.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_fragments.elf $(CFLAGS) -I. test/unit/modunit_pico_fragments.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
@$(CC) -o $(PREFIX)/test/modunit_queue.elf $(CFLAGS) -I. test/unit/modunit_queue.c -lcheck -lm -pthread -lrt $(UNITS_OBJ)
@$(CC) -o $(PREFIX)/test/modunit_dev_ppp.elf $(CFLAGS) -I. test/unit/modunit_pico_dev_ppp.c -lcheck -lm -pthread -lrt $(UNITS_OBJ) $(PREFIX)/lib/libpicotcp.a
devunits: mod core lib
@echo -e "\n\t[UNIT TESTS SUITE: device drivers]"
@mkdir -p $(PREFIX)/test/unit/device/
@echo -e "\t[CC] picotcp_mock.o"
@$(CC) -c -o $(PREFIX)/test/unit/device/picotcp_mock.o $(CFLAGS) -I stack -I modules -I includes -I test/unit test/unit/device/picotcp_mock.c
@$(CC) -c -o $(PREFIX)/test/unit/device/unit_dev_vde.o $(CFLAGS) -I stack -I modules -I includes -I test/unit test/unit/device/unit_dev_vde.c
@echo -e "\t[LD] $(PREFIX)/test/devunits"
@$(CC) -o $(PREFIX)/test/devunits $(CFLAGS) -I stack $(PREFIX)/test/unit/device/*.o -lcheck -lm -pthread -lrt
units_mm: mod core lib
@echo -e "\n\t[UNIT TESTS SUITE]"
@mkdir -p $(PREFIX)/test
@echo -e "\t[CC] units_mm.o"
@$(CC) -c -o $(PREFIX)/test/units_mm.o test/unit/unit_mem_manager.c $(CFLAGS) -I stack -I modules -I includes -I test/unit
@echo -e "\t[LD] $(PREFIX)/test/units"
@$(CC) -o $(PREFIX)/test/units_mm $(CFLAGS) $(PREFIX)/test/units_mm.o -lcheck -lm -pthread -lrt
clean:
@echo -e "\t[CLEAN] $(PREFIX)/"
@rm -rf $(PREFIX) tags
# Create PicoTCP.zip for mbed: the sources are zipped with `#define MBED`
# temporarily prepended to include/pico_socket.h.
#
# The header is patched, zipped and restored within one shell invocation so
# that the original pico_socket.h is put back even when cat or zip fails; the
# recipe still exits with the failing status. printf writes the define plus a
# blank line regardless of how the shell's echo treats backslash escapes.
mbed:
	@printf '\t[Creating PicoTCP.zip]\n'
	@rm -f PicoTCP.zip
	@cp include/pico_socket.h include/socket.tmp
	@printf '#define MBED\n\n' > include/mbed.tmp
	@cat include/mbed.tmp include/socket.tmp > include/pico_socket.h \
		&& zip -0 PicoTCP.zip -r include modules stack -x include/arch/ include/arch/* include/pico_config.h include/*.tmp modules/pico_dev_*; \
		status=$$?; \
		rm -f include/mbed.tmp; \
		mv -f include/socket.tmp include/pico_socket.h; \
		exit $$status
# Reformat all C sources and headers with uncrustify, then delete its backups.
#
# The glob is *.[ch]: inside a bracket expression `|` is a literal character,
# so the former *.[c|h] also matched names ending in ".|". find -exec ... {} +
# batches the file names like xargs did, but copes with spaces in names and
# does not run the command at all when nothing matches.
.PHONY: style
style:
	@find . -iname "*.[ch]" -exec uncrustify --replace -l C -c uncrustify.cfg {} + || true
	@find . -iname "*unc-backup*" -exec rm -f {} + || true
dummy: mod core lib $(DUMMY_EXTRA)
@echo testing configuration...
@$(CC) -c -o test/dummy.o test/dummy.c $(CFLAGS)
@$(CC) -o dummy test/dummy.o $(DUMMY_EXTRA) $(PREFIX)/lib/libpicotcp.a $(LDFLAGS) $(CFLAGS)
@echo done.
@rm -f test/dummy.o dummy
# Build the PPP test program against the freshly built library.
# Paths follow $(PREFIX) instead of a hard-coded build/ directory, so the
# target also works when PREFIX is overridden.
# NOTE(review): this still calls the host gcc with the Watcom-style $(CFLAGS)
# (-i=..., -q); confirm which compiler and flags are intended here.
ppptest: test/ppp.c lib
	gcc -ggdb -c -o ppp.o test/ppp.c -I $(PREFIX)/include/ -I $(PREFIX)/modules/ $(CFLAGS)
	gcc -o ppp ppp.o $(PREFIX)/lib/libpicotcp.a $(LDFLAGS) $(CFLAGS)
	rm -f ppp.o
FORCE:

234
kernel/picotcp/README.md Normal file
View File

@ -0,0 +1,234 @@
# picoTCP
---------------
Welcome to the one and only <font color=ff00f0>picoTCP repository</font>.
picoTCP is a small-footprint, modular TCP/IP stack designed for embedded systems and the Internet of Things. It's actively being developed by *[Altran Intelligent Systems](http://intelligent-systems.altran.com/)*.
This code is released under the terms of GNU GPL v2 and GNU GPL v3. Some rights reserved.
Other licenses may apply at the sole discretion of the copyright holders.
Learn how to use picoTCP in your project by going through the **Getting Started guide** on our [GitHub wiki](https://github.com/tass-belgium/picotcp/wiki).
For more information send us an email or contact us on [Twitter](https://twitter.com/picotcp), [Facebook](https://www.facebook.com/picoTCP) or [Reddit](http://www.reddit.com/r/picotcp/).
Wondering about picoTCP's code quality? Check [our TiCS score](http://162.13.112.57:42506/tiobeweb/TICS/TqiDashboard.html#axes=Project()&metric=tqi&sel=Project(PicoTCP_rel))
---------------
## Continuous integration
Functional tests:
[![Jenkins autotest](http://162.13.84.104:8080/buildStatus/icon?job=picoTCP_Rel/PicoTCP_rel_autotest)](http://162.13.84.104:8080/job/picoTCP_Rel/job/PicoTCP_rel_autotest) -
Unit tests :
[![Jenkins unit tests](http://162.13.84.104:8080/buildStatus/icon?job=picoTCP_Rel/PicoTCP_rel_unit_tests)](http://162.13.84.104:8080/job/picoTCP_Rel/job/PicoTCP_rel_unit_tests) -
RFC compliance :
[![Jenkins RFC Compliance](http://162.13.84.104:8080/buildStatus/icon?job=picoTCP_Rel/PicoTCP_rel_RF_mbed)](http://162.13.84.104:8080/job/picoTCP_Rel/job/PicoTCP_rel_RF_mbed) -
TICS quality :
[![Jenkins TICS](http://162.13.84.104:8080/buildStatus/icon?job=picoTCP_Rel/PicoTCP_rel_TICS)](http://162.13.84.104:8080/job/picoTCP_Rel/job/PicoTCP_rel_TICS/)
Coverity Scan Build status:
[![Coverity Scan Build Status](https://scan.coverity.com/projects/7944/badge.svg)](https://scan.coverity.com/projects/7944)
---------------
## It runs on (pretty much) everything
By keeping interfaces simple, the porting effort to new platforms and OSes is very low. To give you an indication: porting to a new platform can be done in 3 days or less, a new OS in a single day and if you really go crazy, you can do an initial port in a single evening. Different platforms mean different compilers; that's why we continuously compile our stack with a bunch of them. The following list shows some of the currently supported platforms, device drivers and compilers.
### PicoTCP has been used with
**Platforms picoTCP runs on**:
ARM Cortex-M series (ST Micro STM, NXP LPC, TI Stellaris, Freescale K64F),
ARM ARM9-series (ST Micro STR9),
Texas Instruments (MSP430),
Microchip (PIC24, PIC32),
Atmel (AVR 8bit),
Linux (User space (TUN/TAP), Kernel space),
Windows (User space (TAP))
**Network devices picoTCP has worked with**:
BCM43362 (IEEE 802.11), MRF24WG (IEEE 802.11), LPC Ethernet ENET/EMAC (IEEE 802.3), Stellaris Ethernet (IEEE 802.3), STM32 Ethernet (IEEE 802.3), Wiznet W5100 (IEEE 802.3), USB CDC-ECM (CDC1.2), PPP, Virtual drivers ( TUN/TAP, VDE, Libpcap)
**(RT)OSes picoTCP has been integrated into**:
No OS / Bare metal, FreeRTOS, mbed-RTOS, Frosted, linux / POSIX, MS DOS, MS Windows
**Libraries picoTCP has been integrated with**:
wolfSSL, mbedTLS, Mongoose RESTful library, MicroPython
**Compilers picoTCP compiles under**:
GCC, Clang, TCC, ARM-RCVT, IAR, XC-16, XC-32, MSP-GCC, AVR-GCC
Unfortunately we can't release all the code, among other reasons because some parts depend on code or binaries that aren't GPL compatible, some parts were developed under a commercial contract, and some consist of very rough proof-of-concept code.
If you want to know more about the availability under the commercial license, or the possibility of using our expert services for porting or driver development, feel free to contact us at picotcp@altran.com.
Your favorite not in the list? Check out the wiki for information and examples on how to port picoTCP to a new platform!
---------------
## Highly configurable and modular design
Features are developed as modules in picoTCP, allowing you to pick the features you want in your application. This results in the smallest possible stack that remains compliant with the internet standards. The schematic below provides an overview of all implemented protocols.
![modular](https://s1.postimg.org/139xbnv7lb/image.png)
---------------
## Simple example
### Preparations
This example uses Ubuntu 14.04. It works on other Linux distributions as well, though you may need to change some package names. See [setting up the environment](https://github.com/tass-belgium/picotcp/wiki/Setting-up-the-environment#prerequisite-packages) for some more info.
```bash
sudo apt-get install git check vde2 libvdeplug2-dev libpcap0.8-dev openvpn wireshark
git clone https://github.com/tass-belgium/picotcp
cd picotcp
make TAP=1
cd ..
```
### The code
Then make a new directory, e.g. `example`, and create a file with the following content :
[//]: # (The code below is pulled through our CI - please leave the code extractor comments intact!)
[//]: # (code extractor start)
```C
#include <time.h>
#include <pico_stack.h>
#include <pico_ipv4.h>
#include <pico_icmp4.h>
#include <pico_dev_tap.h>
#define NUM_PING 10

static int finished = 0;

/* Ping callback: called for every reply received and whenever a ping runs
 * into a problem. Sets `finished` once the run is over. */
void cb_ping(struct pico_icmp4_stats *s)
{
    char host[30];

    pico_ipv4_to_string(host, s->dst.addr);

    if (s->err != 0) {
        /* something went wrong: print it and signal that we want to stop */
        printf("PING %lu to %s: Error %d\n", s->seq, host, s->err);
        finished = 1;
        return;
    }

    /* all is well: print some pretty info */
    printf("%lu bytes from %s: icmp_req=%lu ttl=%lu time=%lu ms\n", s->size,
           host, s->seq, s->ttl, (long unsigned int)s->time);

    /* the last expected reply has arrived */
    if (s->seq >= NUM_PING) {
        finished = 1;
    }
}
/* Bring up the stack on tap0, ping 192.168.5.5 NUM_PING times and keep
 * ticking the stack until the ping callback reports completion.
 * Returns 0 on success, -1 if the tap device or the ping could not be set up. */
int main(void)
{
    struct pico_device *dev;
    struct pico_ip4 ipaddr, netmask;
    int id;

    /* initialise the stack. Super important if you don't want ugly stuff like
     * segfaults and such! */
    pico_stack_init();

    /* create the tap device */
    dev = pico_tap_create("tap0");
    if (!dev) {
        return -1;
    }

    /* assign the IP address to the tap interface */
    pico_string_to_ipv4("192.168.5.4", &ipaddr.addr);
    pico_string_to_ipv4("255.255.255.0", &netmask.addr);
    pico_ipv4_link_add(dev, ipaddr, netmask);

    printf("starting ping\n");
    id = pico_icmp4_ping("192.168.5.5", NUM_PING, 1000, 10000, 64, cb_ping);
    if (id == -1) {
        return -1;
    }

    /* keep running stack ticks to have picoTCP do its network magic. Note that
     * you can do other stuff here as well, or sleep a little. This will impact
     * your network performance, but everything should keep working (provided
     * you don't go overboard with the delays). */
    while (finished != 1) {
        usleep(1000);
        pico_stack_tick();
    }

    printf("finished !\n");
    return 0;
}
```
[//]: # (code extractor stop)
### Building and running
Now we can compile this and link it, by running
```bash
gcc -c -o main.o -I../picotcp/build/include main.c
gcc -o main.elf main.o ../picotcp/build/lib/libpicotcp.a
```
Next we'll create a persistent tap device - a virtual network port. You don't need to repeat this each time; the device will exist until you reboot, or until you run `sudo tunctl -d tap0`
```bash
sudo tunctl -u <username>
sudo ifconfig tap0 192.168.5.5
```
Now, you should be able to run `./main.elf`, and see output like
```
Protocol ethernet registered (layer: 2).
Protocol ipv4 registered (layer: 3).
Protocol ipv6 registered (layer: 3).
Protocol icmp4 registered (layer: 4).
Protocol icmp6 registered (layer: 4).
Protocol igmp registered (layer: 4).
Protocol udp registered (layer: 4).
Protocol tcp registered (layer: 4).
Device tap0 created.
Assigned ipv4 192.168.5.4 to device tap0
starting ping
64 bytes from 192.168.5.5: icmp_req=1 ttl=64 time=5 ms
64 bytes from 192.168.5.5: icmp_req=2 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=3 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=4 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=5 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=6 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=7 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=8 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=9 ttl=64 time=0 ms
64 bytes from 192.168.5.5: icmp_req=10 ttl=64 time=0 ms
finished !
```
While the application is running, you can also run
```
ping 192.168.5.4
```
to send pings in the other direction.
### Investigating what happened
Run wireshark, and sniff the tap0 interface. Then run the `./main.elf` again, and see what happens. You should see an ARP request from picoTCP to Linux, and a reply. After that you should see the ping requests and replies going back and forth.
Note, sometimes you may see lots of other stuff: IPv6 router solicitations, various broadcasts, mDNS, DNS-SD, etc. This happens when your Linux notices the new network interface is up, and starts all sorts of discoveries. With the persistent TAP device, this usually only happens the first time you start the application. Start a new wireshark capture, and start the application again, it should be much cleaner now.
Now you could make some changes to the `main.c` file, and experiment a bit! Keep some statistics of your pings (max, min, avg time). Open a UDP socket, send some stuff to a netcat instance on your linux. Or build a rudimentary port scanner, see what ports are open on your machine.
This is just a very quick overview, more info can be found in our [wiki](https://github.com/tass-belgium/picotcp/wiki).
---------------
## Contributors
Contributors are very welcome. Report a bug, suggest a way to improve our documentation, or write some new code.
Note however that, before accepting your code, we would ask you to sign our [Contributors License Agreement](https://docs.google.com/forms/d/1-z6lsT75l6ZIrgHGEWrWdHylJ6xxpjc7FwGfL2ilDFU/viewform). Your code remains under your copyright, and will always be available under GPLv2 and GPLv3. However, this CLA enables us to use picoTCP (including code from external contributors like you) under other licenses, including our commercial license. By doing commercial projects, we can keep investing in the quality and features of picoTCP.

15
kernel/picotcp/RFC/get_all_rfc Executable file
View File

@ -0,0 +1,15 @@
#!/bin/sh
# Download the RFC documents used as reference material into the current
# directory:
#   1. RFC 4614 and every RFC it cites as "[RFCnnnn]",
#   2. RFC 3927,
#   3. the PPP related RFCs.
# Downloads are best effort: a failing wget does not abort the script.
# Only POSIX sh/grep/sed features are used, so no GNU tools are required.

BASE_URL=http://tools.ietf.org/rfc

wget -O rfc4614.txt "${BASE_URL}/rfc4614.txt"

# Rewrite each "[RFCnnnn]..." citation to "rfcnnnn.txt". Results that contain
# "rfc " or consist of a trailing bare "rfc" are discarded, and duplicates are
# removed.
for RFC in $(grep '\[RFC' rfc4614.txt | sed -e 's/^.*RFC/rfc/' | grep -v -e 'rfc ' -e 'rfc$' | sed -e 's/\].*$/.txt/g' | sort -u); do
    wget -O "${RFC}" "${BASE_URL}/${RFC}"
done

wget -O rfc3927.txt "${BASE_URL}/rfc3927.txt"

# Get PPP related RFC's
for NUM in 1332 1334 1661 1662 1877 1994; do
    RFC="rfc${NUM}.txt"
    wget -O "${RFC}" "${BASE_URL}/${RFC}"
done

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,763 @@
RFC: 814
NAME, ADDRESSES, PORTS, AND ROUTES
David D. Clark
MIT Laboratory for Computer Science
Computer Systems and Communications Group
July, 1982
1. Introduction
It has been said that the principal function of an operating system
is to define a number of different names for the same object, so that it
can busy itself keeping track of the relationship between all of the
different names. Network protocols seem to have somewhat the same
characteristic. In TCP/IP, there are several ways of referring to
things. At the human visible interface, there are character string
"names" to identify networks, hosts, and services. Host names are
translated into network "addresses", 32-bit values that identify the
network to which a host is attached, and the location of the host on
that net. Service names are translated into a "port identifier", which
in TCP is a 16-bit value. Finally, addresses are translated into
"routes", which are the sequence of steps a packet must take to reach
the specified addresses. Routes show up explicitly in the form of the
internet routing options, and also implicitly in the address to route
translation tables which all hosts and gateways maintain.
This RFC gives suggestions and guidance for the design of the
tables and algorithms necessary to keep track of these various sorts of
identifiers inside a host implementation of TCP/IP.
2
2. The Scope of the Problem
One of the first questions one can ask about a naming mechanism is
how many names one can expect to encounter. In order to answer this, it
is necessary to know something about the expected maximum size of the
internet. Currently, the internet is fairly small. It contains no more
than 25 active networks, and no more than a few hundred hosts. This
makes it possible to install tables which exhaustively list all of these
elements. However, any implementation undertaken now should be based on
an assumption of a much larger internet. The guidelines currently
recommended are an upper limit of about 1,000 networks. If we imagine
an average number of 25 hosts per net, this would suggest a maximum
number of 25,000 hosts. It is quite unclear whether this host estimate
is high or low, but even if it is off by several factors of two, the
resulting number is still large enough to suggest that current table
management strategies are unacceptable. Some fresh techniques will be
required to deal with the internet of the future.
3. Names
As the previous section suggests, the internet will eventually have
a sufficient number of names that a host cannot have a static table
which provides a translation from every name to its associated address.
There are several reasons other than sheer size why a host would not
wish to have such a table. First, with that many names, we can expect
names to be added and deleted at such a rate that an installer might
spend all his time just revising the table. Second, most of the names
will refer to addresses of machines with which nothing will ever be
3
exchanged. In fact, there may be whole networks with which a particular
host will never have any traffic.
To cope with this large and somewhat dynamic environment, the
internet is moving from its current position in which a single name
table is maintained by the NIC and distributed to all hosts, to a
distributed approach in which each network (or group of networks) is
responsible for maintaining its own names and providing a "name server"
to translate between the names and the addresses in that network. Each
host is assumed to store not a complete set of name-address
translations, but only a cache of recently used names. When a name is
provided by a user for translation to an address, the host will first
examine its local cache, and if the name is not found there, will
communicate with an appropriate name server to obtain the information,
which it may then insert into its cache for future reference.
Unfortunately, the name server mechanism is not totally in place in
the internet yet, so for the moment, it is necessary to continue to use
the old strategy of maintaining a complete table of all names in every
host. Implementors, however, should structure this table in such a way
that it is easy to convert later to a name server approach. In
particular, a reasonable programming strategy would be to make the name
table accessible only through a subroutine interface, rather than by
scattering direct references to the table all through the code. In this
way, it will be possible, at a later date, to replace the subroutine
with one capable of making calls on remote name servers.
A problem which occasionally arises in the ARPANET today is that
4
the information in a local host table is out of date, because a host has
moved, and a revision of the host table has not yet been installed from
the NIC. In this case, one attempts to connect to a particular host and
discovers an unexpected machine at the address obtained from the local
table. If a human is directly observing the connection attempt, the
error is usually detected immediately. However, for unattended
operations such as the sending of queued mail, this sort of problem can
lead to a great deal of confusion.
The nameserver scheme will only make this problem worse, if hosts
cache locally the address associated with names that have been looked
up, because the host has no way of knowing when the address has changed
and the cache entry should be removed. To solve this problem, plans are
currently under way to define a simple facility by which a host can
query a foreign address to determine what name is actually associated
with it. SMTP already defines a verification technique based on this
approach.
4. Addresses
The IP layer must know something about addresses. In particular,
when a datagram is being sent out from a host, the IP layer must decide
where to send it on the immediately connected network, based on the
internet address. Mechanically, the IP first tests the internet address
to see whether the network number of the recipient is the same as the
network number of the sender. If so, the packet can be sent directly to
the final recipient. If not, the datagram must be sent to a gateway for
further forwarding. In this latter case, a second decision must be
5
made, as there may be more than one gateway available on the immediately
attached network.
When the internet address format was first specified, 8 bits were
reserved to identify the network. Early implementations thus
implemented the above algorithm by means of a table with 256 entries,
one for each possible net, that specified the gateway of choice for that
net, with a special case entry for those nets to which the host was
immediately connected. Such tables were sometimes statically filled in,
which caused confusion and malfunctions when gateways and networks moved
(or crashed).
The current definition of the internet address provides three
different options for network numbering, with the goal of allowing a
very large number of networks to be part of the internet. Thus, it is
no longer possible to imagine having an exhaustive table to select a
gateway for any foreign net. Again, current implementations must use a
strategy based on a local cache of routing information for addresses
currently being used.
The recommended strategy for address to route translation is as
follows. When the IP layer receives an outbound datagram for
transmission, it extracts the network number from the destination
address, and queries its local table to determine whether it knows a
suitable gateway to which to send the datagram. If it does, the job is
done. (But see RFC 816 on Fault Isolation and Recovery, for
recommendations on how to deal with the possible failure of the
gateway.) If there is no such entry in the local table, then select any
6
accessible gateway at random, insert that as an entry in the table, and
use it to send the packet. Either the guess will be right or wrong. If
it is wrong, the gateway to which the packet was sent will return an
ICMP redirect message to report that there is a better gateway to reach
the net in question. The arrival of this redirect should cause an
update of the local table.
The number of entries in the local table should be determined by
the maximum number of active connections which this particular host can
support at any one time. For a large time sharing system, one might
imagine a table with 100 or more entries. For a personal computer being
used to support a single user telnet connection, only one address to
gateway association need be maintained at once.
The above strategy actually does not completely solve the problem,
but only pushes it down one level, where the problem then arises of how
a new host, freshly arriving on the internet, finds all of its
accessible gateways. Intentionally, this problem is not solved within
the internetwork architecture. The reason is that different networks
have drastically different strategies for allowing a host to find out
about other hosts on its immediate network. Some nets permit a
broadcast mechanism. In this case, a host can send out a message and
expect an answer back from all of the attached gateways. In other
cases, where a particular network is richly provided with tools to
support the internet, there may be a special network mechanism which a
host can invoke to determine where the gateways are. In other cases, it
may be necessary for an installer to manually provide the name of at
7
least one accessible gateway. Once a host has discovered the name of
one gateway, it can build up a table of all other available gateways, by
keeping track of every gateway that has been reported back to it in an
ICMP message.
5. Advanced Topics in Addressing and Routing
The preceding discussion describes the mechanism required in a
minimal implementation, an implementation intended only to provide
operational service access today to the various networks that make up
the internet. For any host which will participate in future research,
as contrasted with service, some additional features are required.
These features will also be helpful for service hosts if they wish to
obtain access to some of the more exotic networks which will become part
of the internet over the next few years. All implementors are urged to
at least provide a structure into which these features could be later
integrated.
There are several features, either already a part of the
architecture or now under development, which are used to modify or
expand the relationships between addresses and routes. The IP source
route options allow a host to explicitly direct a datagram through a
series of gateways to its foreign host. An alternative form of the ICMP
redirect packet has been proposed, which would return information
specific to a particular destination host, not a destination net.
Finally, additional IP options have been proposed to identify particular
routes within the internet that are unacceptable. The difficulty with
implementing these new features is that the mechanisms do not lie
8
entirely within the bounds of IP. All the mechanisms above are designed
to apply to a particular connection, so that their use must be specified
at the TCP level. Thus, the interface between IP and the layers above
it must include mechanisms to allow passing this information back and
forth, and TCP (or any other protocol at this level, such as UDP), must
be prepared to store this information. The passing of information
between IP and TCP is made more complicated by the fact that some of the
information, in particular ICMP packets, may arrive at any time. The
normal interface envisioned between TCP and IP is one across which
packets can be sent or received. The existence of asynchronous ICMP
messages implies that there must be an additional channel between the
two, unrelated to the actual sending and receiving of data. (In fact,
there are many other ICMP messages which arrive asynchronously and which
must be passed from IP up to higher layers. See RFC 816, Fault
Isolation and Recovery.)
Source routes are already in use in the internet, and many
implementations will wish to be able to take advantage of them. The
following sorts of usages should be permitted. First, a user, when
initiating a TCP connection, should be able to hand a source route into
TCP, which in turn must hand the source route to IP with every outgoing
datagram. The user might initially obtain the source route by querying
a different sort of name server, which would return a source route
instead of an address, or the user may have fabricated the source route
manually. A TCP which is listening for a connection, rather than
attempting to open one, must be prepared to receive a datagram which
contains a IP return route, in which case it must remember this return
route, and use it as a source route on all returning datagrams.
9
6. Ports and Service Identifiers
The IP layer of the architecture contains the address information
which specifies the destination host to which the datagram is being
sent. In fact, datagrams are not intended just for particular hosts,
but for particular agents within a host, processes or other entities
that are the actual source and sink of the data. IP performs only a
very simple dispatching once the datagram has arrived at the target
host, it dispatches it to a particular protocol. It is the
responsibility of that protocol handler, for example TCP, to finish
dispatching the datagram to the particular connection for which it is
destined. This next layer of dispatching is done using "port
identifiers", which are a part of the header of the higher level
protocol, and not the IP layer.
This two-layer dispatching architecture has caused a problem for
certain implementations. In particular, some implementations have
wished to put the IP layer within the kernel of the operating system,
and the TCP layer as a user domain application program. Strict
adherence to this partitioning can lead to grave performance problems,
for the datagram must first be dispatched from the kernel to a TCP
process, which then dispatches the datagram to its final destination
process. The overhead of scheduling this dispatch process can severely
limit the achievable throughput of the implementation.
As is discussed in RFC 817, Modularity and Efficiency in Protocol
Implementations, this particular separation between kernel and user
leads to other performance problems, even ignoring the issue of port
10
level dispatching. However, there is an acceptable shortcut which can
be taken to move the higher level dispatching function into the IP
layer, if this makes the implementation substantially easier.
In principle, every higher level protocol could have a different
dispatching algorithm. The reason for this is discussed below.
However, for the protocols involved in the service offering being
implemented today, TCP and UDP, the dispatching algorithm is exactly the
same, and the port field is located in precisely the same place in the
header. Therefore, unless one is interested in participating in further
protocol research, there is only one higher level dispatch algorithm.
This algorithm takes into account the internet level foreign address,
the protocol number, and the local port and foreign port from the higher
level protocol header. This algorithm can be implemented as a sort of
adjunct to the IP layer implementation, as long as no other higher level
protocols are to be implemented. (Actually, the above statement is only
partially true, in that the UDP dispatch function is subset of the TCP
dispatch function. UDP dispatch depends only protocol number and local
port. However, there is an occasion within TCP when this exact same
subset comes into play, when a process wishes to listen for a connection
from any foreign host. Thus, the range of mechanisms necessary to
support TCP dispatch are also sufficient to support precisely the UDP
requirement.)
The decision to remove port level dispatching from IP to the higher
level protocol has been questioned by some implementors. It has been
argued that if all of the address structure were part of the IP layer,
11
then IP could do all of the packet dispatching function within the host,
which would lead to a simpler modularity. Three problems were
identified with this. First, not all protocol implementors could agree
on the size of the port identifier. TCP selected a fairly short port
identifier, 16 bits, to reduce header size. Other protocols being
designed, however, wanted a larger port identifier, perhaps 32 bits, so
that the port identifier, if properly selected, could be considered
probabilistically unique. Thus, constraining the port id to one
particular IP level mechanism would prevent certain fruitful lines of
research. Second, ports serve a special function in addition to
datagram delivery: certain port numbers are reserved to identify
particular services. Thus, TCP port 23 is the remote login service. If
ports were implemented at the IP level, then the assignment of well
known ports could not be done on a protocol basis, but would have to be
done in a centralized manner for all of the IP architecture. Third, IP
was designed with a very simple layering role: IP contained exactly
those functions that the gateways must understand. If the port idea had
been made a part of the IP layer, it would have suggested that gateways
needed to know about ports, which is not the case.
There are, of course, other ways to avoid these problems. In
particular, the "well-known port" problem can be solved by devising a
second mechanism, distinct from port dispatching, to name well-known
ports. Several protocols have settled on the idea of including, in the
packet which sets up a connection to a particular service, a more
general service descriptor, such as a character string field. These
special packets, which are requesting connection to a particular
12
service, are routed on arrival to a special server, sometimes called a
"rendezvous server", which examines the service request, selects a
random port which is to be used for this instance of the service, and
then passes the packet along to the service itself to commence the
interaction.
For the internet architecture, this strategy had the serious flaw
that it presumed all protocols would fit into the same service paradigm:
an initial setup phase, which might contain a certain overhead such as
indirect routing through a rendezvous server, followed by the packets of
the interaction itself, which would flow directly to the process
providing the service. Unfortunately, not all high level protocols in
internet were expected to fit this model. The best example of this is
isolated datagram exchange using UDP. The simplest exchange in UDP is
one process sending a single datagram to another. Especially on a local
net, where the net related overhead is very low, this kind of simple
single datagram interchange can be extremely efficient, with very low
overhead in the hosts. However, since these individual packets would
not be part of an established connection, if IP supported a strategy
based on a rendezvous server and service descriptors, every isolated
datagram would have to be routed indirectly in the receiving host
through the rendezvous server, which would substantially increase the
overhead of processing, and every datagram would have to carry the full
service request field, which would increase the size of the packet
header.
In general, if a network is intended for "virtual circuit service",
13
or things similar to that, then using a special high overhead mechanism
for circuit setup makes sense. However, current directions in research
are leading away from this class of protocol, so once again the
architecture was designed not to preclude alternative protocol
structures. The only rational position was that the particular
dispatching strategy used should be part of the higher level protocol
design, not the IP layer.
This same argument about circuit setup mechanisms also applies to
the design of the IP address structure. Many protocols do not transmit
a full address field as part of every packet, but rather transmit a
short identifier which is created as part of a circuit setup from source
to destination. If the full address needs to be carried in only the
first packet of a long exchange, then the overhead of carrying a very
long address field can easily be justified. Under these circumstances,
one can create truly extravagant address fields, which are capable of
extending to address almost any conceivable entity. However, this
strategy is useable only in a virtual circuit net, where the packets
being transmitted are part of a established sequence, otherwise this
large extravagant address must be transported on every packet. Since
Internet explicitly rejected this restriction on the architecture, it
was necessary to come up with an address field that was compact enough
to be sent in every datagram, but general enough to correctly route the
datagram through the catanet without a previous setup phase. The IP
address of 32 bits is the compromise that results. Clearly it requires
a substantial amount of shoehorning to address all of the interesting
places in the universe with only 32 bits. On the other hand, had the
14
address field become much bigger, IP would have been susceptible to
another criticism, which is that the header had grown unworkably large.
Again, the fundamental design decision was that the protocol be designed
in such a way that it supported research in new and different sorts of
protocol architectures.
There are some limited restrictions imposed by the IP design on the
port mechanism selected by the higher level process. In particular,
when a packet goes awry somewhere on the internet, the offending packet
is returned, along with an error indication, as part of an ICMP packet.
An ICMP packet returns only the IP layer, and the next 64 bits of the
original datagram. Thus, any higher level protocol which wishes to sort
out from which port a particular offending datagram came must make sure
that the port information is contained within the first 64 bits of the
next level header. This also means, in most cases, that it is possible
to imagine, as part of the IP layer, a port dispatch mechanism which
works by masking and matching on the first 64 bits of the incoming
higher level header.

View File

@ -0,0 +1,648 @@
RFC: 816
FAULT ISOLATION AND RECOVERY
David D. Clark
MIT Laboratory for Computer Science
Computer Systems and Communications Group
July, 1982
1. Introduction
Occasionally, a network or a gateway will go down, and the sequence
of hops which the packet takes from source to destination must change.
Fault isolation is that action which hosts and gateways collectively
take to determine that something is wrong; fault recovery is the
identification and selection of an alternative route which will serve to
reconnect the source to the destination. In fact, the gateways perform
most of the functions of fault isolation and recovery. There are,
however, a few actions which hosts must take if they wish to provide a
reasonable level of service. This document describes the portion of
fault isolation and recovery which is the responsibility of the host.
2. What Gateways Do
Gateways collectively implement an algorithm which identifies the
best route between all pairs of networks. They do this by exchanging
packets which contain each gateway's latest opinion about the
operational status of its neighbor networks and gateways. Assuming that
this algorithm is operating properly, one can expect the gateways to go
through a period of confusion immediately after some network or gateway
2
has failed, but one can assume that once a period of negotiation has
passed, the gateways are equipped with a consistent and correct model of
the connectivity of the internet. At present this period of negotiation
may actually take several minutes, and many TCP implementations time out
within that period, but it is a design goal of the eventual algorithm
that the gateway should be able to reconstruct the topology quickly
enough that a TCP connection should be able to survive a failure of the
route.
3. Host Algorithm for Fault Recovery
Since the gateways always attempt to have a consistent and correct
model of the internetwork topology, the host strategy for fault recovery
is very simple. Whenever the host feels that something is wrong, it
asks the gateway for advice, and, assuming the advice is forthcoming, it
believes the advice completely. The advice will be wrong only during
the transient period of negotiation, which immediately follows an
outage, but will otherwise be reliably correct.
In fact, it is never necessary for a host to explicitly ask a
gateway for advice, because the gateway will provide it as appropriate.
When a host sends a datagram to some distant net, the host should be
prepared to receive back either of two advisory messages which the
gateway may send. The ICMP "redirect" message indicates that the
gateway to which the host sent the datagram is no longer the best
gateway to reach the net in question. The gateway will have forwarded
the datagram, but the host should revise its routing table to have a
different immediate address for this net. The ICMP "destination
3
unreachable" message indicates that as a result of an outage, it is
currently impossible to reach the addressed net or host in any manner.
On receipt of this message, a host can either abandon the connection
immediately without any further retransmission, or resend slowly to see
if the fault is corrected in reasonable time.
If a host could assume that these two ICMP messages would always
arrive when something was amiss in the network, then no other action on
the part of the host would be required in order to maintain its tables in
an optimal condition. Unfortunately, there are two circumstances under
which the messages will not arrive properly. First, during the
transient following a failure, error messages may arrive that do not
correctly represent the state of the world. Thus, hosts must take an
isolated error message with some scepticism. (This transient period is
discussed more fully below.) Second, if the host has been sending
datagrams to a particular gateway, and that gateway itself crashes, then
all the other gateways in the internet will reconstruct the topology,
but the gateway in question will still be down, and therefore cannot
provide any advice back to the host. As long as the host continues to
direct datagrams at this dead gateway, the datagrams will simply vanish
off the face of the earth, and nothing will come back in return. Hosts
must detect this failure.
If some gateway many hops away fails, this is not of concern to the
host, for then the discovery of the failure is the responsibility of the
immediate neighbor gateways, which will perform this action in a manner
invisible to the host. The problem only arises if the very first
4
gateway, the one to which the host is immediately sending the datagrams,
fails. We thus identify one single task which the host must perform as
its part of fault isolation in the internet: the host must use some
strategy to detect that a gateway to which it is sending datagrams is
dead.
Let us assume for the moment that the host implements some
algorithm to detect failed gateways; we will return later to discuss
what this algorithm might be. First, let us consider what the host
should do when it has determined that a gateway is down. In fact, with
the exception of one small problem, the action the host should take is
extremely simple. The host should select some other gateway, and try
sending the datagram to it. Assuming that gateway is up, this will
either produce correct results, or some ICMP advice. Since we assume
that, ignoring temporary periods immediately following an outage, any
gateway is capable of giving correct advice, once the host has received
advice from any gateway, that host is in as good a condition as it can
hope to be.
There is always the unpleasant possibility that when the host tries
a different gateway, that gateway too will be down. Therefore, whatever
algorithm the host uses to detect a dead gateway must continuously be
applied, as the host tries every gateway in turn that it knows about.
The only difficult part of this algorithm is to specify the means
by which the host maintains the table of all of the gateways to which it
has immediate access. Currently, the specification of the internet
protocol does not architect any message by which a host can ask to be
5
supplied with such a table. The reason is that different networks may
provide very different mechanisms by which this table can be filled in.
For example, if the net is a broadcast net, such as an ethernet or a
ringnet, every gateway may simply broadcast such a table from time to
time, and the host need do nothing but listen to obtain the required
information. Alternatively, the network may provide the mechanism of
logical addressing, by which a whole set of machines can be provided
with a single group address, to which a request can be sent for
assistance. Failing those two schemes, the host can build up its table
of neighbor gateways by remembering all the gateways from which it has
ever received a message. Finally, in certain cases, it may be necessary
for this table, or at least the initial entries in the table, to be
constructed manually by a manager or operator at the site. In cases
where the network in question provides absolutely no support for this
kind of host query, at least some manual intervention will be required
to get started, so that the host can find out about at least one
gateway.
4. Host Algorithms for Fault Isolation
We now return to the question raised above. What strategy should
the host use to detect that it is talking to a dead gateway, so that it
can know to switch to some other gateway in the list? In fact, there are
several algorithms which can be used. All are reasonably simple to
implement, but they have very different implications for the overhead on
the host, the gateway, and the network. Thus, to a certain extent, the
algorithm picked must depend on the details of the network and of the
host.
6
1. NETWORK LEVEL DETECTION
Many networks, particularly the Arpanet, perform precisely the
required function internal to the network. If a host sends a datagram
to a dead gateway on the Arpanet, the network will return a "host dead"
message, which is precisely the information the host needs to know in
order to switch to another gateway. Some early implementations of
Internet on the Arpanet threw these messages away. That is an
exceedingly poor idea.
2. CONTINUOUS POLLING
The ICMP protocol provides an echo mechanism by which a host may
solicit a response from a gateway. A host could simply send this
message at a reasonable rate, to assure itself continuously that the
gateway was still up. This works, but, since the message must be sent
fairly often to detect a fault in a reasonable time, it can imply an
unbearable overhead on the host itself, the network, and the gateway.
This strategy is prohibited except where a specific analysis has
indicated that the overhead is tolerable.
3. TRIGGERED POLLING
If the use of polling could be restricted to only those times when
something seemed to be wrong, then the overhead would be bearable.
Provided that one can get the proper advice from one's higher level
protocols, it is possible to implement such a strategy. For example,
one could program the TCP level so that whenever it retransmitted a
7
segment more than once, it sent a hint down to the IP layer which
triggered polling. This strategy does not have excessive overhead, but
does have the problem that the host may be somewhat slow to respond to
an error, since only after polling has started will the host be able to
confirm that something has gone wrong, and by then the TCP above may
have already timed out.
Both forms of polling suffer from a minor flaw. Hosts as well as
gateways respond to ICMP echo messages. Thus, polling cannot be used to
detect the error that a foreign address thought to be a gateway is
actually a host. Such a confusion can arise if the physical addresses
of machines are rearranged.
4. TRIGGERED RESELECTION
There is a strategy which makes use of a hint from a higher level,
as did the previous strategy, but which avoids polling altogether.
Whenever a higher level complains that the service seems to be
defective, the Internet layer can pick the next gateway from the list of
available gateways, and switch to it. Assuming that this gateway is up,
no real harm can come of this decision, even if it was wrong, for the
worst that will happen is a redirect message which instructs the host to
return to the gateway originally being used. If, on the other hand, the
original gateway was indeed down, then this immediately provides a new
route, so the period of time until recovery is shortened. This last
strategy seems particularly clever, and is probably the most generally
suitable for those cases where the network itself does not provide fault
isolation. (Regrettably, I have forgotten who suggested this idea to me.
It is not my invention.)
8
5. Higher Level Fault Detection
The previous discussion has concentrated on fault detection and
recovery at the IP layer. This section considers what the higher layers
such as TCP should do.
TCP has a single fault recovery action; it repeatedly retransmits a
segment until either it gets an acknowledgement or its connection timer
expires. As discussed above, it may use retransmission as an event to
trigger a request for fault recovery to the IP layer. In the other
direction, information may flow up from IP, reporting such things as
ICMP Destination Unreachable or error messages from the attached
network. The only subtle question about TCP and faults is what TCP
should do when such an error message arrives or its connection timer
expires.
The TCP specification discusses the timer. In the description of
the open call, the timeout is described as an optional value that the
client of TCP may specify; if any segment remains unacknowledged for
this period, TCP should abort the connection. The default for the
timeout is 30 seconds. Early TCPs were often implemented with a fixed
timeout interval, but this did not work well in practice, as the
following discussion may suggest.
Clients of TCP can be divided into two classes: those running on
immediate behalf of a human, such as Telnet, and those supporting a
program, such as a mail sender. Humans require a sophisticated response
to errors. Depending on exactly what went wrong, they may want to
9
abandon the connection at once, or wait for a long time to see if things
get better. Programs do not have this human impatience, but also lack
the power to make complex decisions based on details of the exact error
condition. For them, a simple timeout is reasonable.
Based on these considerations, at least two modes of operation are
needed in TCP. One, for programs, abandons the connection without
exception if the TCP timer expires. The other mode, suitable for
people, never abandons the connection on its own initiative, but reports
to the layer above when the timer expires. Thus, the human user can see
error messages coming from all the relevant layers, TCP and ICMP, and
can request TCP to abort as appropriate. This second mode requires that
TCP be able to send an asynchronous message up to its client to report
the timeout, and it requires that error messages arriving at lower
layers similarly flow up through TCP.
At levels above TCP, fault detection is also required. Either of
the following can happen. First, the foreign client of TCP can fail,
even though TCP is still running, so data is still acknowledged and the
timer never expires. Alternatively, the communication path can fail,
without the TCP timer going off, because the local client has no data to
send. Both of these have caused trouble.
Sending mail provides an example of the first case. When sending
mail using SMTP, there is an SMTP level acknowledgement that is returned
when a piece of mail is successfully delivered. Several early mail
receiving programs would crash just at the point where they had received
all of the mail text (so TCP did not detect a timeout due to outstanding
10
unacknowledged data) but before the mail was acknowledged at the SMTP
level. This failure would cause early mail senders to wait forever for
the SMTP level acknowledgement. The obvious cure was to set a timer at
the SMTP level, but the first attempt to do this did not work, for there
was no simple way to select the timer interval. If the interval
selected was short, it expired in normal operation when sending a
large file to a slow host. An interval of many minutes was needed to
prevent false timeouts, but that meant that failures were detected only
very slowly. The current solution in several mailers is to pick a
timeout interval proportional to the size of the message.
Server telnet provides an example of the other kind of failure. It
can easily happen that the communications link can fail while there is
no traffic flowing, perhaps because the user is thinking. Eventually,
the user will attempt to type something, at which time he will discover
that the connection is dead and abort it. But the host end of the
connection, having nothing to send, will not discover anything wrong,
and will remain waiting forever. In some systems there is no way for a
user in a different process to destroy or take over such a hanging
process, so there is no way to recover.
One solution to this would be to have the host server telnet query
the user end now and then, to see if it is still up. (Telnet does not
have an explicit query feature, but the host could negotiate some
unimportant option, which should produce either agreement or
disagreement in return.) The only problem with this is that a
reasonable sample interval, if applied to every user on a large system,
11
can generate an unacceptable amount of traffic and system overhead. A
smart server telnet would use this query only when something seems
wrong, perhaps when there had been no user activity for some time.
In both these cases, the general conclusion is that client level
error detection is needed, and that the details of the mechanism are
very dependent on the application. Application programmers must be made
aware of the problem of failures, and must understand that error
detection at the TCP or lower level cannot solve the whole problem for
them.
6. Knowing When to Give Up
It is not obvious, when error messages such as ICMP Destination
Unreachable arrive, whether TCP should abandon the connection. The
reason that error messages are difficult to interpret is that, as
discussed above, after a failure of a gateway or network, there is a
transient period during which the gateways may have incorrect
information, so that irrelevant or incorrect error messages may
sometimes return. An isolated ICMP Destination Unreachable may arrive
at a host, for example, if a packet is sent during the period when the
gateways are trying to find a new route. To abandon a TCP connection
based on such a message arriving would be to ignore the valuable feature
of the Internet that for many internal failures it reconstructs its
function without any disruption of the end points.
But if failure messages do not imply a failure, what are they for?
In fact, error messages serve several important purposes. First, if
12
they arrive in response to opening a new connection, they probably are
caused by opening the connection improperly (e.g., to a non-existent
address) rather than by a transient network failure. Second, they
provide valuable information, after the TCP timeout has occurred, as to
the probable cause of the failure. Finally, certain messages, such as
ICMP Parameter Problem, imply a possible implementation problem. In
general, error messages give valuable information about what went wrong,
but are not to be taken as absolutely reliable. A general alerting
mechanism, such as the TCP timeout discussed above, provides a good
indication that whatever is wrong is a serious condition, but without
the advisory messages to augment the timer, there is no way for the
client to know how to respond to the error. The combination of the
timer and the advice from the error messages provides a reasonable set of
facts for the client layer to have. It is important that error messages
from all layers be passed up to the client module in a useful and
consistent way.
-------

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,470 @@
Network Working Group David C. Plummer
Request For Comments: 826 (DCP@MIT-MC)
November 1982
An Ethernet Address Resolution Protocol
-- or --
Converting Network Protocol Addresses
to 48.bit Ethernet Address
for Transmission on
Ethernet Hardware
Abstract
The implementation of protocol P on a sending host S decides,
through protocol P's routing mechanism, that it wants to transmit
to a target host T located some place on a connected piece of
10Mbit Ethernet cable. To actually transmit the Ethernet packet
a 48.bit Ethernet address must be generated. The addresses of
hosts within protocol P are not always compatible with the
corresponding Ethernet address (being different lengths or
values). Presented here is a protocol that allows dynamic
distribution of the information needed to build tables to
translate an address A in protocol P's address space into a
48.bit Ethernet address.
Generalizations have been made which allow the protocol to be
used for non-10Mbit Ethernet hardware. Some packet radio
networks are examples of such hardware.
--------------------------------------------------------------------
The protocol proposed here is the result of a great deal of
discussion with several other people, most notably J. Noel
Chiappa, Yogen Dalal, and James E. Kulp, and helpful comments
from David Moon.
[The purpose of this RFC is to present a method of Converting
Protocol Addresses (e.g., IP addresses) to Local Network
Addresses (e.g., Ethernet addresses). This is an issue of general
concern in the ARPA Internet community at this time. The
method proposed here is presented for your consideration and
comment. This is not the specification of an Internet Standard.]
Notes:
------
This protocol was originally designed for the DEC/Intel/Xerox
10Mbit Ethernet. It has been generalized to allow it to be used
for other types of networks. Much of the discussion will be
directed toward the 10Mbit Ethernet. Generalizations, where
applicable, will follow the Ethernet-specific discussion.
DOD Internet Protocol will be referred to as Internet.
Numbers here are in the Ethernet standard, which is high byte
first. This is the opposite of the byte addressing of machines
such as PDP-11s and VAXes. Therefore, special care must be taken
with the opcode field (ar$op) described below.
An agreed upon authority is needed to manage hardware name space
values (see below). Until an official authority exists, requests
should be submitted to
David C. Plummer
Symbolics, Inc.
243 Vassar Street
Cambridge, Massachusetts 02139
Alternatively, network mail can be sent to DCP@MIT-MC.
The Problem:
------------
The world is a jungle in general, and the networking game
contributes many animals. At nearly every layer of a network
architecture there are several potential protocols that could be
used. For example, at a high level, there is TELNET and SUPDUP
for remote login. Somewhere below that there is a reliable byte
stream protocol, which might be CHAOS protocol, DOD TCP, Xerox
BSP or DECnet. Even closer to the hardware is the logical
transport layer, which might be CHAOS, DOD Internet, Xerox PUP,
or DECnet. The 10Mbit Ethernet allows all of these protocols
(and more) to coexist on a single cable by means of a type field
in the Ethernet packet header. However, the 10Mbit Ethernet
requires 48.bit addresses on the physical cable, yet most
protocol addresses are not 48.bits long, nor do they necessarily
have any relationship to the 48.bit Ethernet address of the
hardware. For example, CHAOS addresses are 16.bits, DOD Internet
addresses are 32.bits, and Xerox PUP addresses are 8.bits. A
protocol is needed to dynamically distribute the correspondences
between a <protocol, address> pair and a 48.bit Ethernet address.
Motivation:
-----------
Use of the 10Mbit Ethernet is increasing as more manufacturers
supply interfaces that conform to the specification published by
DEC, Intel and Xerox. With this increasing availability, more
and more software is being written for these interfaces. There
are two alternatives: (1) Every implementor invents his/her own
method to do some form of address resolution, or (2) every
implementor uses a standard so that his/her code can be
distributed to other systems without need for modification. This
proposal attempts to set the standard.
Definitions:
------------
Define the following for referring to the values put in the TYPE
field of the Ethernet packet header:
ether_type$XEROX_PUP,
ether_type$DOD_INTERNET,
ether_type$CHAOS,
and a new one:
ether_type$ADDRESS_RESOLUTION.
Also define the following values (to be discussed later):
ares_op$REQUEST (= 1, high byte transmitted first) and
ares_op$REPLY (= 2),
and
ares_hrd$Ethernet (= 1).
Packet format:
--------------
To communicate mappings from <protocol, address> pairs to 48.bit
Ethernet addresses, a packet format that embodies the Address
Resolution protocol is needed. The format of the packet follows.
Ethernet transmission layer (not necessarily accessible to
the user):
48.bit: Ethernet address of destination
48.bit: Ethernet address of sender
16.bit: Protocol type = ether_type$ADDRESS_RESOLUTION
Ethernet packet data:
16.bit: (ar$hrd) Hardware address space (e.g., Ethernet,
Packet Radio Net.)
16.bit: (ar$pro) Protocol address space. For Ethernet
hardware, this is from the set of type
fields ether_type$<protocol>.
8.bit: (ar$hln) byte length of each hardware address
8.bit: (ar$pln) byte length of each protocol address
16.bit: (ar$op) opcode (ares_op$REQUEST | ares_op$REPLY)
nbytes: (ar$sha) Hardware address of sender of this
packet, n from the ar$hln field.
mbytes: (ar$spa) Protocol address of sender of this
packet, m from the ar$pln field.
nbytes: (ar$tha) Hardware address of target of this
packet (if known).
mbytes: (ar$tpa) Protocol address of target.
Packet Generation:
------------------
As a packet is sent down through the network layers, routing
determines the protocol address of the next hop for the packet
and on which piece of hardware it expects to find the station
with the immediate target protocol address. In the case of the
10Mbit Ethernet, address resolution is needed and some lower
layer (probably the hardware driver) must consult the Address
Resolution module (perhaps implemented in the Ethernet support
module) to convert the <protocol type, target protocol address>
pair to a 48.bit Ethernet address. The Address Resolution module
tries to find this pair in a table. If it finds the pair, it
gives the corresponding 48.bit Ethernet address back to the
caller (hardware driver) which then transmits the packet. If it
does not, it probably informs the caller that it is throwing the
packet away (on the assumption the packet will be retransmitted
by a higher network layer), and generates an Ethernet packet with
a type field of ether_type$ADDRESS_RESOLUTION. The Address
Resolution module then sets the ar$hrd field to
ares_hrd$Ethernet, ar$pro to the protocol type that is being
resolved, ar$hln to 6 (the number of bytes in a 48.bit Ethernet
address), ar$pln to the length of an address in that protocol,
ar$op to ares_op$REQUEST, ar$sha with the 48.bit ethernet address
of itself, ar$spa with the protocol address of itself, and ar$tpa
with the protocol address of the machine that is trying to be
accessed. It does not set ar$tha to anything in particular,
because it is this value that it is trying to determine. It
could set ar$tha to the broadcast address for the hardware (all
ones in the case of the 10Mbit Ethernet) if that makes it
convenient for some aspect of the implementation. It then causes
this packet to be broadcast to all stations on the Ethernet cable
originally determined by the routing mechanism.
Packet Reception:
-----------------
When an address resolution packet is received, the receiving
Ethernet module gives the packet to the Address Resolution module
which goes through an algorithm similar to the following.
Negative conditionals indicate an end of processing and a
discarding of the packet.
?Do I have the hardware type in ar$hrd?
Yes: (almost definitely)
[optionally check the hardware length ar$hln]
?Do I speak the protocol in ar$pro?
Yes:
[optionally check the protocol length ar$pln]
Merge_flag := false
If the pair <protocol type, sender protocol address> is
already in my translation table, update the sender
hardware address field of the entry with the new
information in the packet and set Merge_flag to true.
?Am I the target protocol address?
Yes:
If Merge_flag is false, add the triplet <protocol type,
sender protocol address, sender hardware address> to
the translation table.
?Is the opcode ares_op$REQUEST? (NOW look at the opcode!!)
Yes:
Swap hardware and protocol fields, putting the local
hardware and protocol addresses in the sender fields.
Set the ar$op field to ares_op$REPLY
Send the packet to the (new) target hardware address on
the same hardware on which the request was received.
Notice that the <protocol type, sender protocol address, sender
hardware address> triplet is merged into the table before the
opcode is looked at. This is on the assumption that communication
is bidirectional; if A has some reason to talk to B, then B will
probably have some reason to talk to A. Notice also that if an
entry already exists for the <protocol type, sender protocol
address> pair, then the new hardware address supersedes the old
one. Related Issues gives some motivation for this.
Generalization: The ar$hrd and ar$hln fields allow this protocol
and packet format to be used for non-10Mbit Ethernets. For the
10Mbit Ethernet <ar$hrd, ar$hln> takes on the value <1, 6>. For
other hardware networks, the ar$pro field may no longer
correspond to the Ethernet type field, but it should be
associated with the protocol whose address resolution is being
sought.
Why is it done this way??
-------------------------
Periodic broadcasting is definitely not desired. Imagine 100
workstations on a single Ethernet, each broadcasting address
resolution information once per 10 minutes (as one possible set
of parameters). This is one packet every 6 seconds. This is
almost reasonable, but what use is it? The workstations aren't
generally going to be talking to each other (and therefore have
100 useless entries in a table); they will be mainly talking to a
mainframe, file server or bridge, but only to a small number of
other workstations (for interactive conversations, for example).
The protocol described in this paper distributes information as
it is needed, and only once (probably) per boot of a machine.
This format does not allow for more than one resolution to be
done in the same packet. This is for simplicity. If things were
multiplexed the packet format would be considerably harder to
digest, and much of the information could be gratuitous. Think
of a bridge that talks four protocols telling a workstation all
four protocol addresses, three of which the workstation will
probably never use.
This format allows the packet buffer to be reused if a reply is
generated; a reply has the same length as a request, and several
of the fields are the same.
The value of the hardware field (ar$hrd) is taken from a list for
this purpose. Currently the only defined value is for the 10Mbit
Ethernet (ares_hrd$Ethernet = 1). There has been talk of using
this protocol for Packet Radio Networks as well, and this will
require another value as will other future hardware mediums that
wish to use this protocol.
For the 10Mbit Ethernet, the value in the protocol field (ar$pro)
is taken from the set ether_type$. This is a natural reuse of
the assigned protocol types. Combining this with the opcode
(ar$op) would effectively halve the number of protocols that can
be resolved under this protocol and would make a monitor/debugger
more complex (see Network Monitoring and Debugging below). It is
hoped that we will never see 32768 protocols, but Murphy made
some laws which don't allow us to make this assumption.
In theory, the length fields (ar$hln and ar$pln) are redundant,
since the length of a protocol address should be determined by
the hardware type (found in ar$hrd) and the protocol type (found
in ar$pro). It is included for optional consistency checking,
and for network monitoring and debugging (see below).
The opcode is to determine if this is a request (which may cause
a reply) or a reply to a previous request. 16 bits for this is
overkill, but a flag (field) is needed.
The sender hardware address and sender protocol address are
absolutely necessary. It is these fields that get put in a
translation table.
The target protocol address is necessary in the request form of
the packet so that a machine can determine whether or not to
enter the sender information in a table or to send a reply. It
is not necessarily needed in the reply form if one assumes a
reply is only provoked by a request. It is included for
completeness, network monitoring, and to simplify the suggested
processing algorithm described above (which does not look at the
opcode until AFTER putting the sender information in a table).
The target hardware address is included for completeness and
network monitoring. It has no meaning in the request form, since
it is this number that the machine is requesting. Its meaning in
the reply form is the address of the machine making the request.
In some implementations (which do not get to look at the 14.byte
ethernet header, for example) this may save some register
shuffling or stack space by sending this field to the hardware
driver as the hardware destination address of the packet.
There are no padding bytes between addresses. The packet data
should be viewed as a byte stream in which only 3 byte pairs are
defined to be words (ar$hrd, ar$pro and ar$op) which are sent
most significant byte first (Ethernet/PDP-10 byte style).
Network monitoring and debugging:
---------------------------------
The above Address Resolution protocol allows a machine to gain
knowledge about the higher level protocol activity (e.g., CHAOS,
Internet, PUP, DECnet) on an Ethernet cable. It can determine
which Ethernet protocol type fields are in use (by value) and the
protocol addresses within each protocol type. In fact, it is not
necessary for the monitor to speak any of the higher level
protocols involved. It goes something like this:
When a monitor receives an Address Resolution packet, it always
enters the <protocol type, sender protocol address, sender
hardware address> in a table. It can determine the length of the
hardware and protocol address from the ar$hln and ar$pln fields
of the packet. If the opcode is a REPLY the monitor can then
throw the packet away. If the opcode is a REQUEST and the target
protocol address matches the protocol address of the monitor, the
monitor sends a REPLY as it normally would. The monitor will
only get one mapping this way, since the REPLY to the REQUEST
will be sent directly to the requesting host. The monitor could
try sending its own REQUEST, but this could get two monitors into
a REQUEST sending loop, and care must be taken.
Because the protocol and opcode are not combined into one field,
the monitor does not need to know which request opcode is
associated with which reply opcode for the same higher level
protocol. The length fields should also give enough information
to enable it to "parse" a protocol address, although it has no
knowledge of what the protocol addresses mean.
A working implementation of the Address Resolution protocol can
also be used to debug a non-working implementation. Presumably a
hardware driver will successfully broadcast a packet with Ethernet
type field of ether_type$ADDRESS_RESOLUTION. The format of the
packet may not be totally correct, because initial
implementations may have bugs, and table management may be
slightly tricky. Because requests are broadcast a monitor will
receive the packet and can display it for debugging if desired.
An Example:
-----------
Let there exist machines X and Y that are on the same 10Mbit
Ethernet cable. They have Ethernet address EA(X) and EA(Y) and
DOD Internet addresses IPA(X) and IPA(Y) . Let the Ethernet type
of Internet be ET(IP). Machine X has just been started, and
sooner or later wants to send an Internet packet to machine Y on
the same cable. X knows that it wants to send to IPA(Y) and
tells the hardware driver (here an Ethernet driver) IPA(Y). The
driver consults the Address Resolution module to convert <ET(IP),
IPA(Y)> into a 48.bit Ethernet address, but because X was just
started, it does not have this information. It throws the
Internet packet away and instead creates an ADDRESS RESOLUTION
packet with
(ar$hrd) = ares_hrd$Ethernet
(ar$pro) = ET(IP)
(ar$hln) = length(EA(X))
(ar$pln) = length(IPA(X))
(ar$op) = ares_op$REQUEST
(ar$sha) = EA(X)
(ar$spa) = IPA(X)
(ar$tha) = don't care
(ar$tpa) = IPA(Y)
and broadcasts this packet to everybody on the cable.
Machine Y gets this packet, and determines that it understands
the hardware type (Ethernet), that it speaks the indicated
protocol (Internet) and that the packet is for it
((ar$tpa)=IPA(Y)). It enters (probably replacing any existing
entry) the information that <ET(IP), IPA(X)> maps to EA(X). It
then notices that it is a request, so it swaps fields, putting
EA(Y) in the new sender Ethernet address field (ar$sha), sets the
opcode to reply, and sends the packet directly (not broadcast) to
EA(X). At this point Y knows how to send to X, but X still
doesn't know how to send to Y.
Machine X gets the reply packet from Y, forms the map from
<ET(IP), IPA(Y)> to EA(Y), notices the packet is a reply and
throws it away. The next time X's Internet module tries to send
a packet to Y on the Ethernet, the translation will succeed, and
the packet will (hopefully) arrive. If Y's Internet module then
wants to talk to X, this will also succeed since Y has remembered
the information from X's request for Address Resolution.
Related issue:
---------------
It may be desirable to have table aging and/or timeouts. The
implementation of these is outside the scope of this protocol.
Here is a more detailed description (thanks to MOON@SCRC@MIT-MC).
If a host moves, any connections initiated by that host will
work, assuming its own address resolution table is cleared when
it moves. However, connections initiated to it by other hosts
will have no particular reason to know to discard their old
address. However, 48.bit Ethernet addresses are supposed to be
unique and fixed for all time, so they shouldn't change. A host
could "move" if a host name (and address in some other protocol)
were reassigned to a different physical piece of hardware. Also,
as we know from experience, there is always the danger of
incorrect routing information accidentally getting transmitted
through hardware or software error; it should not be allowed to
persist forever. Perhaps failure to initiate a connection should
inform the Address Resolution module to delete the information on
the basis that the host is not reachable, possibly because it is
down or the old translation is no longer valid. Or perhaps
receiving of a packet from a host should reset a timeout in the
address resolution entry used for transmitting packets to that
host; if no packets are received from a host for a suitable
length of time, the address resolution entry is forgotten. This
may cause extra overhead to scan the table for each incoming
packet. Perhaps a hash or index can make this faster.
The suggested algorithm for receiving address resolution packets
tries to lessen the time it takes for recovery if a host does
move. Recall that if the <protocol type, sender protocol
address> is already in the translation table, then the sender
hardware address supersedes the existing entry. Therefore, on a
perfect Ethernet where a broadcast REQUEST reaches all stations
on the cable, each station will get the new hardware address.
Another alternative is to have a daemon perform the timeouts.
After a suitable time, the daemon considers removing an entry.
It first sends (with a small number of retransmissions if needed)
an address resolution packet with opcode REQUEST directly to the
Ethernet address in the table. If a REPLY is not seen in a short
amount of time, the entry is deleted. The request is sent
directly so as not to bother every station on the Ethernet. Just
forgetting entries will likely cause useful information to be
forgotten, which must be regained.
Since hosts don't transmit information about anyone other than
themselves, rebooting a host will cause its address mapping table
to be up to date. Bad information can't persist forever by being
passed around from machine to machine; the only bad information
that can exist is in a machine that doesn't know that some other
machine has changed its 48.bit Ethernet address. Perhaps
manually resetting (or clearing) the address mapping table will
suffice.
This issue clearly needs more thought if it is believed to be
important. It is caused by any address resolution-like protocol.

View File

@ -0,0 +1,549 @@
RFC 872 September 1982
M82-48
TCP-ON-A-LAN
M.A. PADLIPSKY
THE MITRE CORPORATION
Bedford, Massachusetts
Abstract
The sometimes-held position that the DoD Standard
Transmission Control Protocol (TCP) and Internet Protocol (IP)
are inappropriate for use "on" a Local Area Network (LAN) is
shown to be fallacious. The paper is a companion piece to
M82-47, M82-49, M82-50, and M82-51.
i
"TCP-ON-A-LAN"
M. A. Padlipsky
Thesis
It is the thesis of this paper that fearing "TCP-on-a-LAN"
is a Woozle which needs slaying. To slay the "TCP-on-a-LAN"
Woozle, we need to know three things: What's a Woozle? What's a
LAN? What's a TCP?
Woozles
The first is rather straightforward [1]:
One fine winter's day when Piglet was brushing away the
snow in front of his house, he happened to look up, and
there was Winnie-the-Pooh. Pooh was walking round and round
in a circle, thinking of something else, and when Piglet
called to him, he just went on walking.
"Hallo!" said Piglet, "what are you doing?"
"Hunting," said Pooh.
"Hunting what?"
"Tracking something," said Winnie-the-Pooh very
mysteriously.
"Tracking what?" said Piglet, coming closer.
"That's just what I ask myself. I ask myself, What?"
"What do you think you'll answer?"
"I shall have to wait until I catch up with it," said
Winnie-the-Pooh. "Now look there." He pointed to the
ground in front of him. "What do you see there?"
"Tracks," said Piglet, "Paw-marks." He gave a little
squeak of excitement. "Oh, Pooh! Do you think it's a--a--a
Woozle?"
Well, they convince each other that it is a Woozle, keep
"tracking," convince each other that it's a herd of Hostile
Animals, and get duly terrified before Christopher Robin comes
along and points out that they were following their own tracks
all along.
In other words, it is our contention that expressed fears
about the consequences of using a particular protocol named "TCP"
in a particular environment called a Local Area Net stem from
misunderstandings of the protocol and the environment, not from
the technical facts of the situation.
1
RFC 872 September 1982
LAN's
The second thing we need to know is somewhat less
straightforward: A LAN is, properly speaking [2], a
communications mechanism (or subnetwork) employing a transmission
technology suitable for relatively short distances (typically a
few kilometers) at relatively high bit-per-second rates
(typically greater than a few hundred kilobits per second) with
relatively low error rates, which exists primarily to enable
suitably attached computer systems (or "Hosts") to exchange bits,
and secondarily, though not necessarily, to allow terminals of
the teletypewriter and CRT classes to exchange bits with Hosts.
The Hosts are, at least in principle, heterogeneous; that is,
they are not merely multiple instances of the same operating
system. The Hosts are assumed to communicate by means of layered
protocols in order to achieve what the ARPANET tradition calls
"resource sharing" and what the newer ISO tradition calls "Open
System Interconnection." Addressing typically can be either
Host-Host (point-to-point) or "broadcast." (In some environments,
e.g., Ethernet, interesting advantage can be taken of broadcast
addressing; in other environments, e.g., LAN's which are
constituents of ARPA- or ISO-style "internets", broadcast
addressing is deemed too expensive to implement throughout the
internet as a whole and so may be ignored in the constituent LAN
even if available as part of the Host-LAN interface.)
Note that no assumptions are made about the particular
transmission medium or the particular topology in play. LAN
media can be twisted-pair wires, CATV or other coaxial-type
cables, optical fibers, or whatever. However, if the medium is a
processor-to-processor bus it is likely that the system in
question is going to turn out to "be" a moderately closely
coupled distributed processor or a somewhat loosely coupled
multiprocessor rather than a LAN, because the processors are
unlikely to be using either ARPANET or ISO-style layered
protocols. (They'll usually -- either be homogeneous processors
interpreting only the protocol necessary to use the transmission
medium, or heterogeneous with one emulating the expectations of
the other.) Systems like "PDSC" or "NMIC" (the evolutionarily
related, bus-oriented, multiple PDP-11 systems in use at the
Pacific Data Services Center and the National Military
Intelligence Center, respectively), then, aren't LANs.
LAN topologies can be either "bus," "ring," or "star". That
is, a digital PBX can be a LAN, in the sense of furnishing a
transmission medium/communications subnetwork for Hosts to do
resource sharing/Open System Interconnection over, though it
might not present attractive speed or failure mode properties.
(It might, though.) Topologically, it would probably be a
neutron star.
2
RFC 872 September 1982
For our purposes, the significant properties of a LAN are
the high bit transmission capacity and the good error properties.
Intuitively, a medium with these properties in some sense
"shouldn't require a heavy-duty protocol designed for long-haul
nets," according to some. (We will not address the issue of
"wasted bandwidth" due to header sizes. [2], pp. 1509f, provides
ample refutation of that traditional communications notion.)
However, it must be borne in mind that for our purposes the
assumption of resource-sharing/OSI type protocols between/among
the attached Hosts is also extremely significant. That is, if
all you're doing is letting some terminals access some different
Hosts, but the Hosts don't really have any intercomputer
networking protocols between them, what you have should be viewed
as a Localized Communications Network (LCN), not a LAN in the
sense we're talking about here.
TCP
The third thing we have to know can be either
straightforward or subtle, depending largely on how aware we are
of the context established by ARPANET-style protocols: For the
visual-minded, Figure 1 and Figure 2 might be all that need be
"said." Their moral is meant to be that in ARPANET-style
layering, layers aren't monoliths. For those who need more
explanation, here goes: TCP [3] (we'll take IP later) is a
Host-Host protocol (roughly equivalent to the functionality
implied by some of ISO Level 5 and all of ISO Level 4). Its most
significant property is that it presents reliable logical
connections to protocols above itself. (This point will be
returned to subsequently.) Its next most significant property is
that it is designed to operate in a "catenet" (also known as the,
or an, "internet"); that is, its addressing discipline is such
that Hosts attached to communications subnets other than the one
a given Host is attached to (the "proximate net") can be
communicated with as well as Hosts on the proximate net. Other
significant properties are those common to the breed: Host-Host
protocols (and Transport protocols) "all" offer mechanisms for
flow Control, Out-of-Band Signals, Logical Connection management,
and the like.
Because TCP has a catenet-oriented addressing mechanism
(that is, it expresses foreign Host addresses as the
"two-dimensional" entity Foreign Net/Foreign Host because it
cannot assume that the Foreign Host is attached to the proximate
net), to be a full Host-Host protocol it needs an adjunct to deal
with the proximate net. This adjunct, the Internet Protocol (IP)
was designed as a separate protocol from TCP, however, in order
to allow it to play the same role it plays for TCP for other
Host-Host protocols too.
3
RFC 872 September 1982
In order to "deal with the proximate net", IP possesses the
following significant properties: An IP implementation maps from
a virtualization (or common intermediate representation) of
generic proximate net qualities (such as precedence, grade of
service, security labeling) to the closest equivalent on the
proximate net. It determines whether the "Internet Address" of a
given transmission is on the proximate net or not; if so, it
sends it; if not, it sends it to a "Gateway" (where another IP
module resides). That is, IP handles internet routing, whereas
TCP (or some other Host-Host protocol) handles only internet
addressing. Because some proximate nets will accept smaller
transmissions ("packets") than others, IP, qua protocol, also has
a discipline for allowing packets to be fragmented while in the
catenet and reassembled at their destination. Finally (for our
purposes), IP offers a mechanism to allow the particular protocol
it was called by (for a given packet) to be identified so that
the receiver can demultiplex transmissions based on IP-level
information only. (This is in accordance with the Principle of
Layering: you don't want to have to look at the data IP is
conveying to find out what to do with it.)
Now that all seems rather complex, even though it omits a
number of mechanisms. (For a more complete discussion, see
Reference [4].) But it should be just about enough to slay the
Woozle, especially if just one more protocol's most significant
property can be snuck in. An underpublicized member of the
ARPANET suite of protocols is called UDP--the "User Datagram
Protocol." UDP is designed for speed rather than accuracy. That
is, it's not "reliable." All there is to UDP, basically, is a
mechanism to allow a given packet to be associated with a given
logical connection. Not a TCP logical connection, mind you, but a
UDP logical connection. So if all you want is the ability to
demultiplex data streams from your Host-Host protocol, you use
UDP, not TCP. ("You" is usually supposed to be a Packetized
Speech protocol, but doesn't have to be.) (And we'll worry about
Flow Control some other time.)
TCP-on-a-LAN
So whether you're a Host proximate to a LAN or not, and even
whether your TCP/IP is "inboard" or "outboard" of you, if you're
talking to a Host somewhere out there on the catenet, you use IP;
and if you're exercising some process-level/applications protocol
(roughly equivalent to some of some versions of ISO L5 and all of
L6 and L7) that expects TCP/IP as its Host-Host protocol (because
it "wants" reliable, flow controlled, ordered delivery [whoops,
forgot that "ordered" property earlier--but it doesn't matter all
that much for present purposes] over logical connections which
allow it to be
4
RFC 872 September 1982
addressed via a Well-Known Socket), you use TCP "above" IP
regardless of whether the other Host is on your proximate net or
not. But if your application doesn't require the properties of
TCP (say for Packetized Speech), don't use it--regardless of
where or what you are. And if you want to make the decision
about whether you're talking to a proximate Host explicitly and
not even go through IP, you can even arrange to do that (though
it might make for messy implementation under some circumstances).
That is, if you want to take advantage of the properties of your
LAN "in the raw" and have or don't need appropriate applications
protocols, the Reference Model to which TCP/IP were designed
won't stop you. See Figure 2 if you're visual. A word of
caution, though: those applications probably will need protocols
of some sort--and they'll probably need some sort of Host-Host
protocol under them, so unless you relish maintaining "parallel"
suites of protocols.... that is, you really would be better off
with TCP most of the time locally anyway, because you've got to
have it to talk to the catenet and it's a nuisance to have
"something else" to talk over the LAN--when, of course, what
you're talking requires a Host-Host protocol.
We'll touch on "performance" issues in a bit more detail
later. At this level, though, one point really does need to be
made: On the "reliability" front, many (including the author) at
first blush take the TCP checksum to be "overkill" for use on a
LAN, which does, after all, typically present extremely good
error properties. Interestingly enough, however, metering of TCP
implementations on several Host types in the research community
shows that the processing time expended on the TCP checksum is
only around 12% of the per-transmission processing time anyway.
So, again, it's not clear that it's worthwhile to bother with an
alternate Host-Host protocol for local use (if, that is, you need
the rest of the properties of TCP other than "reliability"--and,
of course, always assuming you've got a LAN, not an LCN, as
distinguished earlier.)
Take that, Woozle!
Other Significant Properties
Oh, by the way, one or two other properties of TCP/IP really
do bear mention:
1. Protocol interpreters for TCP/IP exist for a dozen or
two different operating systems.
2. TCP/IP work, and have been working (though in less
refined versions) for several years.
5
RFC 872 September 1982
3. IP levies no constraints on the interface protocol
presented by the proximate net (though some protocols
at that level are more wasteful than others).
4. IP levies no constraints on its users; in particular,
any proximate net that offers alternate routing can be
taken advantage of (unlike X.25, which appears to
preclude alternate routing).
5. IP-bearing Gateways both exist and present and exploit
properties 3 and 4.
6. TCP/IP are Department of Defense Standards.
7. Process (or application) protocols compatible with
TCP/IP for Virtual Terminal and File Transfer
(including "electronic mail") exist and have been
implemented on numerous operating systems.
8. "Vendor-style" specifications of TCP/IP are being
prepared under the aegis of the DoD Protocol Standards
Technical Panel, for those who find the
research-community-provided specs not to their liking.
9. The research community has recently reported speeds in
excess of 300 kb/s on an 800 kb/s subnet, 1.2 Mb/s on a
3 Mb/s subnet, and 9.2 kb/s on a 9.6 kb/s phone
line--all using TCP. (We don't know of any numbers for
alternative protocol suites, but it's unlikely they'd
be appreciably better if they confer like
functionality--and they may well be worse if they
represent implementations which haven't been around
enough to have been iterated a time or three.)
With the partial exception of property 8, no other
resource-sharing protocol suite can make those claims.
Note particularly well that none of the above should be
construed as eliminating the need for extremely careful
measurement of TCP/IP performance in/on a LAN. (You do, after
all, want to know their limitations, to guide you in when to
bother ringing in "local" alternatives--but be very careful: 1.
they're hard to measure commensurately with alternative
protocols; and 2. most conventional Hosts can't take [or give]
as many bits per second as you might imagine.) It merely
dramatically refocuses the motivation for doing such measurement.
(And levies a constraint or two on how you outboard, if you're
outboarding.)
6
RFC 872 September 1982
Other Contextual Data
Our case could really rest here, but some amplification of
the aside above about Host capacities is warranted, if only to
suggest that some quantification is available to supplement the a
priori argument: Consider the previously mentioned PDSC. Its
local terminals operate in a screen-at-a-time mode, each
screen-load comprising some 16 kb. How many screens can one of
its Hosts handle in a given second? Well, we're told that each
disk fetch requires 17 ms average latency, and each context
switch costs around 2 ms, so allowing 1 ms for transmission of
the data from the disk and to the "net" (it makes the arithmetic
easy), that would add up to 20 ms "processing" time per screen,
even if no processing were done to the disk image. Thus, even if
the Host were doing nothing else, and even if the native disk
I/O software were optimized to do 16 kb reads, it could only
present 50 screens to its communications mechanism
(processor-processor bus) per second. That's 800 kb/s. And
that's well within the range of TCP-achievable rates (cf. Other
Significant Property 9). So in a realistic sample environment,
it would certainly seem that typical Hosts can't necessarily
present so many bits as to overtax the protocols anyway. (The
analysis of how many bits typical Hosts can accept is more
difficult because it depends more heavily on system internals.
However, the point is nearly moot in that even in the intuitively
unlikely event that receiving were appreciably faster in
principle [unlikely because of typical operating system
constraints on address space sizes, the need to do input to a
single address space, and the need to share buffers in the
address space among several processes], you can't accept more
than you can be given.)
Conclusion
The sometimes-expressed fear that using TCP on a local net
is a bad idea is unfounded.
References
[1] Milne, A. A., "Winnie-the-Pooh", various publishers.
[2] The LAN description is based on Clark, D. D. et al., "An
Introduction to Local Area Networks," IEEE Proc., V. 66, N.
11, November 1978, pp. 1497-1517, several years' worth of
conversations with Dr. Clark, and the author's observations
of both the open literature and the Oral Tradition (which
were sufficiently well-thought of to have prompted The MITRE
Corporation/NBS/NSA Local Nets "Brain Picking Panel" to have
7
RFC 872 September 1982
solicited his testimony during the year he was in FACC's
employ.*)
[3] The TCP/IP descriptions are based on Postel, J. B.,
"Internet Protocol Specification," and "Transmission Control
Specification" in DARPA Internet Program Protocol
Specifications, USC Information Sciences Institute,
September, 1981, and on more than 10 years' worth of
conversations with Dr. Postel, Dr. Clark (now the DARPA
"Internet Architect") and Dr. Vinton G. Cerf (co-originator
of TCP), and on numerous discussions with several other
members of the TCP/IP design team, on having edited the
referenced documents for the PSTP, and, for that matter, on
having been one of the developers of the ARPANET "Reference
Model."
[4] Padlipsky, M. A., "A Perspective on the ARPANET Reference
Model", M82-47, The MITRE Corporation, September 1982; also
available in Proc. INFOCOM '83.
________________
* In all honesty, as far as I know I started the rumor that TCP
might be overkill for a LAN at that meeting. At the next TCP
design meeting, however, they separated IP out from TCP, and
everything's been alright for about three years now--except
for getting the rumor killed. (I'd worry about Woozles
turning into roosting chickens if it weren't for the facts
that: 1. People tend to ignore their local guru; 2. I was
trying to encourage the IP separation; and 3. All I ever
wanted was some empirical data.)
NOTE: FIGURE 1. ARM in the Abstract, and FIGURE 2. ARMS,
Somewhat Particularized, may be obtained by writing to: Mike
Padlipsky, MITRE Corporation, P.O. Box 208, Bedford,
Massachusetts, 01730, or sending computer mail to
Padlipsky@USC-ISIA.
8

View File

@ -0,0 +1,638 @@
Network Working Group J. Postel
Request for Comments: 879 ISI
November 1983
The TCP Maximum Segment Size
and Related Topics
This memo discusses the TCP Maximum Segment Size Option and related
topics. The purpose is to clarify some aspects of TCP and its
interaction with IP. This memo is a clarification to the TCP
specification, and contains information that may be considered as
"advice to implementers".
1. Introduction
This memo discusses the TCP Maximum Segment Size and its relation to
the IP Maximum Datagram Size. TCP is specified in reference [1]. IP
is specified in references [2,3].
This discussion is necessary because the current specification of
this TCP option is ambiguous.
Much of the difficulty with understanding these sizes and their
relationship has been due to the variable size of the IP and TCP
headers.
There have been some assumptions made about using other than the
default size for datagrams with some unfortunate results.
HOSTS MUST NOT SEND DATAGRAMS LARGER THAN 576 OCTETS UNLESS THEY
HAVE SPECIFIC KNOWLEDGE THAT THE DESTINATION HOST IS PREPARED TO
ACCEPT LARGER DATAGRAMS.
This is a long established rule.
To resolve the ambiguity in the TCP Maximum Segment Size option
definition the following rule is established:
THE TCP MAXIMUM SEGMENT SIZE IS THE IP MAXIMUM DATAGRAM SIZE MINUS
FORTY.
The default IP Maximum Datagram Size is 576.
The default TCP Maximum Segment Size is 536.
Postel [Page 1]
RFC 879 November 1983
TCP Maximum Segment Size
2. The IP Maximum Datagram Size
Hosts are not required to reassemble infinitely large IP datagrams.
The maximum size datagram that all hosts are required to accept or
reassemble from fragments is 576 octets. The maximum size reassembly
buffer every host must have is 576 octets. Hosts are allowed to
accept larger datagrams and assemble fragments into larger datagrams,
hosts may have buffers as large as they please.
Hosts must not send datagrams larger than 576 octets unless they have
specific knowledge that the destination host is prepared to accept
larger datagrams.
3. The TCP Maximum Segment Size Option
TCP provides an option that may be used at the time a connection is
established (only) to indicate the maximum size TCP segment that can
be accepted on that connection. This Maximum Segment Size (MSS)
announcement (often mistakenly called a negotiation) is sent from the
data receiver to the data sender and says "I can accept TCP segments
up to size X". The size (X) may be larger or smaller than the
default. The MSS can be used completely independently in each
direction of data flow. The result may be quite different maximum
sizes in the two directions.
The MSS counts only data octets in the segment, it does not count the
TCP header or the IP header.
A footnote: The MSS value counts only data octets, thus it does not
count the TCP SYN and FIN control bits even though SYN and FIN do
consume TCP sequence numbers.
4. The Relationship of TCP Segments and IP Datagrams
TCP segments are transmitted as the data in IP datagrams. The
correspondence between TCP segments and IP datagrams must be one to
one. This is because TCP expects to find exactly one complete TCP
segment in each block of data turned over to it by IP, and IP must
turn over a block of data for each datagram received (or completely
reassembled).
Postel [Page 2]
RFC 879 November 1983
TCP Maximum Segment Size
5. Layering and Modularity
TCP is an end to end reliable data stream protocol with error
control, flow control, etc. TCP remembers many things about the
state of a connection.
IP is a one shot datagram protocol. IP has no memory of the
datagrams transmitted. It is not appropriate for IP to keep any
information about the maximum datagram size a particular destination
host might be capable of accepting.
TCP and IP are distinct layers in the protocol architecture, and are
often implemented in distinct program modules.
Some people seem to think that there must be no communication between
protocol layers or program modules. There must be communication
between layers and modules, but it should be carefully specified and
controlled. One problem in understanding the correct view of
communication between protocol layers or program modules in general,
or between TCP and IP in particular is that the documents on
protocols are not very clear about it. This is often because the
documents are about the protocol exchanges between machines, not the
program architecture within a machine, and the desire to allow many
program architectures with different organization of tasks into
modules.
6. IP Information Requirements
There is no general requirement that IP keep information on a per
host basis.
IP must make a decision about which directly attached network address
to send each datagram to. This is simply mapping an IP address into
a directly attached network address.
There are two cases to consider: the destination is on the same
network, and the destination is on a different network.
Same Network
For some networks the directly attached network address can
be computed from the IP address for destination hosts on the
directly attached network.
For other networks the mapping must be done by table look up
(however the table is initialized and maintained, for
example, [4]).
Postel [Page 3]
RFC 879 November 1983
TCP Maximum Segment Size
Different Network
The IP address must be mapped to the directly attached network
address of a gateway. For networks with one gateway to the
rest of the Internet the host need only determine and remember
the gateway address and use it for sending all datagrams to
other networks.
For networks with multiple gateways to the rest of the
Internet, the host must decide which gateway to use for each
datagram sent. It need only check the destination network of
the IP address and keep information on which gateway to use for
each network.
The IP does, in some cases, keep per host routing information for
other hosts on the directly attached network. The IP does, in some
cases, keep per network routing information.
A Special Case
There are two ICMP messages that convey information about
particular hosts. These are subtypes of the Destination
Unreachable and the Redirect ICMP messages. These messages are
expected only in very unusual circumstances. To make effective
use of these messages the receiving host would have to keep
information about the specific hosts reported on. Because these
messages are quite rare it is strongly recommended that this be
done through an exception mechanism rather than having the IP keep
per host tables for all hosts.
7. The Relationship between IP Datagram and TCP Segment Sizes
The relationship between the value of the maximum IP datagram size
and the maximum TCP segment size is obscure. The problem is that
both the IP header and the TCP header may vary in length. The TCP
Maximum Segment Size option (MSS) is defined to specify the maximum
number of data octets in a TCP segment exclusive of TCP (or IP)
header.
To notify the data sender of the largest TCP segment it is possible
to receive, the calculation of the MSS value to send is:
MSS = MTU - sizeof(TCPHDR) - sizeof(IPHDR)
On receipt of the MSS option the calculation of the size of segment
that can be sent is:
SndMaxSegSiz = MIN((MTU - sizeof(TCPHDR) - sizeof(IPHDR)), MSS)
Postel [Page 4]
RFC 879 November 1983
TCP Maximum Segment Size
where MSS is the value in the option, and MTU is the Maximum
Transmission Unit (or the maximum packet size) allowed on the
directly attached network.
This begs the question, though. What value should be used for the
"sizeof(TCPHDR)" and for the "sizeof(IPHDR)"?
There are three reasonable positions to take: the conservative, the
moderate, and the liberal.
The conservative or pessimistic position assumes the worst -- that
both the IP header and the TCP header are maximum size, that is, 60
octets each.
MSS = MTU - 60 - 60 = MTU - 120
If MTU is 576 then MSS = 456
The moderate position assumes that the IP header is maximum size (60
octets) and the TCP header is minimum size (20 octets), because there
are no TCP header options currently defined that would normally be
sent at the same time as data segments.
MSS = MTU - 60 - 20 = MTU - 80
If MTU is 576 then MSS = 496
The liberal or optimistic position assumes the best -- that both the
IP header and the TCP header are minimum size, that is, 20 octets
each.
MSS = MTU - 20 - 20 = MTU - 40
If MTU is 576 then MSS = 536
If nothing is said about MSS, the data sender may cram as much as
possible into a 576 octet datagram, and if the datagram has
minimum headers (which is most likely), the result will be 536
data octets in the TCP segment. The rule relating MSS to the
maximum datagram size ought to be consistent with this.
A practical point is raised in favor of the liberal position too.
Since the use of minimum IP and TCP headers is very likely in the
very large percentage of cases, it seems wasteful to limit the TCP
segment data to so much less than could be transmitted at once,
especially since it is less than 512 octets.
Postel [Page 5]
RFC 879 November 1983
TCP Maximum Segment Size
For comparison: 536/576 is 93% data, 496/576 is 86% data, 456/576
is 79% data.
8. Maximum Packet Size
Each network has some maximum packet size, or maximum transmission
unit (MTU). Ultimately there is some limit imposed by the
technology, but often the limit is an engineering choice or even an
administrative choice. Different installations of the same network
product do not have to use the same maximum packet size. Even within
one installation not all hosts must use the same packet size (this way
lies madness, though).
Some IP implementers have assumed that all hosts on the directly
attached network will be the same or at least run the same
implementation. This is a dangerous assumption. It has often
developed that after a small homogeneous set of hosts have become
operational additional hosts of different types are introduced into
the environment. And it has often developed that it is desired to
use a copy of the implementation in a different inhomogeneous
environment.
Designers of gateways should be prepared for the fact that successful
gateways will be copied and used in other situations and
installations. Gateways must be prepared to accept datagrams as
large as can be sent in the maximum packets of the directly attached
networks. Gateway implementations should be easily configured for
installation in different circumstances.
A footnote: The MTUs of some popular networks (note that the actual
limit in some installations may be set lower by administrative
policy):
ARPANET, MILNET = 1007
Ethernet (10Mb) = 1500
Proteon PRONET = 2046
9. Source Fragmentation
A source host would not normally create datagram fragments. Under
normal circumstances datagram fragments only arise when a gateway
must send a datagram into a network with a smaller maximum packet
size than the datagram. In this case the gateway must fragment the
datagram (unless it is marked "don't fragment" in which case it is
discarded, with the option of sending an ICMP message to the source
reporting the problem).
It might be desirable for the source host to send datagram fragments
Postel [Page 6]
RFC 879 November 1983
TCP Maximum Segment Size
if the maximum segment size (default or negotiated) allowed by the
data receiver were larger than the maximum packet size allowed by the
directly attached network. However, such datagram fragments must not
combine to a size larger than allowed by the destination host.
For example, if the receiving TCP announced that it would accept
segments up to 5000 octets (in cooperation with the receiving IP)
then the sending TCP could give such a large segment to the
sending IP provided the sending IP would send it in datagram
fragments that fit in the packets of the directly attached
network.
There are some conditions where source host fragmentation would be
necessary.
If the host is attached to a network with a small packet size (for
example 256 octets), and it supports an application defined to
send fixed sized messages larger than that packet size (for
example TFTP [5]).
If the host receives ICMP Echo messages with data it is required
to send an ICMP Echo-Reply message with the same data. If the
amount of data in the Echo were larger than the packet size of the
directly attached network the following steps might be required:
(1) receive the fragments, (2) reassemble the datagram, (3)
interpret the Echo, (4) create an Echo-Reply, (5) fragment it, and
(6) send the fragments.
10. Gateway Fragmentation
Gateways must be prepared to do fragmentation. It is not an optional
feature for a gateway.
Gateways have no information about the size of datagrams destination
hosts are prepared to accept. It would be inappropriate for gateways
to attempt to keep such information.
Gateways must be prepared to accept the largest datagrams that are
allowed on each of the directly attached networks, even if it is
larger than 576 octets.
Gateways must be prepared to fragment datagrams to fit into the
packets of the next network, even if it is smaller than 576 octets.
If a source host thought to take advantage of the local network's
ability to carry larger datagrams but doesn't have the slightest idea
if the destination host can accept larger than default datagrams and
expects the gateway to fragment the datagram into default size
Postel [Page 7]
RFC 879 November 1983
TCP Maximum Segment Size
fragments, then the source host is misguided. If indeed, the
destination host can't accept larger than default datagrams, it
probably can't reassemble them either. If the gateway either passes
on the large datagram whole or fragments into default size fragments
the destination will not accept it. Thus, this mode of behavior by
source hosts must be outlawed.
A larger than default datagram can only arrive at a gateway because
the source host knows that the destination host can handle such large
datagrams (probably because the destination host announced it to the
source host in a TCP MSS option). Thus, the gateway should pass on
this large datagram in one piece or in the largest fragments that fit
into the next network.
An interesting footnote is that even though the gateways may know
about the 576 rule, it is irrelevant to them.
11. Inter-Layer Communication
The Network Driver (ND) or interface should know the Maximum
Transmission Unit (MTU) of the directly attached network.
The IP should ask the Network Driver for the Maximum Transmission
Unit.
The TCP should ask the IP for the Maximum Datagram Data Size (MDDS).
This is the MTU minus the IP header length (MDDS = MTU - IPHdrLen).
When opening a connection TCP can send an MSS option with the value
equal MDDS - TCPHdrLen.
TCP should determine the Maximum Segment Data Size (MSDS) from either
the default or the received value of the MSS option.
TCP should determine if source fragmentation is possible (by asking
the IP) and desirable.
If so TCP may hand to IP segments (including the TCP header) up to
MSDS + TCPHdrLen.
If not TCP may hand to IP segments (including the TCP header) up
to the lesser of (MSDS + TCPHdrLen) and MDDS.
IP checks the length of data passed to it by TCP. If the length is
less than or equal to MDDS, IP attaches the IP header and hands it to
the ND. Otherwise the IP must do source fragmentation.
Postel [Page 8]
RFC 879 November 1983
TCP Maximum Segment Size
12. What is the Default MSS ?
Another way of asking this question is "What transmitted value for
MSS has exactly the same effect of not transmitting the option at
all?".
In terms of the previous section:
The default assumption is that the Maximum Transmission Unit is
576 octets.
MTU = 576
The Maximum Datagram Data Size (MDDS) is the MTU minus the IP
header length.
MDDS = MTU - IPHdrLen = 576 - 20 = 556
When opening a connection TCP can send an MSS option with the
value equal MDDS - TCPHdrLen.
MSS = MDDS - TCPHdrLen = 556 - 20 = 536
TCP should determine the Maximum Segment Data Size (MSDS) from
either the default or the received value of the MSS option.
Default MSS = 536, then MSDS = 536
TCP should determine if source fragmentation is possible and
desirable.
If so TCP may hand to IP segments (including the TCP header) up
to MSDS + TCPHdrLen (536 + 20 = 556).
If not TCP may hand to IP segments (including the TCP header)
up to the lesser of (MSDS + TCPHdrLen (536 + 20 = 556)) and
MDDS (556).
Postel [Page 9]
RFC 879 November 1983
TCP Maximum Segment Size
13. The Truth
The rule relating the maximum IP datagram size and the maximum TCP
segment size is:
TCP Maximum Segment Size = IP Maximum Datagram Size - 40
The rule must match the default case.
If the TCP Maximum Segment Size option is not transmitted then the
data sender is allowed to send IP datagrams of maximum size (576)
with a minimum IP header (20) and a minimum TCP header (20) and
thereby be able to stuff 536 octets of data into each TCP segment.
The definition of the MSS option can be stated:
The maximum number of data octets that may be received by the
sender of this TCP option in TCP segments with no TCP header
options transmitted in IP datagrams with no IP header options.
14. The Consequences
When TCP is used in a situation when either the IP or TCP headers are
not minimum and yet the maximum IP datagram that can be received
remains 576 octets then the TCP Maximum Segment Size option must be
used to reduce the limit on data octets allowed in a TCP segment.
For example, if the IP Security option (11 octets) were in use and
the IP maximum datagram size remained at 576 octets, then the TCP
should send the MSS with a value of 525 (536-11).
Postel [Page 10]
RFC 879 November 1983
TCP Maximum Segment Size
15. References
[1] Postel, J., ed., "Transmission Control Protocol - DARPA Internet
Program Protocol Specification", RFC 793, USC/Information
Sciences Institute, September 1981.
[2] Postel, J., ed., "Internet Protocol - DARPA Internet Program
Protocol Specification", RFC 791, USC/Information Sciences
Institute, September 1981.
[3] Postel, J., "Internet Control Message Protocol - DARPA Internet
Program Protocol Specification", RFC 792, USC/Information
Sciences Institute, September 1981.
[4] Plummer, D., "An Ethernet Address Resolution Protocol or
Converting Network Protocol Addresses to 48-bit Ethernet
Addresses for Transmission on Ethernet Hardware", RFC 826,
MIT/LCS, November 1982.
[5] Sollins, K., "The TFTP Protocol (Revision 2)", RFC 783, MIT/LCS,
June 1981.
Postel [Page 11]

View File

@ -0,0 +1,512 @@
Network Working Group John Nagle
Request For Comments: 896 6 January 1984
Ford Aerospace and Communications Corporation
Congestion Control in IP/TCP Internetworks
This memo discusses some aspects of congestion control in IP/TCP
Internetworks. It is intended to stimulate thought and further
discussion of this topic. While some specific suggestions are
made for improved congestion control implementation, this memo
does not specify any standards.
Introduction
Congestion control is a recognized problem in complex networks.
We have discovered that the Department of Defense's Internet Pro-
tocol (IP) , a pure datagram protocol, and Transmission Control
Protocol (TCP), a transport layer protocol, when used together,
are subject to unusual congestion problems caused by interactions
between the transport and datagram layers. In particular, IP
gateways are vulnerable to a phenomenon we call "congestion col-
lapse", especially when such gateways connect networks of widely
different bandwidth. We have developed solutions that prevent
congestion collapse.
These problems are not generally recognized because these proto-
cols are used most often on networks built on top of ARPANET IMP
technology. ARPANET IMP based networks traditionally have uni-
form bandwidth and identical switching nodes, and are sized with
substantial excess capacity. This excess capacity, and the abil-
ity of the IMP system to throttle the transmissions of hosts has
for most IP / TCP hosts and networks been adequate to handle
congestion. With the recent split of the ARPANET into two inter-
connected networks and the growth of other networks with differ-
ing properties connected to the ARPANET, however, reliance on the
benign properties of the IMP system is no longer enough to allow
hosts to communicate rapidly and reliably. Improved handling of
congestion is now mandatory for successful network operation
under load.
Ford Aerospace and Communications Corporation, and its parent
company, Ford Motor Company, operate the only private IP/TCP
long-haul network in existence today. This network connects four
facilities (one in Michigan, two in California, and one in Eng-
land) some with extensive local networks. This net is cross-tied
to the ARPANET but uses its own long-haul circuits; traffic
between Ford facilities flows over private leased circuits,
including a leased transatlantic satellite connection. All
switching nodes are pure IP datagram switches with no node-to-
node flow control, and all hosts run software either written or
heavily modified by Ford or Ford Aerospace. Bandwidth of links
in this network varies widely, from 1200 to 10,000,000 bits per
second. In general, we have not been able to afford the luxury
of excess long-haul bandwidth that the ARPANET possesses, and our
long-haul links are heavily loaded during peak periods. Transit
times of several seconds are thus common in our network.
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
Because of our pure datagram orientation, heavy loading, and wide
variation in bandwidth, we have had to solve problems that the
ARPANET / MILNET community is just beginning to recognize. Our
network is sensitive to suboptimal behavior by host TCP implemen-
tations, both on and off our own net. We have devoted consider-
able effort to examining TCP behavior under various conditions,
and have solved some widely prevalent problems with TCP. We
present here two problems and their solutions. Many TCP imple-
mentations have these problems; if throughput is worse through an
ARPANET / MILNET gateway for a given TCP implementation than
throughput across a single net, there is a high probability that
the TCP implementation has one or both of these problems.
Congestion collapse
Before we proceed with a discussion of the two specific problems
and their solutions, a description of what happens when these
problems are not addressed is in order. In heavily loaded pure
datagram networks with end to end retransmission, as switching
nodes become congested, the round trip time through the net
increases and the count of datagrams in transit within the net
also increases. This is normal behavior under load. As long as
there is only one copy of each datagram in transit, congestion is
under control. Once retransmission of datagrams not yet
delivered begins, there is potential for serious trouble.
Host TCP implementations are expected to retransmit packets
several times at increasing time intervals until some upper limit
on the retransmit interval is reached. Normally, this mechanism
is enough to prevent serious congestion problems. Even with the
better adaptive host retransmission algorithms, though, a sudden
load on the net can cause the round-trip time to rise faster than
the sending hosts' measurements of round-trip time can be updated.
Such a load occurs when a new bulk transfer, such as a file
transfer, begins and starts filling a large window. Should the
round-trip time exceed the maximum retransmission interval for
any host, that host will begin to introduce more and more copies
of the same datagrams into the net. The network is now in seri-
ous trouble. Eventually all available buffers in the switching
nodes will be full and packets must be dropped. The round-trip
time for packets that are delivered is now at its maximum. Hosts
are sending each packet several times, and eventually some copy
of each packet arrives at its destination. This is congestion
collapse.
This condition is stable. Once the saturation point has been
reached, if the algorithm for selecting packets to be dropped is
fair, the network will continue to operate in a degraded condi-
tion. In this condition every packet is being transmitted
several times and throughput is reduced to a small fraction of
normal. We have pushed our network into this condition experi-
mentally and observed its stability. It is possible for round-
trip time to become so large that connections are broken because
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
the hosts involved time out.
Congestion collapse and pathological congestion are not normally
seen in the ARPANET / MILNET system because these networks have
substantial excess capacity. Where connections do not pass
through IP gateways, the IMP-to-host flow control mechanisms usu-
ally prevent congestion collapse, especially since TCP implemen-
tations tend to be well adjusted for the time constants associ-
ated with the pure ARPANET case. However, other than ICMP Source
Quench messages, nothing fundamentally prevents congestion col-
lapse when TCP is run over the ARPANET / MILNET and packets are
being dropped at gateways. Worth noting is that a few badly-
behaved hosts can by themselves congest the gateways and prevent
other hosts from passing traffic. We have observed this problem
repeatedly with certain hosts (with whose administrators we have
communicated privately) on the ARPANET.
Adding additional memory to the gateways will not solve the prob-
lem. The more memory added, the longer round-trip times must
become before packets are dropped. Thus, the onset of congestion
collapse will be delayed but when collapse occurs an even larger
fraction of the packets in the net will be duplicates and
throughput will be even worse.
The two problems
Two key problems with the engineering of TCP implementations have
been observed; we call these the small-packet problem and the
source-quench problem. The second is being addressed by several
implementors; the first is generally believed (incorrectly) to be
solved. We have discovered that once the small-packet problem
has been solved, the source-quench problem becomes much more
tractable. We thus present the small-packet problem and our
solution to it first.
The small-packet problem
There is a special problem associated with small packets. When
TCP is used for the transmission of single-character messages
originating at a keyboard, the typical result is that 41 byte
packets (one byte of data, 40 bytes of header) are transmitted
for each byte of useful data. This 4000% overhead is annoying
but tolerable on lightly loaded networks. On heavily loaded net-
works, however, the congestion resulting from this overhead can
result in lost datagrams and retransmissions, as well as exces-
sive propagation time caused by congestion in switching nodes and
gateways. In practice, throughput may drop so low that TCP con-
nections are aborted.
This classic problem is well-known and was first addressed in the
Tymnet network in the late 1960s. The solution used there was to
impose a limit on the count of datagrams generated per unit time.
This limit was enforced by delaying transmission of small packets
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
until a short (200-500ms) time had elapsed, in hope that another
character or two would become available for addition to the same
packet before the timer ran out. An additional feature to
enhance user acceptability was to inhibit the time delay when a
control character, such as a carriage return, was received.
This technique has been used in NCP Telnet, X.25 PADs, and TCP
Telnet. It has the advantage of being well-understood, and is not
too difficult to implement. Its flaw is that it is hard to come
up with a time limit that will satisfy everyone. A time limit
short enough to provide highly responsive service over a 10M bits
per second Ethernet will be too short to prevent congestion col-
lapse over a heavily loaded net with a five second round-trip
time; and conversely, a time limit long enough to handle the
heavily loaded net will produce frustrated users on the Ethernet.
The solution to the small-packet problem
Clearly an adaptive approach is desirable. One would expect a
proposal for an adaptive inter-packet time limit based on the
round-trip delay observed by TCP. While such a mechanism could
certainly be implemented, it is unnecessary. A simple and
elegant solution has been discovered.
The solution is to inhibit the sending of new TCP segments when
new outgoing data arrives from the user if any previously
transmitted data on the connection remains unacknowledged. This
inhibition is to be unconditional; no timers, tests for size of
data received, or other conditions are required. Implementation
typically requires one or two lines inside a TCP program.
At first glance, this solution seems to imply drastic changes in
the behavior of TCP. This is not so. It all works out right in
the end. Let us see why this is so.
When a user process writes to a TCP connection, TCP receives some
data. It may hold that data for future sending or may send a
packet immediately. If it refrains from sending now, it will
typically send the data later when an incoming packet arrives and
changes the state of the system. The state changes in one of two
ways; the incoming packet acknowledges old data the distant host
has received, or announces the availability of buffer space in
the distant host for new data. (This last is referred to as
"updating the window"). Each time data arrives on a connec-
tion, TCP must reexamine its current state and perhaps send some
packets out. Thus, when we omit sending data on arrival from the
user, we are simply deferring its transmission until the next
message arrives from the distant host. A message must always
arrive soon unless the connection was previously idle or communi-
cations with the other end have been lost. In the first case,
the idle connection, our scheme will result in a packet being
sent whenever the user writes to the TCP connection. Thus we do
not deadlock in the idle condition. In the second case, where
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
the distant host has failed, sending more data is futile anyway.
Note that we have done nothing to inhibit normal TCP retransmis-
sion logic, so lost messages are not a problem.
Examination of the behavior of this scheme under various condi-
tions demonstrates that the scheme does work in all cases. The
first case to examine is the one we wanted to solve, that of the
character-oriented Telnet connection. Let us suppose that the
user is sending TCP a new character every 200ms, and that the
connection is via an Ethernet with a round-trip time including
software processing of 50ms. Without any mechanism to prevent
small-packet congestion, one packet will be sent for each charac-
ter, and response will be optimal. Overhead will be 4000%, but
this is acceptable on an Ethernet. The classic timer scheme,
with a limit of 2 packets per second, will cause two or three
characters to be sent per packet. Response will thus be degraded
even though on a high-bandwidth Ethernet this is unnecessary.
Overhead will drop to 1500%, but on an Ethernet this is a bad
tradeoff. With our scheme, every character the user types will
find TCP with an idle connection, and the character will be sent
at once, just as in the no-control case. The user will see no
visible delay. Thus, our scheme performs as well as the no-
control scheme and provides better responsiveness than the timer
scheme.
The second case to examine is the same Telnet test but over a
long-haul link with a 5-second round trip time. Without any
mechanism to prevent small-packet congestion, 25 new packets
would be sent in 5 seconds.* Overhead here is 4000%. With the
classic timer scheme, and the same limit of 2 packets per second,
there would still be 10 packets outstanding and contributing to
congestion. Round-trip time will not be improved by sending many
packets, of course; in general it will be worse since the packets
will contend for line time. Overhead now drops to 1500%. With
our scheme, however, the first character from the user would find
an idle TCP connection and would be sent immediately. The next
24 characters, arriving from the user at 200ms intervals, would
be held pending a message from the distant host. When an ACK
arrived for the first packet at the end of 5 seconds, a single
packet with the 24 queued characters would be sent. Our scheme
thus results in an overhead reduction to 320% with no penalty in
response time. Response time will usually be improved with our
scheme because packet overhead is reduced, here by a factor of
4.7 over the classic timer scheme. Congestion will be reduced by
this factor and round-trip delay will decrease sharply. For this
________
* This problem is not seen in the pure ARPANET case because the
IMPs will block the host when the count of packets
outstanding becomes excessive, but in the case where a pure
datagram local net (such as an Ethernet) or a pure datagram
gateway (such as an ARPANET / MILNET gateway) is involved, it
is possible to have large numbers of tiny packets
outstanding.
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
case, our scheme has a striking advantage over either of the
other approaches.
We use our scheme for all TCP connections, not just Telnet con-
nections. Let us see what happens for a file transfer data con-
nection using our technique. The two extreme cases will again be
considered.
As before, we first consider the Ethernet case. The user is now
writing data to TCP in 512 byte blocks as fast as TCP will accept
them. The user's first write to TCP will start things going; our
first datagram will be 512+40 bytes or 552 bytes long. The
user's second write to TCP will not cause a send but will cause
the block to be buffered. Assume that the user fills up TCP's
outgoing buffer area before the first ACK comes back. Then when
the ACK comes in, all queued data up to the window size will be
sent. From then on, the window will be kept full, as each ACK
initiates a sending cycle and queued data is sent out. Thus,
after a one round-trip time initial period when only one block is
sent, our scheme settles down into a maximum-throughput condi-
tion. The delay in startup is only 50ms on the Ethernet, so the
startup transient is insignificant. All three schemes provide
equivalent performance for this case.
Finally, let us look at a file transfer over the 5-second round
trip time connection. Again, only one packet will be sent until
the first ACK comes back; the window will then be filled and kept
full. Since the round-trip time is 5 seconds, only 512 bytes of
data are transmitted in the first 5 seconds. Assuming a 2K win-
dow, once the first ACK comes in, 2K of data will be sent and a
steady rate of 2K per 5 seconds will be maintained thereafter.
Only for this case is our scheme inferior to the timer scheme,
and the difference is only in the startup transient; steady-state
throughput is identical. The naive scheme and the timer scheme
would both take 250 seconds to transmit a 100K byte file under
the above conditions and our scheme would take 254 seconds, a
difference of 1.6%.
Thus, for all cases examined, our scheme provides at least 98% of
the performance of both other schemes, and provides a dramatic
improvement in Telnet performance over paths with long round trip
times. We use our scheme in the Ford Aerospace Software
Engineering Network, and are able to run screen editors over Eth-
ernet and talk to distant TOPS-20 hosts with improved performance
in both cases.
Congestion control with ICMP
Having solved the small-packet congestion problem and with it the
problem of excessive small-packet congestion within our own net-
work, we turned our attention to the problem of general conges-
tion control. Since our own network is pure datagram with no
node-to-node flow control, the only mechanism available to us
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
under the IP standard was the ICMP Source Quench message. With
careful handling, we find this adequate to prevent serious
congestion problems. We do find it necessary to be careful about
the behavior of our hosts and switching nodes regarding Source
Quench messages.
When to send an ICMP Source Quench
The present ICMP standard* specifies that an ICMP Source Quench
message should be sent whenever a packet is dropped, and addi-
tionally may be sent when a gateway finds itself becoming short
of resources. There is some ambiguity here but clearly it is a
violation of the standard to drop a packet without sending an
ICMP message.
Our basic assumption is that packets ought not to be dropped dur-
ing normal network operation. We therefore want to throttle
senders back before they overload switching nodes and gateways.
All our switching nodes send ICMP Source Quench messages well
before buffer space is exhausted; they do not wait until it is
necessary to drop a message before sending an ICMP Source Quench.
As demonstrated in our analysis of the small-packet problem,
merely providing large amounts of buffering is not a solution.
In general, our experience is that Source Quench should be sent
when about half the buffering space is exhausted; this is not
based on extensive experimentation but appears to be a reasonable
engineering decision. One could argue for an adaptive scheme
that adjusted the quench generation threshold based on recent
experience; we have not found this necessary as yet.
There exist other gateway implementations that generate Source
Quenches only after more than one packet has been discarded. We
consider this approach undesirable since any system for control-
ling congestion based on the discarding of packets is wasteful of
bandwidth and may be susceptible to congestion collapse under
heavy load. Our understanding is that the decision to generate
Source Quenches with great reluctance stems from a fear that ack-
nowledge traffic will be quenched and that this will result in
connection failure. As will be shown below, appropriate handling
of Source Quench in host implementations eliminates this possi-
bility.
What to do when an ICMP Source Quench is received
We inform TCP or any other protocol at that layer when ICMP
receives a Source Quench. The basic action of our TCP implemen-
tations is to reduce the amount of data outstanding on connec-
tions to the host mentioned in the Source Quench. This control is
________
* ARPANET RFC 792 is the present standard. We are advised by
the Defense Communications Agency that the description of
ICMP in MIL-STD-1777 is incomplete and will be deleted from
future revision of that standard.
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
applied by causing the sending TCP to behave as if the distant
host's window size has been reduced. Our first implementation
was simplistic but effective; once a Source Quench has been
received our TCP behaves as if the window size is zero whenever
the window isn't empty. This behavior continues until some
number (at present 10) of ACKs have been received, at which time
TCP returns to normal operation.* David Mills of Linkabit Cor-
poration has since implemented a similar but more elaborate
throttle on the count of outstanding packets in his DCN systems.
The additional sophistication seems to produce a modest gain in
throughput, but we have not made formal tests. Both implementa-
tions effectively prevent congestion collapse in switching nodes.
Source Quench thus has the effect of limiting the connection to a
limited number (perhaps one) of outstanding messages. Thus, com-
munication can continue but at a reduced rate, which is exactly
the effect desired.
This scheme has the important property that Source Quench doesn't
inhibit the sending of acknowledges or retransmissions. Imple-
mentations of Source Quench entirely within the IP layer are usu-
ally unsuccessful because IP lacks enough information to throttle
a connection properly. Holding back acknowledges tends to pro-
duce retransmissions and thus unnecessary traffic. Holding back
retransmissions may cause loss of a connection by a retransmis-
sion timeout. Our scheme will keep connections alive under
severe overload but at reduced bandwidth per connection.
Other protocols at the same layer as TCP should also be respon-
sive to Source Quench. In each case we would suggest that new
traffic should be throttled but acknowledges should be treated
normally. The only serious problem comes from the User Datagram
Protocol, not normally a major traffic generator. We have not
implemented any throttling in these protocols as yet; all are
passed Source Quench messages by ICMP but ignore them.
Self-defense for gateways
As we have shown, gateways are vulnerable to host mismanagement
of congestion. Host misbehavior by excessive traffic generation
can prevent not only the host's own traffic from getting through,
but can interfere with other unrelated traffic. The problem can
be dealt with at the host level but since one malfunctioning host
can interfere with others, future gateways should be capable of
defending themselves against such behavior by obnoxious or mali-
cious hosts. We offer some basic self-defense techniques.
On one occasion in late 1983, a TCP bug in an ARPANET host caused
the host to frantically generate retransmissions of the same
datagram as fast as the ARPANET would accept them. The gateway
________
* This follows the control engineering dictum "Never bother
with proportional control unless bang-bang doesn't work".
RFC 896 Congestion Control in IP/TCP Internetworks 1/6/84
that connected our net with the ARPANET was saturated and little
useful traffic could get through, since the gateway had more
bandwidth to the ARPANET than to our net. The gateway busily
sent ICMP Source Quench messages but the malfunctioning host
ignored them. This continued for several hours, until the mal-
functioning host crashed. During this period, our network was
effectively disconnected from the ARPANET.
When a gateway is forced to discard a packet, the packet is
selected at the discretion of the gateway. Classic techniques
for making this decision are to discard the most recently
received packet, or the packet at the end of the longest outgoing
queue. We suggest that a worthwhile practical measure is to dis-
card the latest packet from the host that originated the most
packets currently queued within the gateway. This strategy will
tend to balance throughput amongst the hosts using the gateway.
We have not yet tried this strategy, but it seems a reasonable
starting point for gateway self-protection.
Another strategy is to discard a newly arrived packet if the
packet duplicates a packet already in the queue. The computa-
tional load for this check is not a problem if hashing techniques
are used. This check will not protect against malicious hosts
but will provide some protection against TCP implementations with
poor retransmission control. Gateways between fast local net-
works and slower long-haul networks may find this check valuable
if the local hosts are tuned to work well with the local network.
Ideally the gateway should detect malfunctioning hosts and
squelch them; such detection is difficult in a pure datagram sys-
tem. Failure to respond to an ICMP Source Quench message,
though, should be regarded as grounds for action by a gateway to
disconnect a host. Detecting such failure is non-trivial but is
a worthwhile area for further research.
Conclusion
The congestion control problems associated with pure datagram
networks are difficult, but effective solutions exist. If IP /
TCP networks are to be operated under heavy load, TCP implementa-
tions must address several key issues in ways at least as effec-
tive as the ones described here.

View File

@ -0,0 +1,570 @@
Network Working Group Deepinder P. Sidhu
Request for Comments: 964 Thomas P. Blumer
SDC - A Burroughs Company
November 1985
SOME PROBLEMS WITH THE SPECIFICATION OF THE
MILITARY STANDARD TRANSMISSION CONTROL PROTOCOL
STATUS OF THIS MEMO
The purpose of this RFC is to provide helpful information on the
Military Standard Transmission Control Protocol (MIL-STD-1778) so
that one can obtain a reliable implementation of this protocol
standard. Distribution of this note is unlimited.
Reprinted from: Proc. Protocol Specification, Testing and
Verification IV, (ed.) Y. Yemini, et al, North-Holland (1984).
ABSTRACT
This note points out three errors with the specification of the
Military Standard Transmission Control Protocol (MIL-STD-1778, dated
August 1983 [MILS83]). These results are based on an initial
investigation of this protocol standard. The first problem is that
data accompanying a SYN can not be accepted because of errors in the
acceptance policy. The second problem is that no retransmission
timer is set for a SYN packet, and therefore the SYN will not be
retransmitted if it is lost. The third problem is that when the
connection has been established, neither entity takes the proper
steps to accept incoming data. This note also proposes solutions to
these problems.
1. Introduction
In recent years, much progress has been made in creating an
integrated set of tools for developing reliable communication
protocols. These tools provide assistance in the specification,
verification, implementation and testing of protocols. Several
protocols have been analyzed and developed using such tools.
In a recent paper, the authors discussed the verification of the
connection management of NBS class 4 transport protocol (TP4). The
verification was carried out with the help of a software tool we
developed [BLUT82] [BLUT83] [SIDD83]. In spite of the very precise
specification of this protocol, our analysis discovered several
errors in the current specification of NBS TP4. These errors are
incompleteness errors in the specification, that is, states where
there is no transition for the reception of some input event. Our
analysis did not find deadlocks, livelocks or any other problem in
the connection management of TP4. In that paper, we proposed
Sidhu & Blumer [Page 1]
RFC 964 November 1985
Some Problems with MIL-STD TCP
solutions for all errors except for errors associated with 2 states
whose satisfactory resolution may require redesigning parts of TP4.
Modifications to TP4 specification are currently underway to solve
the remaining incompleteness problems with 2 states. It is important
to emphasize that we did not find any obvious error in the NBS
specification of TP4.
The authors are currently working on the verification of connection
management of the Military Standard Transmission Control Protocol
(TCP). This analysis will be based on the published specification
[MILS83] of TCP dated 12 August 1983.
While studying the MIL standard TCP specification in preparation for
our analysis of the connection management features, we have noticed
several errors in the specification. As a consequence of these
errors, the Transmission Control Protocol (as specified in [MILS83])
will not permit data to be received by TCP entities in SYN_RECVD and
ESTAB states.
The proof of this statement follows from the specification of the
three-way handshake mechanism of TCP [MILS83] and from a decision
table associated with ESTAB state.
2. Transmission Control Protocol
The Transmission Control Protocol (TCP) is a transport level
connection-oriented protocol in the DoD protocol hierarchy for use in
packet-switched and other networks. Its most important services are
reliable transfer and ordered delivery of data over full-duplex and
flow-controlled virtual connections. TCP is designed to operate
successfully over channels that are inherently unreliable, i.e., they
can lose, damage, duplicate, and reorder packets.
TCP is based, in part, on a protocol discussed by Cerf and Kahn
[CERV74]. Over the years, DARPA has supported specifications of
several versions of this protocol, the last one appeared in [POSJ81].
Some issues in the connection management of this protocol are
discussed in [SUNC78].
A few years ago, DCA decided to standardize TCP for use in DoD
networks and supported formal specification of this protocol
following the design of this protocol discussed in [POSJ81]. A
detailed specification of this protocol given in [MILS83] has been
adopted as the DoD standard for the Transmission Control Protocol, a
reliable connection-oriented transport protocol for DoD networks.
A TCP connection progresses through three phases: opening (or
Sidhu & Blumer [Page 2]
RFC 964 November 1985
Some Problems with MIL-STD TCP
synchronization), maintenance, and closing. In this note we consider
data transfer in the opening and maintenance phases of the
connection.
3. Problems with MIL Standard TCP
One basic feature of TCP is the three-way handshake which is used to
set up a properly synchronized connection between two remote TCP
entities. This mechanism is incorrectly specified in the current
specification of TCP. One problem is that data associated with the
SYN packet can not be delivered. This results from an incorrect
specification of the interaction between the accept_policy action
procedure and the record_syn action procedure. Neither of the 2
possible strategies suggested in accept_policy will give the correct
result when called from the record_syn procedure, because the
recv_next variable is updated in record_syn before the accept_policy
procedure is called.
Another problem with the specification of the three-way handshake is
apparent in the actions listed for the Active Open event (with or
without data) when in the CLOSED state. No retransmission timer is
set in these actions, and therefore if the initial SYN is lost, there
will be no timer expiration to trigger retransmission. This will
prevent connection establishment if the initial SYN packet is lost by
the network.
The third problem with the specification is that the actions for
receiving data in the ESTAB state are incorrect. The accept action
procedure must be called when data is received, so that arriving data
may be queued and possibly passed to the user.
A general problem with this specification is that the program
language and action table portions of the specification were clearly
not checked by any automatic syntax checking process. Several
variable and procedure names are misspelled, and the syntax of the
action statements is often incorrect. This can be confusing,
especially when a procedure name cannot be found in the alphabetized
list of procedures because of misspelling.
These are some of the very serious errors that we have discovered
with the MIL standard TCP.
Sidhu & Blumer [Page 3]
RFC 964 November 1985
Some Problems with MIL-STD TCP
4. Detailed Discussion of the Problem
Problem 1: Problem with Receiving Data Accompanying SYN
The following scenario traces the actions of 2 communicating
entities during the establishment of a connection. Only the
simplest case is considered, i.e., the case where the connection
is established by the exchange of 3 segments.
TCP entity A TCP entity B
------------ ------------
state segment segment state
transition recvd or sent recvd or sent transition
by A by B
CLOSED -> LISTEN
CLOSED -> SYN_SENT SYN -->
SYN --> LISTEN -> SYN_RECVD
<-- SYN ACK
SYN_SENT -> ESTAB <-- SYN ACK
ACK -->
ACK --> SYN_RECVD -> ESTAB
As shown in the above diagram, 5 state transitions occur and 3 TCP
segments are exchanged during the simplest case of the three-way
handshake. We now examine in detail the actions of each entity
during this exchange. Special attention is given to the sequence
numbers carried in each packet and recorded in the state variables
of each entity.
In the diagram below, the actions occurring within a procedure are
shown indented from the procedure call. The resulting values of
sequence number variables are shown in square brackets to the
right of each statement. The sequence number variables are shown
with the entity name (A or B) as prefix so that the two sets of
state variables may be easily distinguished.
Sidhu & Blumer [Page 4]
RFC 964 November 1985
Some Problems with MIL-STD TCP
Transition 1 (entity B goes from state CLOSED to state LISTEN).
The user associated with entity B issues a Passive Open.
Actions: (see p. 104)
open; (see p. 144)
new state := LISTEN;
Transition 2 (entity A goes from state CLOSED to SYN_SENT). The
user associated with entity A issues an Active Open with Data.
Actions: (see p. 104)
open; (see p. 144)
gen_syn(WITH_DATA); (see p. 141)
send_isn := gen_isn(); [A.send_isn = 100]
send_next := send_isn + 1; [A.send_next = 101]
send_una := send_isn; [A.send_una = 100]
seg.seq_num := send_isn; [seg.seq_num = 100]
seg.ack_flag := FALSE; [seg.ack_flag = FALSE]
seg.wndw := 0; [seg.wndw = 0]
amount := send_policy() [assume amount > 0]
new state := SYN_SENT;
Sidhu & Blumer [Page 5]
RFC 964 November 1985
Some Problems with MIL-STD TCP
Transition 3 (Entity B goes from state LISTEN to state SYN_RECVD).
Entity B receives the SYN segment accompanying data sent by entity
A.
Actions: (see p. 106)
(since this segment has no RESET, no ACK, does have SYN, and
we assume reasonable security and precedence parameters, row
3 of the table applies)
record_syn; (see p. 147)
recv_isn := seg.seq_num; [B.recv_isn = seg.seq_num = 100]
recv_next := recv_isn + 1; [B.recv_next = 101]
if seg.ack_flag then
send_una := seg.ack_num; [no change]
accept_policy; (see p. 131)
Accept in-order data only:
Acceptance Test is
seg.seq_num = recv_next;
Accept any data within the receive window:
Acceptance Test has two parts
recv_next =< seg.seq_num =< recv_next +
recv_wndw
or
recv_next =< seg.seq_num + length =<
recv_next + recv_wndw
********************************************
An error occurs here, with either possible
strategy given in accept_policy, because
recv_next > seg.seq_num. Therefore
accept_policy will incorrectly indicate that
the data cannot be accepted.
********************************************
gen_syn(WITH_ACK); (see p. 141)
send_isn := gen_isn(); [B.send_isn = 300]
send_next := send_isn + 1; [B.send_next = 301]
send_una := send_isn; [B.send_una = 300]
seg.seq_num := send_next; [seg.seq_num = 301]
seg.ack_flag := TRUE; [seg.ack_flag = TRUE]
seg.ack_num := recv_isn + 1; [seg.ack_num = 101]
new state := SYN_RECVD;
Sidhu & Blumer [Page 6]
RFC 964 November 1985
Some Problems with MIL-STD TCP
Transition 4 (entity A goes from state SYN_SENT to ESTAB) Entity A
receives the SYN ACK sent by entity B.
Actions: (see p. 107)
In order to select the applicable row of the table on p.
107, we first evaluate the decision function
ACK_status_test1.
ACK_status_test1();
if(seg.ack_flag = FALSE) then
return(NONE);
if(seg.ack_num <= send_una) or
(seg.ack_num > send_next) then
return(INVALID)
else
return(VALID);
... and so on.
The important thing to notice in the above scenario is the error
that occurs in transition 3, where the wrong value for recv_next
leads to the routine record_syn refusing to accept the data.
Problem 2: Problem with Retransmission of SYN Packet
The actions listed for Active Open (with or without data; see p.
103) are calls to the routines open and gen_syn. Neither of these
routines (or routines that they call) explicitly sets a
retransmission timer. Therefore if the initial SYN is lost there
is no timer expiration to trigger retransmission of the SYN. If
this happens, the TCP will fail in its attempt to establish the
desired connection with a remote TCP.
Note that this differs from the actions specified for transmission
of data from the ESTAB state. In that transition the routine
dispatch (p. 137) is called first which in turn calls the routine
send_new_data (p. 156). One of the actions of the last routine is to
start a retransmission timer for the newly sent data.
Sidhu & Blumer [Page 7]
RFC 964 November 1985
Some Problems with MIL-STD TCP
Problem 3: Problem with Receiving Data in TCP ESTAB State
When both entities are in the state ESTAB, and one sends data to
the other, an error in the actions of the receiver prohibits the
data from being accepted. The following simple scenario
illustrates the problem. Here the user associated with entity A
issues a Send request, and A sends data to entity B. When B
receives the data it replies with an acknowledgment.
TCP entity A TCP entity B
------------ ------------
state segment segment state
transition recvd or sent recvd or sent transition
by A by B
ESTAB -> ESTAB DATA -->
DATA --> ESTAB -> ESTAB
<-- ACK
Transition 1 (entity A goes from state ESTAB to ESTAB) Entity A
sends data packet to entity B.
Actions: (see p. 110)
dispatch; (see p. 137)
Transition 2 (entity B goes from state ESTAB to ESTAB) Entity B
receives data packet from entity A.
Actions: (see p. 111)
Assuming the data is in order and valid, we use row 6 of the
table.
update; (see p. 159)
************************************************************
An error occurs here, because the routine update does
nothing to accept the incoming data, or to arrange to
pass it on to the user.
************************************************************
Sidhu & Blumer [Page 8]
RFC 964 November 1985
Some Problems with MIL-STD TCP
5. Solutions to Problems
The problem with record_syn and accept_policy can be solved by having
record_syn call accept_policy before the variable recv_next is
updated.
The problem with gen_syn can be corrected by having gen_syn or open
explicitly request the retransmission timer.
The problem with the reception of data in the ESTAB state is
apparently caused by the transposition of the action tables on pages
111 and 112. These tables should be interchanged. This solution
will also correct a related problem, namely that an entity can never
reach the CLOSE_WAIT state from the ESTAB state.
Syntax errors in the action statements and tables could be easily
caught by an automatic syntax checker if the document used a more
formal description technique. This would be difficult to do for
[MILS83] since this document is not based on a formalized description
technique [BREM83].
The errors pointed out in this note have been submitted to DCA and
will be corrected in the next update of the MIL STD TCP
specification.
6. Implementation of MIL Standard TCP
In the discussion above, we pointed out several serious errors in the
specification of the Military Standard Transmission Control Protocol
[MILS83]. These errors imply that a TCP implementation that
faithfully conforms to the Military TCP standard will not be able to
Receive data sent with a SYN packet.
Establish a connection if the initial SYN packet is lost.
Receive data when in the ESTAB state.
It also follows from our discussion that an implementation of MIL
Standard TCP [MILS83] must include corrections mentioned above to get
a running TCP.
The problems pointed out in this paper with the current specification
of the MIL Standard TCP [MILS83] are based on an initial
investigation of this protocol standard by the authors.
Sidhu & Blumer [Page 9]
RFC 964 November 1985
Some Problems with MIL-STD TCP
REFERENCES
[BLUT83] Blumer, T. P., and Sidhu, D. P., "Mechanical Verification
and Automatic Implementation of Authentication Protocols
for Computer Networks", SDC Burroughs Report (1983),
submitted for publication.
[BLUT82] Blumer, T. P., and Tenney, R. L., "A Formal Specification
Technique and Implementation Method for Protocols",
Computer Networks, Vol. 6, No. 3, July 1982, pp. 201-217.
[BREM83] Breslin, M., Pollack, R. and Sidhu D. P., "Formalization of
DoD Protocol Specification Technique", SDC - Burroughs
Report 1983.
[CERV74] Cerf, V., and Kahn, R., "A Protocol for Packet Network
Interconnection", IEEE Trans. Comm., May 1974.
[MILS83] "Military Standard Transmission Control Protocol",
MIL-STD-1778, 12 August 1983.
[POSJ81] Postel, J. (ed.), "DoD Standard Transmission Control
Protocol", Defense Advanced Research Projects Agency,
Information Processing Techniques Office, RFC-793,
September 1981.
[SIDD83] Sidhu, D. P., and Blumer, T. P., "Verification of NBS Class
4 Transport Protocol", SDC Burroughs Report (1983),
submitted for publication.
[SUNC78] Sunshine, C., and Dalal, Y., "Connection Management in
Transport Protocols", Computer Networks, Vol. 2, pp.454-473
(1978).
Sidhu & Blumer [Page 10]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,893 @@
Network Working Group V. Jacobson
Request for Comments: 1072 LBL
R. Braden
ISI
October 1988
TCP Extensions for Long-Delay Paths
Status of This Memo
This memo proposes a set of extensions to the TCP protocol to provide
efficient operation over a path with a high bandwidth*delay product.
These extensions are not proposed as an Internet standard at this
time. Instead, they are intended as a basis for further
experimentation and research on transport protocol performance.
Distribution of this memo is unlimited.
1. INTRODUCTION
Recent work on TCP performance has shown that TCP can work well over
a variety of Internet paths, ranging from 800 Mbit/sec I/O channels
to 300 bit/sec dial-up modems [Jacobson88]. However, there is still
a fundamental TCP performance bottleneck for one transmission regime:
paths with high bandwidth and long round-trip delays. The
significant parameter is the product of bandwidth (bits per second)
and round-trip delay (RTT in seconds); this product is the number of
bits it takes to "fill the pipe", i.e., the amount of unacknowledged
data that TCP must handle in order to keep the pipeline full. TCP
performance problems arise when this product is large, e.g.,
significantly exceeds 10**5 bits. We will refer to an Internet path
operating in this region as a "long, fat pipe", and a network
containing this path as an "LFN" (pronounced "elephan(t)").
High-capacity packet satellite channels (e.g., DARPA's Wideband Net)
are LFN's. For example, a T1-speed satellite channel has a
bandwidth*delay product of 10**6 bits or more; this corresponds to
100 outstanding TCP segments of 1200 bytes each! Proposed future
terrestrial fiber-optical paths will also fall into the LFN class;
for example, a cross-country delay of 30 ms at a DS3 bandwidth
(45Mbps) also exceeds 10**6 bits.
Clever algorithms alone will not give us good TCP performance over
LFN's; it will be necessary to actually extend the protocol. This
RFC proposes a set of TCP extensions for this purpose.
There are three fundamental problems with the current TCP over LFN
Jacobson & Braden [Page 1]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
paths:
(1) Window Size Limitation
The TCP header uses a 16 bit field to report the receive window
size to the sender. Therefore, the largest window that can be
used is 2**16 = 65K bytes. (In practice, some TCP
implementations will "break" for windows exceeding 2**15,
because of their failure to do unsigned arithmetic).
To circumvent this problem, we propose a new TCP option to allow
windows larger than 2**16. This option will define an implicit
scale factor, to be used to multiply the window size value found
in a TCP header to obtain the true window size.
(2) Cumulative Acknowledgments
Any packet losses in an LFN can have a catastrophic effect on
throughput. This effect is exaggerated by the simple cumulative
acknowledgment of TCP. Whenever a segment is lost, the
transmitting TCP will (eventually) time out and retransmit the
missing segment. However, the sending TCP has no information
about segments that may have reached the receiver and been
queued because they were not at the left window edge, so it may
be forced to retransmit these segments unnecessarily.
We propose a TCP extension to implement selective
acknowledgements. By sending selective acknowledgments, the
receiver of data can inform the sender about all segments that
have arrived successfully, so the sender need retransmit only
the segments that have actually been lost.
Selective acknowledgments have been included in a number of
experimental Internet protocols -- VMTP [Cheriton88], NETBLT
[Clark87], and RDP [Velten84]. There is some empirical evidence
in favor of selective acknowledgments -- simple experiments with
RDP have shown that disabling the selective acknowledgment
facility greatly increases the number of retransmitted segments
over a lossy, high-delay Internet path [Partridge87]. A
simulation study of a simple form of selective acknowledgments
added to the ISO transport protocol TP4 also showed promise of
performance improvement [NBS85].
Jacobson & Braden [Page 2]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
(3) Round Trip Timing
TCP implements reliable data delivery by measuring the RTT,
i.e., the time interval between sending a segment and receiving
an acknowledgment for it, and retransmitting any segments that
are not acknowledged within some small multiple of the average
RTT. Experience has shown that accurate, current RTT estimates
are necessary to adapt to changing traffic conditions and,
without them, a busy network is subject to an instability known
as "congestion collapse" [Nagle84].
In part because TCP segments may be repacketized upon
retransmission, and in part because of complications due to the
cumulative TCP acknowledgement, measuring a segment's RTT may
involve a non-trivial amount of computation in some
implementations. To minimize this computation, some
implementations time only one segment per window. While this
yields an adequate approximation to the RTT for small windows
(e.g., a 4 to 8 segment Arpanet window), for an LFN (e.g., 100
segment Wideband Network windows) it results in an unacceptably
poor RTT estimate.
In the presence of errors, the problem becomes worse. Zhang
[Zhang86], Jain [Jain86] and Karn [Karn87] have shown that it is
not possible to accumulate reliable RTT estimates if
retransmitted segments are included in the estimate. Since a
full window of data will have been transmitted prior to a
retransmission, all of the segments in that window will have to
be ACKed before the next RTT sample can be taken. This means at
least an additional window's worth of time between RTT
measurements and, as the error rate approaches one per window of
data (e.g., 10**-6 errors per bit for the Wideband Net), it
becomes effectively impossible to obtain an RTT measurement.
We propose a TCP "echo" option that allows each segment to carry
its own timestamp. This will allow every segment, including
retransmissions, to be timed at negligible computational cost.
In designing new TCP options, we must pay careful attention to
interoperability with existing implementations. The only TCP option
defined to date is an "initial option", i.e., it may appear only on a
SYN segment. It is likely that most implementations will properly
ignore any options in the SYN segment that they do not understand, so
new initial options should not cause a problem. On the other hand,
we fear that receiving unexpected non-initial options may cause some
TCP's to crash.
Jacobson & Braden [Page 3]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
Therefore, in each of the extensions we propose, non-initial options
may be sent only if an exchange of initial options has indicated that
both sides understand the extension. This approach will also allow a
TCP to determine when the connection opens how big a TCP header it
will be sending.
2. TCP WINDOW SCALE OPTION
The obvious way to implement a window scale factor would be to define
a new TCP option that could be included in any segment specifying a
window. The receiver would include it in every acknowledgment
segment, and the sender would interpret it. Unfortunately, this
simple approach would not work. The sender must reliably know the
receiver's current scale factor, but a TCP option in an
acknowledgement segment will not be delivered reliably (unless the
ACK happens to be piggy-backed on data).
However, SYN segments are always sent reliably, suggesting that each
side may communicate its window scale factor in an initial TCP
option. This approach has a disadvantage: the scale must be
established when the connection is opened, and cannot be changed
thereafter. However, other alternatives would be much more
complicated, and we therefore propose a new initial option called
Window Scale.
2.1 Window Scale Option
This three-byte option may be sent in a SYN segment by a TCP (1)
to indicate that it is prepared to do both send and receive window
scaling, and (2) to communicate a scale factor to be applied to
its receive window. The scale factor is encoded logarithmically,
as a power of 2 (presumably to be implemented by binary shifts).
Note: the window in the SYN segment itself is never scaled.
TCP Window Scale Option:
Kind: 3
+---------+---------+---------+
| Kind=3 |Length=3 |shift.cnt|
+---------+---------+---------+
Here shift.cnt is the number of bits by which the receiver right-
shifts the true receive-window value, to scale it into a 16-bit
value to be sent in TCP header (this scaling is explained below).
The value shift.cnt may be zero (offering to scale, while applying
a scale factor of 1 to the receive window).
Jacobson & Braden [Page 4]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
This option is an offer, not a promise; both sides must send
Window Scale options in their SYN segments to enable window
scaling in either direction.
2.2 Using the Window Scale Option
A model implementation of window scaling is as follows, using the
notation of RFC-793 [Postel81]:
* The send-window (SND.WND) and receive-window (RCV.WND) sizes
in the connection state block and in all sequence space
calculations are expanded from 16 to 32 bits.
* Two window shift counts are added to the connection state:
snd.scale and rcv.scale. These are shift counts to be
applied to the incoming and outgoing windows, respectively.
The precise algorithm is shown below.
* All outgoing SYN segments are sent with the Window Scale
option, containing a value shift.cnt = R that the TCP would
like to use for its receive window.
* Snd.scale and rcv.scale are initialized to zero, and are
changed only during processing of a received SYN segment. If
the SYN segment contains a Window Scale option with shift.cnt
= S, set snd.scale to S and set rcv.scale to R; otherwise,
both snd.scale and rcv.scale are left at zero.
* The window field (SEG.WND) in the header of every incoming
segment, with the exception of SYN segments, will be left-
shifted by snd.scale bits before updating SND.WND:
SND.WND = SEG.WND << snd.scale
(assuming the other conditions of RFC793 are met, and using
the "C" notation "<<" for left-shift).
* The window field (SEG.WND) of every outgoing segment, with
the exception of SYN segments, will have been right-shifted
by rcv.scale bits:
SEG.WND = RCV.WND >> rcv.scale.
TCP determines if a data segment is "old" or "new" by testing if
its sequence number is within 2**31 bytes of the left edge of the
window. If not, the data is "old" and discarded. To insure that
new data is never mistakenly considered old and vice-versa, the
Jacobson & Braden [Page 5]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
left edge of the sender's window has to be at least 2**31 away
from the right edge of the receiver's window. Similarly with the
sender's right edge and receiver's left edge. Since the right and
left edges of either the sender's or receiver's window differ by
the window size, and since the sender and receiver windows can be
out of phase by at most the window size, the above constraints
imply that 2 * the max window size must be less than 2**31, or
max window < 2**30
Since the max window is 2**S (where S is the scaling shift count)
times at most 2**16 - 1 (the maximum unscaled window), the maximum
window is guaranteed to be < 2**30 if S <= 14. Thus, the shift
count must be limited to 14. (This allows windows of 2**30 = 1
Gbyte.) If a Window Scale option is received with a shift.cnt
value exceeding 14, the TCP should log the error but use 14
instead of the specified value.
3. TCP SELECTIVE ACKNOWLEDGMENT OPTIONS
To minimize the impact on the TCP protocol, the selective
acknowledgment extension uses the form of two new TCP options. The
first is an enabling option, "SACK-permitted", that may be sent in a
SYN segment to indicate that the SACK option may be used once the
connection is established. The other is the SACK option itself,
which may be sent over an established connection once permission has
been given by SACK-permitted.
The SACK option is to be included in a segment sent from a TCP that
is receiving data to the TCP that is sending that data; we will refer
to these TCP's as the data receiver and the data sender,
respectively. We will consider a particular simplex data flow; any
data flowing in the reverse direction over the same connection can be
treated independently.
3.1 SACK-Permitted Option
This two-byte option may be sent in a SYN by a TCP that has been
extended to receive (and presumably process) the SACK option once
the connection has opened.
Jacobson & Braden [Page 6]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
TCP Sack-Permitted Option:
Kind: 4
+---------+---------+
| Kind=4 | Length=2|
+---------+---------+
3.2 SACK Option
The SACK option is to be used to convey extended acknowledgment
information over an established connection. Specifically, it is
to be sent by a data receiver to inform the data transmitter of
non-contiguous blocks of data that have been received and queued.
The data receiver is awaiting the receipt of data in later
retransmissions to fill the gaps in sequence space between these
blocks. At that time, the data receiver will acknowledge the data
normally by advancing the left window edge in the Acknowledgment
Number field of the TCP header.
It is important to understand that the SACK option will not change
the meaning of the Acknowledgment Number field, whose value will
still specify the left window edge, i.e., one byte beyond the last
sequence number of fully-received data. The SACK option is
advisory; if it is ignored, TCP acknowledgments will continue to
function as specified in the protocol.
However, SACK will provide additional information that the data
transmitter can use to optimize retransmissions. The TCP data
receiver may include the SACK option in an acknowledgment segment
whenever it has data that is queued and unacknowledged. Of
course, the SACK option may be sent only when the TCP has received
the SACK-permitted option in the SYN segment for that connection.
TCP SACK Option:
Kind: 5
Length: Variable
+--------+--------+--------+--------+--------+--------+...---+
| Kind=5 | Length | Relative Origin | Block Size | |
+--------+--------+--------+--------+--------+--------+...---+
This option contains a list of the blocks of contiguous sequence
space occupied by data that has been received and queued within
Jacobson & Braden [Page 7]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
the window. Each block is contiguous and isolated; that is, the
octets just below the block,
Acknowledgment Number + Relative Origin -1,
and just above the block,
Acknowledgment Number + Relative Origin + Block Size,
have not been received.
Each contiguous block of data queued at the receiver is defined in
the SACK option by two 16-bit integers:
* Relative Origin
This is the first sequence number of this block, relative to
the Acknowledgment Number field in the TCP header (i.e.,
relative to the data receiver's left window edge).
* Block Size
This is the size in octets of this block of contiguous data.
A SACK option that specifies n blocks will have a length of 4*n+2
octets, so the 44 bytes available for TCP options can specify a
maximum of 10 blocks. Of course, if other TCP options are
introduced, they will compete for the 44 bytes, and the limit of
10 may be reduced in particular segments.
There is no requirement on the order in which blocks can appear in
a single SACK option.
Note: requiring that the blocks be ordered would allow a
slightly more efficient algorithm in the transmitter; however,
this does not seem to be an important optimization.
3.3 SACK with Window Scaling
If window scaling is in effect, then 16 bits may not be sufficient
for the SACK option fields that define the origin and length of a
block. There are two possible ways to handle this:
(1) Expand the SACK origin and length fields to 24 or 32 bits.
Jacobson & Braden [Page 8]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
(2) Scale the SACK fields by the same factor as the window.
The first alternative would significantly reduce the number of
blocks possible in a SACK option; therefore, we have chosen the
second alternative, scaling the SACK information as well as the
window.
Scaling the SACK information introduces some loss of precision,
since a SACK option must report queued data blocks whose origins
and lengths are multiples of the window scale factor rcv.scale.
These reported blocks must be equal to or smaller than the actual
blocks of queued data.
Specifically, suppose that the receiver has a contiguous block of
queued data that occupies sequence numbers L, L+1, ... L+N-1, and
that the window scale factor is S = rcv.scale. Then the
corresponding block that will be reported in a SACK option will
be:
Relative Origin = int((L+S-1)/S)
Block Size = int((L+N)/S) - (Relative Origin)
where the function int(x) returns the greatest integer contained
in x.
The resulting loss of precision is not a serious problem for the
sender. If the data-sending TCP keeps track of the boundaries of
all segments in its retransmission queue, it will generally be
able to infer from the imprecise SACK data which full segments
don't need to be retransmitted. This will fail only if S is
larger than the maximum segment size, in which case some segments
may be retransmitted unnecessarily. If the sending TCP does not
keep track of transmitted segment boundaries, the imprecision of
the scaled SACK quantities will only result in retransmitting a
small amount of unneeded sequence space. On the average, the data
sender will unnecessarily retransmit J*S bytes of the sequence
space for each SACK received; here J is the number of blocks
reported in the SACK, and S = snd.scale.
3.4 SACK Option Examples
Assume the left window edge is 5000 and that the data transmitter
sends a burst of 8 segments, each containing 500 data bytes.
Unless specified otherwise, we assume that the scale factor S = 1.
Jacobson & Braden [Page 9]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
Case 1: The first 4 segments are received but the last 4 are
dropped.
The data receiver will return a normal TCP ACK segment
acknowledging sequence number 7000, with no SACK option.
Case 2: The first segment is dropped but the remaining 7 are
received.
The data receiver will return a TCP ACK segment that
acknowledges sequence number 5000 and contains a SACK option
specifying one block of queued data:
Relative Origin = 500; Block Size = 3500
Case 3: The 2nd, 4th, 6th, and 8th (last) segments are
dropped.
The data receiver will return a TCP ACK segment that
acknowledges sequence number 5500 and contains a SACK option
specifying the 3 blocks:
Relative Origin = 500; Block Size = 500
Relative Origin = 1500; Block Size = 500
Relative Origin = 2500; Block Size = 500
Case 4: Same as Case 3, except Scale Factor S = 16.
The SACK option would specify the 3 scaled blocks:
Relative Origin = 32; Block Size = 30
Relative Origin = 94; Block Size = 31
Relative Origin = 157; Block Size = 30
These three reported blocks have sequence numbers 512 through
991, 1504 through 1999, and 2512 through 2991, respectively.
3.5 Generating the SACK Option
Let us assume that the data receiver maintains a queue of valid
segments that it has neither passed to the user nor acknowledged
because of earlier missing data, and that this queue is ordered by
starting sequence number. Computation of the SACK option can be
done with one pass down this queue. Segments that occupy
Jacobson & Braden [Page 10]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
contiguous sequence space are aggregated into a single SACK block,
and each gap in the sequence space (except a gap that is
terminated by the right window edge) triggers the start of a new
SACK block. If this algorithm defines more than 10 blocks, only
the first 10 can be included in the option.
3.6 Interpreting the SACK Option
The data transmitter is assumed to have a retransmission queue
that contains the segments that have been transmitted but not yet
acknowledged, in sequence-number order. If the data transmitter
performs re-packetization before retransmission, the block
boundaries in a SACK option that it receives may not fall on
boundaries of segments in the retransmission queue; however, this
does not pose a serious difficulty for the transmitter.
Let us suppose that for each segment in the retransmission queue
there is a (new) flag bit "ACK'd", to be used to indicate that
this particular segment has been entirely acknowledged. When a
segment is first transmitted, it will be entered into the
retransmission queue with its ACK'd bit off. If the ACK'd bit is
subsequently turned on (as the result of processing a received
SACK option), the data transmitter will skip this segment during
any later retransmission. However, the segment will not be
dequeued and its buffer freed until the left window edge is
advanced over it.
When an acknowledgment segment arrives containing a SACK option,
the data transmitter will turn on the ACK'd bits for segments that
have been selectively acknowledged. More specifically, for each
block in the SACK option, the data transmitter will turn on the
ACK'd flags for all segments in the retransmission queue that are
wholly contained within that block. This requires straightforward
sequence number comparisons.
4. TCP ECHO OPTIONS
A simple method for measuring the RTT of a segment would be: the
sender places a timestamp in the segment and the receiver returns
that timestamp in the corresponding ACK segment. When the ACK segment
arrives at the sender, the difference between the current time and
the timestamp is the RTT. To implement this timing method, the
receiver must simply reflect or echo selected data (the timestamp)
from the sender's segments. This idea is the basis of the "TCP Echo"
and "TCP Echo Reply" options.
Jacobson & Braden [Page 11]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
4.1 TCP Echo and TCP Echo Reply Options
TCP Echo Option:
Kind: 6
Length: 6
+--------+--------+--------+--------+--------+--------+
| Kind=6 | Length | 4 bytes of info to be echoed |
+--------+--------+--------+--------+--------+--------+
This option carries four bytes of information that the receiving TCP
may send back in a subsequent TCP Echo Reply option (see below). A
TCP may send the TCP Echo option in any segment, but only if a TCP
Echo option was received in a SYN segment for the connection.
When the TCP echo option is used for RTT measurement, it will be
included in data segments, and the four information bytes will define
the time at which the data segment was transmitted in any format
convenient to the sender.
TCP Echo Reply Option:
Kind: 7
Length: 6
+--------+--------+--------+--------+--------+--------+
| Kind=7 | Length | 4 bytes of echoed info |
+--------+--------+--------+--------+--------+--------+
A TCP that receives a TCP Echo option containing four information
bytes will return these same bytes in a TCP Echo Reply option.
This TCP Echo Reply option must be returned in the next segment
(e.g., an ACK segment) that is sent. If more than one Echo option is
received before a reply segment is sent, the TCP must choose only one
of the options to echo, ignoring the others; specifically, it must
choose the newest segment with the oldest sequence number (see next
section.)
To use the TCP Echo and Echo Reply options, a TCP must send a TCP
Echo option in its own SYN segment and receive a TCP Echo option in a
SYN segment from the other TCP. A TCP that does not implement the
TCP Echo or Echo Reply options must simply ignore any TCP Echo
options it receives. However, a TCP should not receive one of these
Jacobson & Braden [Page 12]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
options in a non-SYN segment unless it included a TCP Echo option in
its own SYN segment.
4.2 Using the Echo Options
If we wish to use the Echo/Echo Reply options for RTT measurement, we
have to define what the receiver does when there is not a one-to-one
correspondence between data and ACK segments. Assuming that we want
to minimize the state kept in the receiver (i.e., the number of
unprocessed Echo options), we can plan on a receiver remembering the
information value from at most one Echo between ACKs. There are
three situations to consider:
(A) Delayed ACKs.
Many TCP's acknowledge only every Kth segment out of a group of
segments arriving within a short time interval; this policy is
known generally as "delayed ACK's". The data-sender TCP must
measure the effective RTT, including the additional time due to
delayed ACK's, or else it will retransmit unnecessarily. Thus,
when delayed ACK's are in use, the receiver should reply with
the Echo option information from the earliest unacknowledged
segment.
(B) A hole in the sequence space (segment(s) have been lost).
The sender will continue sending until the window is filled, and
we may be generating ACKs as these out-of-order segments arrive
(e.g., for the SACK information or to aid "fast retransmit").
An Echo Reply option will tell the sender the RTT of some
recently sent segment (since the ACK can only contain the
sequence number of the hole, the sender may not be able to
determine which segment, but that doesn't matter). If the loss
was due to congestion, these RTTs may be particularly valuable
to the sender since they reflect the network characteristics
immediately after the congestion.
(C) A filled hole in the sequence space.
The segment that fills the hole represents the most recent
measurement of the network characteristics. On the other hand,
an RTT computed from an earlier segment would probably include
the sender's retransmit time-out, badly biasing the sender's
average RTT estimate.
Case (A) suggests the receiver should remember and return the Echo
option information from the oldest unacknowledged segment. Cases (B)
Jacobson & Braden [Page 13]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
and (C) suggest that the option should come from the most recent
unacknowledged segment. An algorithm that covers all three cases is
for the receiver to return the Echo option information from the
newest segment with the oldest sequence number, as specified earlier.
A model implementation of these options is as follows.
(1) Receiver Implementation
A 32-bit slot for Echo option data, rcv.echodata, is added to
the receiver connection state, together with a flag,
rcv.echopresent, that indicates whether there is anything in the
slot. When the receiver generates a segment, it checks
rcv.echopresent and, if it is set, adds an echo-reply option
containing rcv.echodata to the outgoing segment then clears
rcv.echopresent.
If an incoming segment is in the window and contains an echo
option, the receiver checks rcv.echopresent. If it isn't set,
the value of the echo option is copied to rcv.echodata and
rcv.echopresent is set. If rcv.echopresent is already set, the
receiver checks whether the segment is at the left edge of the
window. If so, the segment's echo option value is copied to
rcv.echodata (this is situation (C) above). Otherwise, the
segment's echo option is ignored.
(2) Sender Implementation
The sender's connection state has a single flag bit,
snd.echoallowed, added. If snd.echoallowed is set or if the
segment contains a SYN, the sender is free to add a TCP Echo
option (presumably containing the current time in some units
convenient to the sender) to every outgoing segment.
Snd.echoallowed should be set if a SYN is received with a TCP
Echo option (presumably, a host that implements the option will
attempt to use it to time the SYN segment).
5. CONCLUSIONS AND ACKNOWLEDGMENTS
We have proposed five new TCP options for scaled windows, selective
acknowledgments, and round-trip timing, in order to provide efficient
operation over large-bandwidth*delay-product paths. These extensions
are designed to provide compatible interworking with TCP's that do not
implement the extensions.
Jacobson & Braden [Page 14]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
The Window Scale option was originally suggested by Mike St. Johns of
USAF/DCA. The present form of the option was suggested by Mike Karels
of UC Berkeley in response to a more cumbersome scheme proposed by Van
Jacobson. Gerd Beling of FGAN (West Germany) contributed the initial
definition of the SACK option.
All three options have evolved through discussion with the End-to-End
Task Force, and the authors are grateful to the other members of the
Task Force for their advice and encouragement.
6. REFERENCES
[Cheriton88] Cheriton, D., "VMTP: Versatile Message Transaction
Protocol", RFC 1045, Stanford University, February 1988.
[Jain86] Jain, R., "Divergence of Timeout Algorithms for Packet
Retransmissions", Proc. Fifth Phoenix Conf. on Comp. and Comm.,
Scottsdale, Arizona, March 1986.
[Karn87] Karn, P. and C. Partridge, "Estimating Round-Trip Times
in Reliable Transport Protocols", Proc. SIGCOMM '87, Stowe, VT,
August 1987.
[Clark87] Clark, D., Lambert, M., and L. Zhang, "NETBLT: A Bulk
Data Transfer Protocol", RFC 998, MIT, March 1987.
[Nagle84] Nagle, J., "Congestion Control in IP/TCP
Internetworks", RFC 896, FACC, January 1984.
[NBS85] Colella, R., Aronoff, R., and K. Mills, "Performance
Improvements for ISO Transport", Ninth Data Comm Symposium,
published in ACM SIGCOMM Comp Comm Review, vol. 15, no. 5,
September 1985.
[Partridge87] Partridge, C., "Private Communication", February
1987.
[Postel81] Postel, J., "Transmission Control Protocol - DARPA
Internet Program Protocol Specification", RFC 793, DARPA,
September 1981.
[Velten84] Velten, D., Hinden, R., and J. Sax, "Reliable Data
Protocol", RFC 908, BBN, July 1984.
[Jacobson88] Jacobson, V., "Congestion Avoidance and Control", to
be presented at SIGCOMM '88, Stanford, CA., August 1988.
[Zhang86] Zhang, L., "Why TCP Timers Don't Work Well", Proc.
Jacobson & Braden [Page 15]
RFC 1072 TCP Extensions for Long-Delay Paths October 1988
SIGCOMM '86, Stowe, Vt., August 1986.
Jacobson & Braden [Page 16]

View File

@ -0,0 +1,731 @@
Network Working Group R. Fox
Request for Comments: 1106 Tandem
June 1989
TCP Big Window and Nak Options
Status of this Memo
This memo discusses two extensions to the TCP protocol to provide a
more efficient operation over a network with a high bandwidth*delay
product. The extensions described in this document have been
implemented and shown to work using resources at NASA. This memo
describes an Experimental Protocol, these extensions are not proposed
as an Internet standard, but as a starting point for further
research. Distribution of this memo is unlimited.
Abstract
Two extensions to the TCP protocol are described in this RFC in order
to provide a more efficient operation over a network with a high
bandwidth*delay product. The main issue that still needs to be
solved is congestion versus noise. This issue is touched on in this
memo, but further research is still needed on the applicability of
the extensions in the Internet as a whole infrastructure and not just
high bandwidth*delay product networks. Even with this outstanding
issue, this document does describe the use of these options in the
isolated satellite network environment to help facilitate more
efficient use of this special medium to help off load bulk data
transfers from links needed for interactive use.
1. Introduction
Recent work on TCP has shown great performance gains over a variety
of network paths [1]. However, these changes still do not work well
over network paths that have a large round trip delay (satellite with
a 600 ms round trip delay) or a very large bandwidth
(transcontinental DS3 line). These two networks exhibit a higher
bandwidth*delay product, over 10**6 bits, than the 10**5 bits that
TCP is currently limited to. This high bandwidth*delay product
refers to the amount of data that may be unacknowledged so that all
of the network's bandwidth is being utilized by TCP. This may also be
referred to as "filling the pipe" [2] so that the sender of data can
always put data onto the network and the receiver will always have
something to read, and neither end of the connection will be forced
to wait for the other end.
After the last batch of algorithm improvements to TCP, performance
Fox [Page 1]
RFC 1106 TCP Big Window and Nak Options June 1989
over high bandwidth*delay networks is still very poor. It appears
that no algorithm changes alone will make any significant
improvements over high bandwidth*delay networks, but will require an
extension to the protocol itself. This RFC discusses two possible
options to TCP for this purpose.
The two options implemented and discussed in this RFC are:
1. NAKs
This extension allows the receiver of data to inform the sender
that a packet of data was not received and needs to be resent.
This option proves to be useful over any network path (both high
and low bandwidth*delay type networks) that experiences periodic
errors such as lost packets, noisy links, or dropped packets due
to congestion. The information conveyed by this option is
advisory and, if ignored, does not have any effect on TCP
whatsoever.
2. Big Windows
This option will give a method of expanding the current 16 bit (64
Kbytes) TCP window to 32 bits of which 30 bits (over 1 gigabyte)
are allowed for the receive window. (This is the maximum window size
allowed in TCP due to the requirement of TCP to detect old data
versus new data. For a good explanation please see [2].) No
changes are required to the standard TCP header [6]. The 16 bit
field in the TCP header that is used to convey the receive window
will remain unchanged. The 32 bit receive window is achieved
through the use of an option that contains the upper half of the
window. It is this option that is necessary to fill large data
pipes such as a satellite link.
This RFC is broken up into the following sections: section 2 will
discuss the operation of the NAK option in greater detail, section 3
will discuss the big window option in greater detail. Section 4 will
discuss other effects of the big windows and nak feature when used
together. Included in this section will be a brief discussion on the
effects of congestion versus noise to TCP and possible options for
satellite networks. Section 5 will be a conclusion with some hints
as to what future development may be done at NASA, and then an
appendix containing some test results is included.
2. NAK Option
Any packet loss in a high bandwidth*delay network will have a
catastrophic effect on throughput because of the simple
acknowledgement of TCP. TCP always acks the stream of data that has
Fox [Page 2]
RFC 1106 TCP Big Window and Nak Options June 1989
successfully been received and tells the sender the next byte of data
of the stream that is expected. If a packet is lost and succeeding
packets arrive the current protocol has no way of telling the sender
that it missed one packet but received following packets. TCP
currently resends all of the data over again, after a timeout or the
sender suspects a lost packet due to a duplicate ack algorithm [1],
until the receiver receives the lost packet and can then ack the lost
packet as well as succeeding packets received. On a normal low
bandwidth*delay network this effect is minimal if the timeout period
is set short enough. However, on a long delay network such as a T1
satellite channel this is catastrophic because by the time the lost
packet can be sent and the ack returned the TCP window would have
been exhausted and both the sender and receiver would be temporarily
stalled waiting for the packet and ack to fully travel the data pipe.
This causes the pipe to become empty and requires the sender to
refill the pipe after the ack is received. This will cause a minimum
of 3*X bandwidth loss, where X is the one way delay of the medium and
may be much higher depending on the size of the timeout period and
bandwidth*delay product. It is 1X for the packet to be resent, 1X for
the ack to be received and 1X for the next packet being sent to reach
the destination. This calculation assumes that the window size is
much smaller than the pipe size (window = 1/2 data pipe or 1X), which
is the typical case with the current TCP window limitation over long
delay networks such as a T1 satellite link.
An attempt to reduce this wasted bandwidth from 3*X was introduced in
[1] by having the sender resend a packet after it notices that a
number of consecutively received acks completely acknowledges already
acknowledged data. On a typical network this will reduce the lost
bandwidth to almost nil, since the packet will be resent before the
TCP window is exhausted and with the data pipe being much smaller
than the TCP window, the data pipe will not become empty and no
bandwidth will be lost. On a high delay network the reduction of
lost bandwidth is minimal such that lost bandwidth is still
significant. On a very noisy satellite, for instance, the lost
bandwidth is very high (see appendix for some performance figures)
and performance is very poor.
There are two methods of informing the sender of lost data:
selective acknowledgements and NAKs. Selective acknowledgements have
been the object of research in a number of experimental protocols
including VMTP [3], NETBLT [4], and SatFTP [5]. The idea behind
selective acks is that the receiver tells the sender which pieces it
received so that the sender can resend the data not acked but already
sent once. NAKs on the other hand, tell the sender that a particular
packet of data needs to be resent.
There are a couple of disadvantages of selective acks. Namely, in
Fox [Page 3]
RFC 1106 TCP Big Window and Nak Options June 1989
some of the protocols mentioned above, the receiver waits a certain
time before sending the selective ack so that acks may be bundled up.
This delay can cause some wasted bandwidth and requires more complex
state information than the simple nak. Even if the receiver doesn't
bundle up the selective acks but sends them as it notices that
packets have been lost, more complex state information is needed to
determine which packets have been acked and which packets need to be
resent. With naks, only the immediate data needed to move the left
edge of the window is naked, thus almost completely eliminating all
state information.
The selective ack has one advantage over naks. If the link is very
noisy and packets are being lost close together, then the sender will
find out about all of the missing data at once and can send all of
the missing data out immediately in an attempt to move the left
window edge in the acknowledge number of the TCP header, thus keeping
the data pipe flowing. Whereas with naks, the sender will be
notified of lost packets one at a time and this will cause the sender
to process extra packets compared to selective acks. However,
empirical studies have shown that most lost packets occur far enough
apart that the advantage of selective acks over naks is rarely seen.
Also, if naks are sent out as soon as a packet has been determined
lost, then the advantage of selective acks becomes no more than
possibly a more aesthetic algorithm for handling lost data, but
offers no gains over naks as described in this paper. It is for this
reason that the simplicity of naks was chosen over selective acks for
the current implementation.
2.1 Implementation details
When the receiver of data notices a gap between the expected sequence
number and the actual sequence number of the packet received, the
receiver can assume that the data between the two sequence numbers is
either going to arrive late or is lost forever. Since the receiver
can not distinguish between the two events a nak should be sent in
the TCP option field. Naking a packet still destined to arrive has
the effect of causing the sender to resend the packet, wasting one
packet's worth of bandwidth. Since this event is fairly rare, the
lost bandwidth is insignificant as compared to that of not sending a
nak when the packet is not going to arrive. The option will take the
form as follows:
+========+=========+=========================+================+
+option= + length= + sequence number of + number of +
+ A + 7 + first byte being naked + segments naked +
+========+=========+=========================+================+
This option contains the first sequence number not received and a
Fox [Page 4]
RFC 1106 TCP Big Window and Nak Options June 1989
count of how many segments of bytes needed to be resent, where
segments is the size of the current TCP MSS being used for the
connection. Since a nak is an advisory piece of information, the
sending of a nak is unreliable and no means for retransmitting a nak
is provided at this time.
When the sender of data receives the option it may either choose to
do nothing or it will resend the missing data immediately and then
continue sending data where it left off before receiving the nak.
The receiver will keep track of the last nak sent so that it will not
repeat the same nak. If it were to repeat the same nak the protocol
could get into the mode where on every reception of data the receiver
would nak the first missing data frame. Since the data pipe may be
very large by the time the first nak is read and responded to by the
sender, many naks would have been sent by the receiver. Since the
sender does not know that the naks are repetitious it will resend the
data each time, thus wasting the network bandwidth with useless
retransmissions of the same piece of data. Having an unreliable nak
may result in a nak being damaged and not being received by the
sender, and in this case, we will let the tcp recover by its normal
means. Empirical data has shown that the likelihood of the nak being
lost is quite small and thus, this advisory nak option works quite
well.
3. Big Window Option
Currently TCP has a 16 bit window limitation built into the protocol.
This limits the amount of outstanding unacknowledged data to 64
Kbytes. We have already seen that some networks have a pipe larger
than 64 Kbytes. A T1 satellite channel and a cross country DS3
network with a 30ms delay have data pipes much larger than 64 Kbytes.
Thus, even on a perfectly conditioned link with no bandwidth wasted
due to errors, the data pipe will not be filled and bandwidth will be
wasted. What is needed is the ability to send more unacknowledged
data. This is achieved by having bigger windows, bigger than the
current limitation of 16 bits. This option expands the window
size to 30 bits or over 1 gigabyte by literally expanding the window
size mechanism currently used by TCP. The added option contains the
upper 15 bits of the window while the lower 16 bits will continue to
go where they normally go [6] in the TCP header.
A TCP session will use the big window options only if both sides
agree to use them, otherwise the option is not used and the normal 16
bit windows will be used. Once the 2 sides agree to use the big
windows then every packet thereafter will be expected to contain the
window option with the current upper 15 bits of the window. The
negotiation to decide whether or not to use the bigger windows takes
place during the SYN and SYN ACK segments of the TCP connection
Fox [Page 5]
RFC 1106 TCP Big Window and Nak Options June 1989
startup process. The originator of the connection will include in
the SYN segment the following option:
1 byte 1 byte 4 bytes
+=========+==========+===============+
+option=B + length=6 + 30 bit window +
+=========+==========+===============+
If the other end of the connection wants to use big windows it will
include the same option back in the SYN ACK segment that it must
send. At this point, both sides have agreed to use big windows and
the specified windows will be used. It should be noted that the SYN
and SYN ACK segments will use the small windows, and once the big
window option has been negotiated then the bigger windows will be
used.
Once both sides have agreed to use 32 bit windows the protocol will
function just as it did before with no difference in operation, even
in the event of lost packets. This claim holds true since the
rcv_wnd and snd_wnd variables of tcp contain the 16 bit windows until
the big window option is negotiated and then they are replaced with
the appropriate 32 bit values. Thus, the use of big windows becomes
part of the state information kept by TCP.
Other methods of expanding the windows have been presented, including
a window multiple [2] or streaming [5], but this solution is more
elegant in the sense that it is a true extension of the window that
one day may easily become part of the protocol and not just be an
option to the protocol.
3.1 How does it work
Once a connection has decided to use big windows every succeeding
packet must contain the following option:
+=========+==========+==========================+
+option=C + length=4 + upper 15 bits of rcv_wnd +
+=========+==========+==========================+
With all segments sent, the sender supplies the size of its receive
window. If the connection is only using 16 bits then this option is
not supplied, otherwise the lower 16 bits of the receive window go
into the tcp header where it currently resides [6] and the upper 15
bits of the window is put into the data portion of the option C.
When the receiver processes the packet it must first reform the
window and then process the packet as it would in the absence of the
option.
Fox [Page 6]
RFC 1106 TCP Big Window and Nak Options June 1989
3.2 Impact of changes
In implementing the first version of the big window option there was
very little change required to the source. State information must be
added to the protocol to determine if the big window option is to be
used and all 16 bit variables that dealt with window information must
now become 32 bit quantities. A future document will describe in
more detail the changes required to the 4.3 bsd tcp source code.
Test results of the window change only are presented in the appendix.
When expanding 16 bit quantities to 32 bit quantities in the TCP
control block in the source (4.3 bsd source) may cause the structure
to become larger than the mbuf used to hold the structure. Care must
be taken to insure this doesn't occur with your system or
undetermined events may take place.
4. Effects of Big Windows and Naks when used together
With big windows alone, transfer times over a satellite were quite
impressive with the absence of any introduced errors. However, when
an error simulator was used to create random errors during transfers,
performance went down extremely fast. When the nak option was added
to the big window option performance in the face of errors went up
some but not to the level that was expected. This section will
discuss some issues that were overcome to produce the results given
in the appendix.
4.1 Window Size and Nak benefits
Without errors, the window size required to keep the data pipe full
is equal to the round trip delay * throughput desired, or the data
pipe bandwidth (called Z from now on). This and other calculations
assume that processing time of the hosts is negligible. In the event
of an error (without NAKs), the window size needs to become larger
than Z in order to keep the data pipe full while the sender is
waiting for the ack of the resent packet. If the window size is
equal to Z and we assume that the retransmission timer is equal
to Z, then when a packet is lost, the retransmission timer will go
off as the last piece of data in the window is sent. In this case,
the lost piece of data can be resent with no delay. The data pipe
will empty out because it will take 1/2Z worth of data to get the ack
back to the sender, an additional 1/2Z worth of data to get the data
pipe refilled with new data. This causes the required window to be
2Z, 1Z to keep the data pipe full during normal operations and 1Z to
keep the data pipe full while waiting for a lost packet to be resent
and acked.
If the same scenario in the last paragraph is used with the addition
of NAKs, the required window size still needs to be 2Z to avoid
Fox [Page 7]
RFC 1106 TCP Big Window and Nak Options June 1989
wasting any bandwidth in the event of a dropped packet. This appears
to mean that the nak option does not provide any benefits at all.
Testing showed that the retransmission timer was larger than the data
pipe and in the event of errors became much bigger than the data
pipe, because of the retransmission backoff. Thus, the nak option
bounds the required window to 2Z such that in the event of an error
there is no lost bandwidth, even with the retransmission timer
fluctuations. The results in the appendix show that by using naks,
bandwidth waste associated with the retransmission timer facility is
eliminated.
4.2 Congestions vs Noise
An issue that must be looked at when implementing both the NAKs and
big window scheme together is in the area of congestion versus lost
packets due to the medium, or noise. In the recent algorithm
enhancements [1], slow start was introduced so that whenever a data
transfer is being started on a connection or right after a dropped
packet, the effective send window would be set to a very small size
(typically would equal the MSS being used). This is done so that a
new connection would not cause congestion by immediately overloading
the network, and so that an existing connection would back off the
network if a packet was dropped due to congestion and allow the
network to clear up. If a connection using big windows loses a
packet due to the medium (a packet corrupted by an error) the last
thing that should be done is to close the send window so that the
connection can only send 1 packet and must use the slow start
algorithm to slowly work itself back up to sending full windows worth
of data. This algorithm would quickly limit the usefulness of the
big window and nak options over lossy links.
On the other hand, if a packet was dropped due to congestion and the
sender assumes the packet was dropped because of noise the sender
will continue sending large amounts of data. This action will cause
the congestion to continue, more packets will be dropped, and that
part of the network will collapse. In this instance, the sender
would want to back off from sending at the current window limit.
Using the current slow start mechanism over a satellite builds up the
window too slowly [1]. Possibly a better solution would be for the
window to be opened 2*Rlog2(W) instead of R*log2(W) [1] (open window
by 2 packets instead of 1 for each acked packet). This will reduce
the wasted bandwidth by opening the window much quicker while giving
the network a chance to clear up. More experimentation is necessary
to find the optimal rate of opening the window, especially when large
windows are being used.
The current recommendation for TCP is to use the slow start mechanism
in the event of any lost packet. If an application knows that it
Fox [Page 8]
RFC 1106 TCP Big Window and Nak Options June 1989
will be using a satellite with a high error rate, it doesn't make
sense to force it to use the slow start mechanism for every dropped
packet. Instead, the application should be able to choose what
action should happen in the event of a lost packet. In the BSD
environment, a setsockopt call should be provided so that the
application may inform TCP to handle lost packets in a special way
for this particular connection. If the known error rate of a link is
known to be small, then by using slow start with modified rate from
above, will cause the amount of bandwidth loss to be very small in
respect to the amount of bandwidth actually utilized. In this case,
the setsockopt call should not be used. What is really needed is a
way for a host to determine if a packet or packets are being dropped
due to congestion or noise. Then, the host can choose to do the
right thing. This will require a mechanism like source quench to be
used. For this to happen more experimentation is necessary to
determine a solid definition on the use of this mechanism. Now it is
believed by some that using source quench to avoid congestion only
adds to the problem, not help suppress it.
The TCP used to gather the results in the appendix for the big window
with nak experiment, assumed that lost packets were the result of
noise and not congestion. This assumption was used to show how to
make the current TCP work in such an environment. The actual
satellite used in the experiment (when the satellite simulator was
not used) only experienced an error rate around 10e-10. With this
error rate it is suggested that in practice when big windows are used
over the link, TCP should use the slow start mechanism for all lost
packets with the 2*Rlog2(W) rate discussed above. Under most
situations when long delay networks are being used (transcontinental
DS3 networks using fiber with very low error rates, or satellite
links with low error rates) big windows and naks should be used with
the assumption that lost packets are the result of congestion until a
better algorithm is devised [7].
Another problem noticed, while testing the effects of slow start over
a satellite link, was at times, the retransmission timer was set so
restrictive, that milliseconds before a naked packet's ack is
received the retransmission timer would go off due to a timed packet
within the send window. The timer was set at the round trip delay of
the network allowing no time for packet processing. If this timer
went off due to congestion then backing off is the right thing to do,
otherwise to avoid the scenario discovered by experimentation, the
transmit timer should be set a little longer so that the
retransmission timer does not go off too early. Care must be taken
to make sure the right thing is done in the implementation in
question so that a packet isn't retransmitted too soon, and blamed on
congestion when in fact, the ack is on its way.
Fox [Page 9]
RFC 1106 TCP Big Window and Nak Options June 1989
4.3 Duplicate Acks
Another problem found with the 4.3bsd implementation is in the area
of duplicate acks. When the sender of data receives a certain number
of acks (3 in the current Berkeley release) that acknowledge
previously acked data before, it then assumes that a packet has been
lost and will resend the one packet assumed lost, and close its send
window as if the network is congested and the slow start algorithm
mentioned above will be used to open the send window. This facility is
no longer needed since the sender can use the reception of a nak as
its indicator that a particular packet was dropped. If the nak
packet is lost then the retransmit timer will go off and the packet
will be retransmitted by normal means. If a sender's algorithm
continues to count duplicate acks the sender will find itself
possibly receiving many duplicate acks after it has already resent
the packet due to a nak being received because of the large size of
the data pipe. By receiving all of these duplicate acks the sender
may find itself doing nothing but resending the same packet of data
unnecessarily while keeping the send window closed for absolutely no
reason. By removing this feature of the implementation a user can
expect to find a satellite connection working much better in the face
of errors and other connections should not see any performance loss,
but a slight improvement in performance if anything at all.
5. Conclusion
This paper has described two new options that if used will make TCP a
more efficient protocol in the face of errors and a more efficient
protocol over networks that have a high bandwidth*delay product
without decreasing performance over more common networks. If a
system that implements the options talks with one that does not, the
two systems should still be able to communicate with no problems.
This assumes that the system doesn't use the option numbers defined
in this paper in some other way or doesn't panic when faced with an
option that the machine does not implement. Currently at NASA, there
are many machines that do not implement either option and communicate
just fine with the systems that do implement them.
The drive for implementing big windows has been the direct result of
trying to make TCP more efficient over large delay networks [2,3,4,5]
such as a T1 satellite. However, another practical use of large
windows is becoming more apparent as the local area networks being
developed are becoming faster and supporting much larger MTU's.
Hyperchannel, for instance, has been stated to be able to support 1
Mega bit MTU's in their new line of products. With the current
implementation of TCP, efficient use of hyperchannel is not utilized
as it should because the physical medium's MTU is larger than the
maximum window of the protocol being used. By increasing the TCP
Fox [Page 10]
RFC 1106 TCP Big Window and Nak Options June 1989
window size, better utilization of networks like hyperchannel will be
gained instantly because the sender can send 64 Kbyte packets (IP
limitation) but not have to operate in a stop and wait fashion.
Future work is being started to increase the IP maximum datagram size
so that even better utilization of fast local area networks will be
seen by having the TCP/IP protocols being able to send large packets
over mediums with very large MTUs. This will hopefully, eliminate
the network protocol as the bottleneck in data transfers while
workstations and workstation file system technology advances even
more so, than it already has.
An area of concern when using the big window mechanism is the use of
machine resources. When running over a satellite and a packet is
dropped such that 2Z (where Z is the round trip delay) worth of data
is unacknowledged, both ends of the connection need to be able to
buffer the data using machine mbufs (or whatever mechanism the
machine uses), usually a valuable and scarce commodity. If the
window size is not chosen properly, some machines will crash when the
memory is all used up, or it will keep other parts of the system from
running. Thus, setting the window to some fairly large arbitrary
number is not a good idea, especially on a general purpose machine
where many users log on at any time. What is currently being
engineered at NASA is the ability for certain programs to use the
setsockopt feature of 4.3bsd asking to use big windows such that the
average user may not have access to the large windows, thus limiting
the use of big windows to applications that absolutely need them and
to protect a valuable system resource.
6. References
[1] Jacobson, V., "Congestion Avoidance and Control", SIGCOMM 88,
Stanford, Ca., August 1988.
[2] Jacobson, V., and R. Braden, "TCP Extensions for Long-Delay
Paths", LBL, USC/Information Sciences Institute, RFC 1072,
October 1988.
[3] Cheriton, D., "VMTP: Versatile Message Transaction Protocol", RFC
1045, Stanford University, February 1988.
[4] Clark, D., M. Lambert, and L. Zhang, "NETBLT: A Bulk Data
Transfer Protocol", RFC 998, MIT, March 1987.
[5] Fox, R., "Draft of Proposed Solution for High Delay Circuit File
Transfer", GE/NAS Internal Document, March 1988.
[6] Postel, J., "Transmission Control Protocol - DARPA Internet
Program Protocol Specification", RFC 793, DARPA, September 1981.
Fox [Page 11]
RFC 1106 TCP Big Window and Nak Options June 1989
[7] Leiner, B., "Critical Issues in High Bandwidth Networking", RFC
1077, DARPA, November 1988.
7. Appendix
Both options have been implemented and tested. Contained in this
section is some performance gathered to support the use of these two
options. The satellite channel used was a 1.544 Mbit link with a
580ms round trip delay. All values are given as units of bytes.
TCP with Big Windows, No Naks:
|---------------transfer rates----------------------|
Window Size | no error | 10e-7 error rate | 10e-6 error rate |
-----------------------------------------------------------------
64K | 94K | 53K | 14K |
-----------------------------------------------------------------
72K | 106K | 51K | 15K |
-----------------------------------------------------------------
80K | 115K | 42K | 14K |
-----------------------------------------------------------------
92K | 115K | 43K | 14K |
-----------------------------------------------------------------
100K | 135K | 66K | 15K |
-----------------------------------------------------------------
112K | 126K | 53K | 17K |
-----------------------------------------------------------------
124K | 154K | 45K | 14K |
-----------------------------------------------------------------
136K | 160K | 66K | 15K |
-----------------------------------------------------------------
156K | 167K | 45K | 14K |
-----------------------------------------------------------------
Figure 1.
Fox [Page 12]
RFC 1106 TCP Big Window and Nak Options June 1989
TCP with Big Windows, and Naks:
|---------------transfer rates----------------------|
Window Size | no error | 10e-7 error rate | 10e-6 error rate |
-----------------------------------------------------------------
64K | 95K | 83K | 43K |
-----------------------------------------------------------------
72K | 104K | 87K | 49K |
-----------------------------------------------------------------
80K | 117K | 96K | 62K |
-----------------------------------------------------------------
92K | 124K | 119K | 39K |
-----------------------------------------------------------------
100K | 140K | 124K | 35K |
-----------------------------------------------------------------
112K | 151K | 126K | 53K |
-----------------------------------------------------------------
124K | 160K | 140K | 36K |
-----------------------------------------------------------------
136K | 167K | 148K | 38K |
-----------------------------------------------------------------
156K | 167K | 160K | 38K |
-----------------------------------------------------------------
Figure 2.
With a 10e-6 error rate, many naks as well as data packets were
dropped, causing the wild swing in transfer times. Also, please note
that the machines used are SGI Iris 2500 Turbos with the 3.6 OS with
the new TCP enhancements. The performance associated with the Irises
are slower than a Sun 3/260, but due to some source code restrictions
the Iris was used. Initial results on the Sun showed slightly higher
performance and less variance.
Author's Address
Richard Fox
950 Linden #208
Sunnyvale, Cal, 94086
EMail: rfox@tandem.com
Fox [Page 13]

View File

@ -0,0 +1,171 @@
Network Working Group A. McKenzie
Request for Comments: 1110 BBN STC
August 1989
A Problem with the TCP Big Window Option
Status of this Memo
This memo comments on the TCP Big Window option described in RFC
1106. Distribution of this memo is unlimited.
Abstract
The TCP Big Window option discussed in RFC 1106 will not work
properly in an Internet environment which has both a high bandwidth *
delay product and the possibility of disordering and duplicating
packets. In such networks, the window size must not be increased
without a similar increase in the sequence number space. Therefore,
a different approach to big windows should be taken in the Internet.
Discussion
TCP was designed to work in a packet store-and-forward environment
characterized by the possibility of packet loss, packet disordering,
and packet duplication. Packet loss can occur, for example, by a
congested network element discarding a packet. Packet disordering
can occur, for example, by packets of a TCP connection being
arbitrarily transmitted partially over a low bandwidth terrestrial
path and partially over a high bandwidth satellite path. Packet
duplication can occur, for example, when two directly-connected
network elements use a reliable link protocol and the link goes down
after the receiver correctly receives a packet but before the
transmitter receives an acknowledgement for the packet; the
transmitter and receiver now each take responsibility for attempting
to deliver the same packet to its ultimate destination.
TCP has the task of recreating at the destination an exact copy of
the data stream generated at the source, in the same order and with
no gaps or duplicates. The mechanism used to accomplish this task is
to assign a "unique" sequence number to each byte of data at its
source, and to sort the bytes at the destination according to the
sequence number. The sorting operation corrects any disordering. An
acknowledgement, timeout, and retransmission scheme corrects for data
loss. The uniqueness of the sequence number corrects for data
duplication.
As a practical matter, however, the sequence number is not unique; it
McKenzie [Page 1]
RFC 1110 Comments on TCP Big Window Option August 1989
is contained in a 32-bit field and therefore "wraps around" after the
transmission of 2**32 bytes of data. Two additional mechanisms are
used to insure the effective uniqueness of sequence numbers; these
are the TCP transmission window and bounds on packet lifetime within
the Internet, including the IP Time-to-Live (TTL). The transmission
window specifies the maximum number of bytes which may be sent by the
source in one source-destination roundtrip time. Since the TCP
transmission window is specified by 16 bits, which is 1/65536 of the
sequence number space, a sequence number will not be reused (used to
number another byte) for 65,536 roundtrip times. So long as the
combination of gateway action on the IP TTL and holding times within
the individual networks which interconnect the gateways do not allow
a packet's lifetime to exceed 65,536 roundtrip times, each sequence
number is effectively unique. It was believed by the TCP designers
that the networks and gateways forming the internet would meet this
constraint, and such has been the case.
The proposed TCP Big Window option, as described in RFC 1106, expands
the size of the window specification to 30 bits, while leaving the
sequence number space unchanged. Thus, a sequence number can be
reused after 4 roundtrip times. Further, the Nak option allows a
packet to be retransmitted (i.e., potentially duplicated) by the
source after only one roundtrip time. Thus, if a packet becomes
"lost" in the Internet for only about 5 roundtrip times it may be
delivered when its sequence number again lies within the window,
albeit a later cycle of the window. In this case, TCP will not
necessarily recreate at the destination an exact copy of the data
stream generated at the source; it may replace some data with earlier
data.
Of course, the problem described above results from the storage of
the "lost" packet within the net, and its subsequent out-of-order
delivery. RFC 1106 seems to describe use of the proposed options in
an isolated satellite network. We may hypothesize that this network
is memoryless, and thus cannot deliver packets out of order; it
either delivers a packet in order or loses it. If this is the case,
then there is no problem with the proposed options. The Internet,
however, can deliver packets out of order, and this will likely
continue to be true even if gigabit links become part of the
Internet. Therefore, the approach described in RFC 1106 cannot be
adopted for general Internet use.
McKenzie [Page 2]
RFC 1110 Comments on TCP Big Window Option August 1989
Author's Address
Alex McKenzie
Bolt Beranek and Newman Inc.
10 Moulton Street
Cambridge, MA 02238
Phone: (617) 873-2962
EMail: MCKENZIE@BBN.COM
McKenzie [Page 3]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,283 @@
Network Working Group J. Zweig
Request for Comments: 1146 UIUC
Obsoletes: RFC 1145 C. Partridge
BBN
March 1990
TCP Alternate Checksum Options
Status of This Memo
This memo suggests a pair of TCP options to allow use of alternate
data checksum algorithms in the TCP header. The use of these options
is experimental, and not recommended for production use.
Note: This RFC corrects errors introduced in the editing process in
RFC 1145.
Distribution of this memo is unlimited.
Introduction
Some members of the networking community have expressed interest in
using checksum-algorithms with different error detection and
correction properties than the standard TCP checksum. The option
described in this memo provides a mechanism to negotiate the use of
an alternate checksum at connection-establishment time, as well as a
mechanism to carry additional checksum information for algorithms
that utilize checksums that are longer than 16 bits.
Definition of the Options
The TCP Alternate Checksum Request Option may be sent in a SYN
segment by a TCP to indicate that the TCP is prepared to both
generate and receive checksums based on an alternate algorithm.
During communication, the alternate checksum replaces the regular TCP
checksum in the checksum field of the TCP header. Should the
alternate checksum require more than 2 octets to transmit, the
checksum may either be moved into a TCP Alternate Checksum Data
Option and the checksum field of the TCP header be sent as 0, or the
data may be split between the header field and the option. Alternate
checksums are computed over the same data as the regular TCP checksum
(see TCP Alternate Checksum Data Option discussion below).
TCP Alternate Checksum Request Option
The format of the TCP Alternate Checksum Request Option is:
Zweig & Partridge [Page 1]
RFC 1146 TCP Alternate Checksum Options March 1990
+----------+----------+----------+
| Kind=14 | Length=3 | chksum |
+----------+----------+----------+
Here chksum is a number identifying the type of checksum to be used.
The currently defined values of chksum are:
0 -- TCP checksum
1 -- 8-bit Fletcher's algorithm (see Appendix I)
2 -- 16-bit Fletcher's algorithm (see Appendix II)
Note that the 8-bit Fletcher algorithm gives a 16-bit checksum and
the 16-bit algorithm gives a 32-bit checksum.
Alternate checksum negotiation proceeds as follows:
A SYN segment used to originate a connection may contain the
Alternate Checksum Request Option, which specifies an alternate
checksum-calculation algorithm to be used for the connection. The
acknowledging SYN-ACK segment may also carry the option.
If both SYN segments carry the Alternate Checksum Request option,
and both specify the same algorithm, that algorithm must be used
for the remainder of the connection. Otherwise, the standard TCP
checksum algorithm must be used for the entire connection. Thus,
for example, if one TCP specifies type 1 checksums, and the other
specifies type 2 checksums, then they will use type 0 (the regular
TCP checksum). Note that in practice, one TCP will typically be
responding to the other's SYN, and thus either accepting or
rejecting the proposed alternate checksum algorithm.
Any segment with the SYN bit set must always use the standard TCP
checksum algorithm. Thus the SYN segment will always be
understood by the receiving TCP. The alternate checksum must not
be used until the first non-SYN segment. In addition, because RST
segments may also be received or sent without complete state
information, any segment with the RST bit set must use the
standard TCP checksum.
The option may not be sent in any segment that does not have the
SYN bit set.
An implementation of TCP which does not support the option should
silently ignore it (as RFC 1122 requires). Ignoring the option
will force any TCP attempting to use an alternate checksum to use
the standard TCP checksum algorithm, thus ensuring
interoperability.
Zweig & Partridge [Page 2]
RFC 1146 TCP Alternate Checksum Options March 1990
TCP Alternate Checksum Data Option
The format of the TCP Alternate Checksum Data Option is:
+---------+---------+---------+ +---------+
| Kind=15 |Length=N | data | ... | data |
+---------+---------+---------+ +---------+
This field is used only when the alternate checksum that is
negotiated is longer than 16 bits. These checksums will not fit in
the checksum field of the TCP header and thus at least part of them
must be put in an option. Whether the checksum is split between the
checksum field in the TCP header and the option or the entire
checksum is placed in the option is determined on a checksum by
checksum basis.
The length of this option will depend on the choice of alternate
checksum algorithm for this connection.
While computing the alternate checksum, the TCP checksum field and
the data portion of the TCP Alternate Checksum Data Option are replaced with
zeros.
An otherwise acceptable segment carrying this option on a connection
using a 16-bit checksum algorithm, or carrying this option with an
inappropriate number of data octets for the chosen alternate checksum
algorithm is in error and must be discarded; a RST-segment must be
generated, and the connection aborted.
Note the requirement above that RST and SYN segments must always use
the standard TCP checksum.
APPENDIX I: The 8-bit Fletcher Checksum Algorithm
The 8-bit Fletcher Checksum Algorithm is calculated over a sequence
of data octets (call them D[1] through D[N]) by maintaining 2
unsigned 1's-complement 8-bit accumulators A and B whose contents are
initially zero, and performing the following loop where i ranges from
1 to N:
A := A + D[i]
B := B + A
It can be shown that at the end of the loop A will contain the 8-bit
1's complement sum of all octets in the datagram, and that B will
contain (N)D[1] + (N-1)D[2] + ... + D[N].
The octets covered by this algorithm should be the same as those over
Zweig & Partridge [Page 3]
RFC 1146 TCP Alternate Checksum Options March 1990
which the standard TCP checksum calculation is performed, with the
pseudoheader being D[1] through D[12] and the TCP header beginning at
D[13]. Note that, for purposes of the checksum computation, the
checksum field itself must be equal to zero.
At the end of the loop, the A goes in the first byte of the TCP
checksum and B goes in the second byte.
Note that, unlike the OSI version of the Fletcher checksum, this
checksum does not adjust the check bytes so that the receiver
checksum is 0.
There are a number of much faster algorithms for calculating the two
octets of the 8-bit Fletcher checksum. For more information see
[Sklower89], [Nakassis88] and [Fletcher82]. Naturally, any
computation which computes the same number as would be calculated by
the loop above may be used to calculate the checksum. One advantage
of the Fletcher algorithms over the standard TCP checksum algorithm
is the ability to detect the transposition of octets/words of any
size within a datagram.
APPENDIX II: The 16-bit Fletcher Checksum Algorithm
The 16-bit Fletcher Checksum algorithm proceeds in precisely the same
manner as the 8-bit checksum algorithm, except that A, B and the
D[i] are 16-bit quantities. It is necessary (as it is with the
standard TCP checksum algorithm) to pad a datagram containing an odd
number of octets with a zero octet.
Result A should be placed in the TCP header checksum field and Result
B should appear in a TCP Alternate Checksum Data option. This
option must be present in every TCP header. The two bytes reserved
for B should be set to zero during the calculation of the checksum.
The checksum field of the TCP header shall contain the contents of A
at the end of the loop. The TCP Alternate Checksum Data option must
be present and contain the contents of B at the end of the loop.
BIBLIOGRAPHY:
[BrBoPa89] Braden, R., Borman, D., and C. Partridge, "Computing
the Internet Checksum", ACM Computer Communication
Review, Vol. 19, No. 2, pp. 86-101, April 1989.
[Note that this includes Plummer, W. "IEN-45: TCP
Checksum Function Design" (1978) as an appendix.]
[Fletcher82] Fletcher, J., "An Arithmetic Checksum for Serial
Transmissions", IEEE Transactions on Communication,
Zweig & Partridge [Page 4]
RFC 1146 TCP Alternate Checksum Options March 1990
Vol. COM-30, No. 1, pp. 247-252, January 1982.
[Nakassis88] Nakassis, T., "Fletcher's Error Detection Algorithm:
How to implement it efficiently and how to avoid the
most common pitfalls", ACM Computer Communication
Review, Vol. 18, No. 5, pp. 86-94, October 1988.
[Sklower89] Sklower, K., "Improving the Efficiency of the OSI
Checksum Calculation", ACM Computer Communication
Review, Vol. 19, No. 5, pp. 32-43, October 1989.
Security Considerations
Security issues are not addressed in this memo.
Authors' Addresses
Johnny Zweig
Digital Computer Lab
University of Illinois (UIUC)
1304 West Springfield Avenue
CAMPUS MC 258
Urbana, IL 61801
Phone: (217) 333-7937
EMail: zweig@CS.UIUC.EDU
Craig Partridge
Bolt Beranek and Newman Inc.
50 Moulton Street
Cambridge, MA 02138
Phone: (617) 873-2459
EMail: craig@BBN.COM
Zweig & Partridge [Page 5]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,787 @@
Network Working Group G. McGregor
Request for Comments: 1332 Merit
Obsoletes: RFC 1172 May 1992
The PPP Internet Protocol Control Protocol (IPCP)
Status of this Memo
This RFC specifies an IAB standards track protocol for the Internet
community, and requests discussion and suggestions for improvements.
Please refer to the current edition of the "IAB Official Protocol
Standards" for the standardization state and status of this protocol.
Distribution of this memo is unlimited.
Abstract
The Point-to-Point Protocol (PPP) [1] provides a standard method of
encapsulating Network Layer protocol information over point-to-point
links. PPP also defines an extensible Link Control Protocol, and
proposes a family of Network Control Protocols (NCPs) for
establishing and configuring different network-layer protocols.
This document defines the NCP for establishing and configuring the
Internet Protocol [2] over PPP, and a method to negotiate and use Van
Jacobson TCP/IP header compression [3] with PPP.
This RFC is a product of the Point-to-Point Protocol Working Group of
the Internet Engineering Task Force (IETF).
McGregor [Page i]
RFC 1332 PPP IPCP May 1992
Table of Contents
1. Introduction .......................................... 1
2. A PPP Network Control Protocol (NCP) for IP ........... 2
2.1 Sending IP Datagrams ............................ 2
3. IPCP Configuration Options ............................ 4
3.1 IP-Addresses .................................... 5
3.2 IP-Compression-Protocol ......................... 6
3.3 IP-Address ...................................... 8
4. Van Jacobson TCP/IP header compression ................ 9
4.1 Configuration Option Format ..................... 9
APPENDICES ................................................... 11
A. IPCP Recommended Options .............................. 11
SECURITY CONSIDERATIONS ...................................... 11
REFERENCES ................................................... 11
ACKNOWLEDGEMENTS ............................................. 11
CHAIR'S ADDRESS .............................................. 12
AUTHOR'S ADDRESS ............................................. 12
McGregor [Page ii]
RFC 1332 PPP IPCP May 1992
1. Introduction
PPP has three main components:
1. A method for encapsulating datagrams over serial links.
2. A Link Control Protocol (LCP) for establishing, configuring,
and testing the data-link connection.
3. A family of Network Control Protocols (NCPs) for establishing
and configuring different network-layer protocols.
In order to establish communications over a point-to-point link, each
end of the PPP link must first send LCP packets to configure and test
the data link. After the link has been established and optional
facilities have been negotiated as needed by the LCP, PPP must send
NCP packets to choose and configure one or more network-layer
protocols. Once each of the chosen network-layer protocols has been
configured, datagrams from each network-layer protocol can be sent
over the link.
The link will remain configured for communications until explicit LCP
or NCP packets close the link down, or until some external event
occurs (an inactivity timer expires or network administrator
intervention).
McGregor [Page 1]
RFC 1332 PPP IPCP May 1992
2. A PPP Network Control Protocol (NCP) for IP
The IP Control Protocol (IPCP) is responsible for configuring,
enabling, and disabling the IP protocol modules on both ends of the
point-to-point link. IPCP uses the same packet exchange mechanism as
the Link Control Protocol (LCP). IPCP packets may not be exchanged
until PPP has reached the Network-Layer Protocol phase. IPCP packets
received before this phase is reached should be silently discarded.
The IP Control Protocol is exactly the same as the Link Control
Protocol [1] with the following exceptions:
Data Link Layer Protocol Field
Exactly one IPCP packet is encapsulated in the Information field
of PPP Data Link Layer frames where the Protocol field indicates
type hex 8021 (IP Control Protocol).
Code field
Only Codes 1 through 7 (Configure-Request, Configure-Ack,
Configure-Nak, Configure-Reject, Terminate-Request, Terminate-Ack
and Code-Reject) are used. Other Codes should be treated as
unrecognized and should result in Code-Rejects.
Timeouts
IPCP packets may not be exchanged until PPP has reached the
Network-Layer Protocol phase. An implementation should be
prepared to wait for Authentication and Link Quality Determination
to finish before timing out waiting for a Configure-Ack or other
response. It is suggested that an implementation give up only
after user intervention or a configurable amount of time.
Configuration Option Types
IPCP has a distinct set of Configuration Options, which are
defined below.
2.1. Sending IP Datagrams
Before any IP packets may be communicated, PPP must reach the
Network-Layer Protocol phase, and the IP Control Protocol must reach
the Opened state.
Exactly one IP packet is encapsulated in the Information field of PPP
Data Link Layer frames where the Protocol field indicates type hex
0021 (Internet Protocol).
McGregor [Page 2]
RFC 1332 PPP IPCP May 1992
The maximum length of an IP packet transmitted over a PPP link is the
same as the maximum length of the Information field of a PPP data
link layer frame. Larger IP datagrams must be fragmented as
necessary. If a system wishes to avoid fragmentation and reassembly,
it should use the TCP Maximum Segment Size option [4], and MTU
discovery [5].
McGregor [Page 3]
RFC 1332 PPP IPCP May 1992
3. IPCP Configuration Options
IPCP Configuration Options allow negotiation of desirable Internet
Protocol parameters. IPCP uses the same Configuration Option format
defined for LCP [1], with a separate set of Options.
The most up-to-date values of the IPCP Option Type field are specified
in the most recent "Assigned Numbers" RFC [6]. Current values are
assigned as follows:
1 IP-Addresses
2 IP-Compression-Protocol
3 IP-Address
McGregor [Page 4]
RFC 1332 PPP IPCP May 1992
3.1. IP-Addresses
Description
The use of the Configuration Option IP-Addresses has been
deprecated. It has been determined through implementation
experience that it is difficult to ensure negotiation convergence
in all cases using this option. RFC 1172 [7] provides information
for implementations requiring backwards compatibility. The IP-
Address Configuration Option replaces this option, and its use is
preferred.
This option SHOULD NOT be sent in a Configure-Request if a
Configure-Request has been received which includes either an IP-
Addresses or IP-Address option. This option MAY be sent if a
Configure-Reject is received for the IP-Address option, or a
Configure-Nak is received with an IP-Addresses option as an
appended option.
Support for this option MAY be removed after the IPCP protocol
status advances to Internet Draft Standard.
McGregor [Page 5]
RFC 1332 PPP IPCP May 1992
3.2. IP-Compression-Protocol
Description
This Configuration Option provides a way to negotiate the use of a
specific compression protocol. By default, compression is not
enabled.
A summary of the IP-Compression-Protocol Configuration Option format
is shown below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | IP-Compression-Protocol |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Data ...
+-+-+-+-+
Type
2
Length
>= 4
IP-Compression-Protocol
The IP-Compression-Protocol field is two octets and indicates the
compression protocol desired. Values for this field are always
the same as the PPP Data Link Layer Protocol field values for that
same compression protocol.
The most up-to-date values of the IP-Compression-Protocol field
are specified in the most recent "Assigned Numbers" RFC [6].
Current values are assigned as follows:
Value (in hex) Protocol
002d Van Jacobson Compressed TCP/IP
Data
The Data field is zero or more octets and contains additional data
as determined by the particular compression protocol.
McGregor [Page 6]
RFC 1332 PPP IPCP May 1992
Default
No compression protocol enabled.
McGregor [Page 7]
RFC 1332 PPP IPCP May 1992
3.3. IP-Address
Description
This Configuration Option provides a way to negotiate the IP
address to be used on the local end of the link. It allows the
sender of the Configure-Request to state which IP-address is
desired, or to request that the peer provide the information. The
peer can provide this information by NAKing the option, and
returning a valid IP-address.
If negotiation about the remote IP-address is required, and the
peer did not provide the option in its Configure-Request, the
option SHOULD be appended to a Configure-Nak. The value of the
IP-address given must be acceptable as the remote IP-address, or
indicate a request that the peer provide the information.
By default, no IP address is assigned.
A summary of the IP-Address Configuration Option format is shown
below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | IP-Address
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
IP-Address (cont) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Type
3
Length
6
IP-Address
The four octet IP-Address is the desired local address of the
sender of a Configure-Request. If all four octets are set to
zero, it indicates a request that the peer provide the IP-Address
information.
Default
No IP address is assigned.
McGregor [Page 8]
RFC 1332 PPP IPCP May 1992
4. Van Jacobson TCP/IP header compression
Van Jacobson TCP/IP header compression reduces the size of the TCP/IP
headers to as few as three bytes. This can be a significant improvement
on slow serial lines, particularly for interactive traffic.
The IP-Compression-Protocol Configuration Option is used to indicate the
ability to receive compressed packets. Each end of the link must
separately request this option if bi-directional compression is desired.
The PPP Protocol field is set to the following values when transmitting
IP packets:
Value (in hex)
0021 Type IP. The IP protocol is not TCP, or the packet is a
fragment, or cannot be compressed.
002d Compressed TCP. The TCP/IP headers are replaced by the
compressed header.
002f Uncompressed TCP. The IP protocol field is replaced by
the slot identifier.
4.1. Configuration Option Format
A summary of the IP-Compression-Protocol Configuration Option format
to negotiate Van Jacobson TCP/IP header compression is shown below.
The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | IP-Compression-Protocol |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Max-Slot-Id | Comp-Slot-Id |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Type
2
Length
6
McGregor [Page 9]
RFC 1332 PPP IPCP May 1992
IP-Compression-Protocol
002d (hex) for Van Jacobson Compressed TCP/IP headers.
Max-Slot-Id
The Max-Slot-Id field is one octet and indicates the maximum slot
identifier. This is one less than the actual number of slots; the
slot identifier has values from zero to Max-Slot-Id.
Note: There may be implementations that have problems with only
one slot (Max-Slot-Id = 0). See the discussion in reference
[3]. The example implementation in [3] will only work with 3
through 254 slots.
Comp-Slot-Id
The Comp-Slot-Id field is one octet and indicates whether the slot
identifier field may be compressed.
0 The slot identifier must not be compressed. All compressed
TCP packets must set the C bit in every change mask, and
must include the slot identifier.
1 The slot identifier may be compressed.
The slot identifier must not be compressed if there is no ability
for the PPP link level to indicate an error in reception to the
decompression module. Synchronization after errors depends on
receiving a packet with the slot identifier. See the discussion
in reference [3].
McGregor [Page 10]
RFC 1332 PPP IPCP May 1992
A. IPCP Recommended Options
The following Configuration Options are recommended:
IP-Compression-Protocol -- with at least 4 slots, usually 16
slots.
IP-Address -- only on dial-up lines.
Security Considerations
Security issues are not discussed in this memo.
References
[1] Simpson, W., "The Point-to-Point Protocol", RFC 1331, May 1992.
[2] Postel, J., "Internet Protocol", RFC 791, USC/Information
Sciences Institute, September 1981.
[3] Jacobson, V., "Compressing TCP/IP Headers", RFC 1144, January
1990.
[4] Postel, J., "The TCP Maximum Segment Size Option and Related
Topics", RFC 879, USC/Information Sciences Institute, November
1983.
[5] Mogul, J., and S. Deering, "Path MTU Discovery", RFC 1191,
November 1990.
[6] Reynolds, J., and J. Postel, "Assigned Numbers", RFC 1060,
USC/Information Sciences Institute, March 1990.
[7] Perkins, D., and R. Hobby, "Point-to-Point Protocol (PPP)
initial configuration options", RFC 1172, August 1990.
Acknowledgments
Some of the text in this document is taken from RFCs 1171 & 1172, by
Drew Perkins of Carnegie Mellon University, and by Russ Hobby of the
University of California at Davis.
Information leading to the expanded IP-Compression option provided by
Van Jacobson at SIGCOMM '90.
McGregor [Page 11]
RFC 1332 PPP IPCP May 1992
Bill Simpson helped with the document formatting.
Chair's Address
The working group can be contacted via the current chair:
Brian Lloyd
Lloyd & Associates
3420 Sudbury Road
Cameron Park, California 95682
Phone: (916) 676-1147
EMail: brian@ray.lloyd.com
Author's Address
Questions about this memo can also be directed to:
Glenn McGregor
Merit Network, Inc.
1071 Beal Avenue
Ann Arbor, MI 48109-2103
Phone: (313) 763-1203
EMail: Glenn.McGregor@Merit.edu
McGregor [Page 12]

View File

@ -0,0 +1,899 @@
Network Working Group B. Lloyd
Request for Comments: 1334 L&A
W. Simpson
Daydreamer
October 1992
PPP Authentication Protocols
Status of this Memo
This RFC specifies an IAB standards track protocol for the Internet
community, and requests discussion and suggestions for improvements.
Please refer to the current edition of the "IAB Official Protocol
Standards" for the standardization state and status of this protocol.
Distribution of this memo is unlimited.
Abstract
The Point-to-Point Protocol (PPP) [1] provides a standard method of
encapsulating Network Layer protocol information over point-to-point
links. PPP also defines an extensible Link Control Protocol, which
allows negotiation of an Authentication Protocol for authenticating
its peer before allowing Network Layer protocols to transmit over the
link.
This document defines two protocols for Authentication: the Password
Authentication Protocol and the Challenge-Handshake Authentication
Protocol. This memo is the product of the Point-to-Point Protocol
Working Group of the Internet Engineering Task Force (IETF).
Comments on this memo should be submitted to the ietf-ppp@ucdavis.edu
mailing list.
Table of Contents
1. Introduction ............................................... 2
1.1 Specification Requirements ................................. 2
1.2 Terminology ................................................ 3
2. Password Authentication Protocol ............................ 3
2.1 Configuration Option Format ................................ 4
2.2 Packet Format .............................................. 5
2.2.1 Authenticate-Request ..................................... 5
2.2.2 Authenticate-Ack and Authenticate-Nak .................... 7
3. Challenge-Handshake Authentication Protocol.................. 8
3.1 Configuration Option Format ................................ 9
3.2 Packet Format .............................................. 10
3.2.1 Challenge and Response ................................... 11
3.2.2 Success and Failure ...................................... 13
Lloyd & Simpson [Page 1]
RFC 1334 PPP Authentication October 1992
SECURITY CONSIDERATIONS ........................................ 14
REFERENCES ..................................................... 15
ACKNOWLEDGEMENTS ............................................... 16
CHAIR'S ADDRESS ................................................ 16
AUTHOR'S ADDRESS ............................................... 16
1. Introduction
PPP has three main components:
1. A method for encapsulating datagrams over serial links.
2. A Link Control Protocol (LCP) for establishing, configuring,
and testing the data-link connection.
3. A family of Network Control Protocols (NCPs) for establishing
and configuring different network-layer protocols.
In order to establish communications over a point-to-point link, each
end of the PPP link must first send LCP packets to configure the data
link during Link Establishment phase. After the link has been
established, PPP provides for an optional Authentication phase before
proceeding to the Network-Layer Protocol phase.
By default, authentication is not mandatory. If authentication of
the link is desired, an implementation MUST specify the
Authentication-Protocol Configuration Option during Link
Establishment phase.
These authentication protocols are intended for use primarily by
hosts and routers that connect to a PPP network server via switched
circuits or dial-up lines, but might be applied to dedicated links as
well. The server can use the identification of the connecting host
or router in the selection of options for network layer negotiations.
This document defines the PPP authentication protocols. The Link
Establishment and Authentication phases, and the Authentication-
Protocol Configuration Option, are defined in The Point-to-Point
Protocol (PPP) [1].
1.1. Specification Requirements
In this document, several words are used to signify the requirements
of the specification. These words are often capitalized.
MUST
This word, or the adjective "required", means that the definition
is an absolute requirement of the specification.
Lloyd & Simpson [Page 2]
RFC 1334 PPP Authentication October 1992
MUST NOT
This phrase means that the definition is an absolute prohibition
of the specification.
SHOULD
This word, or the adjective "recommended", means that there may
exist valid reasons in particular circumstances to ignore this
item, but the full implications should be understood and carefully
weighed before choosing a different course.
MAY
This word, or the adjective "optional", means that this item is
one of an allowed set of alternatives. An implementation which
does not include this option MUST be prepared to interoperate with
another implementation which does include the option.
1.2. Terminology
This document frequently uses the following terms:
authenticator
The end of the link requiring the authentication. The
authenticator specifies the authentication protocol to be used in
the Configure-Request during Link Establishment phase.
peer
The other end of the point-to-point link; the end which is being
authenticated by the authenticator.
silently discard
This means the implementation discards the packet without further
processing. The implementation SHOULD provide the capability of
logging the error, including the contents of the silently
discarded packet, and SHOULD record the event in a statistics
counter.
2. Password Authentication Protocol
The Password Authentication Protocol (PAP) provides a simple method
for the peer to establish its identity using a 2-way handshake. This
is done only upon initial link establishment.
After the Link Establishment phase is complete, an Id/Password pair
is repeatedly sent by the peer to the authenticator until
authentication is acknowledged or the connection is terminated.
PAP is not a strong authentication method. Passwords are sent over
the circuit "in the clear", and there is no protection from playback
Lloyd & Simpson [Page 3]
RFC 1334 PPP Authentication October 1992
or repeated trial and error attacks. The peer is in control of the
frequency and timing of the attempts.
Any implementations which include a stronger authentication method
(such as CHAP, described below) MUST offer to negotiate that method
prior to PAP.
This authentication method is most appropriately used where a
plaintext password must be available to simulate a login at a remote
host. In such use, this method provides a similar level of security
to the usual user login at the remote host.
Implementation Note: It is possible to limit the exposure of the
plaintext password to transmission over the PPP link, and avoid
sending the plaintext password over the entire network. When the
remote host password is kept as a one-way transformed value, and
the algorithm for the transform function is implemented in the
local server, the plaintext password SHOULD be locally transformed
before comparison with the transformed password from the remote
host.
2.1. Configuration Option Format
A summary of the Authentication-Protocol Configuration Option format
to negotiate the Password Authentication Protocol is shown below.
The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | Authentication-Protocol |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Type
3
Length
4
Authentication-Protocol
c023 (hex) for Password Authentication Protocol.
Data
There is no Data field.
Lloyd & Simpson [Page 4]
RFC 1334 PPP Authentication October 1992
2.2. Packet Format
Exactly one Password Authentication Protocol packet is encapsulated
in the Information field of a PPP Data Link Layer frame where the
protocol field indicates type hex c023 (Password Authentication
Protocol). A summary of the PAP packet format is shown below. The
fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Code | Identifier | Length |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Data ...
+-+-+-+-+
Code
The Code field is one octet and identifies the type of PAP packet.
PAP Codes are assigned as follows:
1 Authenticate-Request
2 Authenticate-Ack
3 Authenticate-Nak
Identifier
The Identifier field is one octet and aids in matching requests
and replies.
Length
The Length field is two octets and indicates the length of the PAP
packet including the Code, Identifier, Length and Data fields.
Octets outside the range of the Length field should be treated as
Data Link Layer padding and should be ignored on reception.
Data
The Data field is zero or more octets. The format of the Data
field is determined by the Code field.
2.2.1. Authenticate-Request
Description
The Authenticate-Request packet is used to begin the Password
Authentication Protocol. The link peer MUST transmit a PAP packet
Lloyd & Simpson [Page 5]
RFC 1334 PPP Authentication October 1992
with the Code field set to 1 (Authenticate-Request) during the
Authentication phase. The Authenticate-Request packet MUST be
repeated until a valid reply packet is received, or an optional
retry counter expires.
The authenticator SHOULD expect the peer to send an Authenticate-
Request packet. Upon reception of an Authenticate-Request packet,
some type of Authenticate reply (described below) MUST be
returned.
Implementation Note: Because the Authenticate-Ack might be
lost, the authenticator MUST allow repeated Authenticate-
Request packets after completing the Authentication phase.
Any Authenticate-Request packets received during the Network-Layer
Protocol phase MUST return the same reply Code returned when
the Authentication phase completed (the message portion MAY be
different). Any Authenticate-Request packets received during
any other phase MUST be silently discarded.
When the Authenticate-Nak is lost, and the authenticator
terminates the link, the LCP Terminate-Request and Terminate-
Ack provide an alternative indication that authentication
failed.
A summary of the Authenticate-Request packet format is shown below.
The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Code | Identifier | Length |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Peer-ID Length| Peer-Id ...
+-+-+-+-+-+-+-+-+-+-+-+-+
| Passwd-Length | Password ...
+-+-+-+-+-+-+-+-+-+-+-+-+-+
Code
1 for Authenticate-Request.
Identifier
The Identifier field is one octet and aids in matching requests
and replies. The Identifier field MUST be changed each time an
Authenticate-Request packet is issued.
Lloyd & Simpson [Page 6]
RFC 1334 PPP Authentication October 1992
Peer-ID-Length
The Peer-ID-Length field is one octet and indicates the length of
the Peer-ID field.
Peer-ID
The Peer-ID field is zero or more octets and indicates the name of
the peer to be authenticated.
Passwd-Length
The Passwd-Length field is one octet and indicates the length of
the Password field.
Password
The Password field is zero or more octets and indicates the
password to be used for authentication.
2.2.2. Authenticate-Ack and Authenticate-Nak
Description
If the Peer-ID/Password pair received in an Authenticate-Request
is both recognizable and acceptable, then the authenticator MUST
transmit a PAP packet with the Code field set to 2 (Authenticate-
Ack).
If the Peer-ID/Password pair received in an Authenticate-Request is
not recognizable or acceptable, then the authenticator MUST
transmit a PAP packet with the Code field set to 3 (Authenticate-
Nak), and SHOULD take action to terminate the link.
A summary of the Authenticate-Ack and Authenticate-Nak packet format
is shown below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Code | Identifier | Length |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Msg-Length | Message ...
+-+-+-+-+-+-+-+-+-+-+-+-+-
Code
2 for Authenticate-Ack;
Lloyd & Simpson [Page 7]
RFC 1334 PPP Authentication October 1992
3 for Authenticate-Nak.
Identifier
The Identifier field is one octet and aids in matching requests
and replies. The Identifier field MUST be copied from the
Identifier field of the Authenticate-Request which caused this
reply.
Msg-Length
The Msg-Length field is one octet and indicates the length of the
Message field.
Message
The Message field is zero or more octets, and its contents are
implementation dependent. It is intended to be human readable,
and MUST NOT affect operation of the protocol. It is recommended
that the message contain displayable ASCII characters 32 through
126 decimal. Mechanisms for extension to other character sets are
the topic of future research.
3. Challenge-Handshake Authentication Protocol
The Challenge-Handshake Authentication Protocol (CHAP) is used to
periodically verify the identity of the peer using a 3-way handshake.
This is done upon initial link establishment, and MAY be repeated
anytime after the link has been established.
After the Link Establishment phase is complete, the authenticator
sends a "challenge" message to the peer. The peer responds with a
value calculated using a "one-way hash" function. The authenticator
checks the response against its own calculation of the expected hash
value. If the values match, the authentication is acknowledged;
otherwise the connection SHOULD be terminated.
CHAP provides protection against playback attack through the use of
an incrementally changing identifier and a variable challenge value.
The use of repeated challenges is intended to limit the time of
exposure to any single attack. The authenticator is in control of
the frequency and timing of the challenges.
This authentication method depends upon a "secret" known only to the
authenticator and that peer. The secret is not sent over the link.
This method is most likely used where the same secret is easily
accessed from both ends of the link.
Lloyd & Simpson [Page 8]
RFC 1334 PPP Authentication October 1992
Implementation Note: CHAP requires that the secret be available in
plaintext form. To avoid sending the secret over other links in
the network, it is recommended that the challenge and response
values be examined at a central server, rather than each network
access server. Otherwise, the secret SHOULD be sent to such
servers in a reversibly encrypted form.
The CHAP algorithm requires that the length of the secret MUST be at
least 1 octet. The secret SHOULD be at least as large and
unguessable as a well-chosen password. It is preferred that the
secret be at least the length of the hash value for the hashing
algorithm chosen (16 octets for MD5). This is to ensure a
sufficiently large range for the secret to provide protection against
exhaustive search attacks.
The one-way hash algorithm is chosen such that it is computationally
infeasible to determine the secret from the known challenge and
response values.
The challenge value SHOULD satisfy two criteria: uniqueness and
unpredictability. Each challenge value SHOULD be unique, since
repetition of a challenge value in conjunction with the same secret
would permit an attacker to reply with a previously intercepted
response. Since it is expected that the same secret MAY be used to
authenticate with servers in disparate geographic regions, the
challenge SHOULD exhibit global and temporal uniqueness. Each
challenge value SHOULD also be unpredictable, lest an attacker trick
a peer into responding to a predicted future challenge, and then use
the response to masquerade as that peer to an authenticator.
Although protocols such as CHAP are incapable of protecting against
realtime active wiretapping attacks, generation of unique
unpredictable challenges can protect against a wide range of active
attacks.
A discussion of sources of uniqueness and probability of divergence
is included in the Magic-Number Configuration Option [1].
3.1. Configuration Option Format
A summary of the Authentication-Protocol Configuration Option format
to negotiate the Challenge-Handshake Authentication Protocol is shown
below. The fields are transmitted from left to right.
Lloyd & Simpson [Page 9]
RFC 1334 PPP Authentication October 1992
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | Authentication-Protocol |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Algorithm |
+-+-+-+-+-+-+-+-+
Type
3
Length
5
Authentication-Protocol
c223 (hex) for Challenge-Handshake Authentication Protocol.
Algorithm
The Algorithm field is one octet and indicates the one-way hash
method to be used. The most up-to-date values of the CHAP
Algorithm field are specified in the most recent "Assigned
Numbers" RFC [2]. Current values are assigned as follows:
0-4 unused (reserved)
5 MD5 [3]
3.2. Packet Format
Exactly one Challenge-Handshake Authentication Protocol packet is
encapsulated in the Information field of a PPP Data Link Layer frame
where the protocol field indicates type hex c223 (Challenge-Handshake
Authentication Protocol). A summary of the CHAP packet format is
shown below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Code | Identifier | Length |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Data ...
+-+-+-+-+
Lloyd & Simpson [Page 10]
RFC 1334 PPP Authentication October 1992
Code
The Code field is one octet and identifies the type of CHAP
packet. CHAP Codes are assigned as follows:
1 Challenge
2 Response
3 Success
4 Failure
Identifier
The Identifier field is one octet and aids in matching challenges,
responses and replies.
Length
The Length field is two octets and indicates the length of the
CHAP packet including the Code, Identifier, Length and Data
fields. Octets outside the range of the Length field should be
treated as Data Link Layer padding and should be ignored on
reception.
Data
The Data field is zero or more octets. The format of the Data
field is determined by the Code field.
3.2.1. Challenge and Response
Description
The Challenge packet is used to begin the Challenge-Handshake
Authentication Protocol. The authenticator MUST transmit a CHAP
packet with the Code field set to 1 (Challenge). Additional
Challenge packets MUST be sent until a valid Response packet is
received, or an optional retry counter expires.
A Challenge packet MAY also be transmitted at any time during the
Network-Layer Protocol phase to ensure that the connection has not
been altered.
The peer SHOULD expect Challenge packets during the Authentication
phase and the Network-Layer Protocol phase. Whenever a Challenge
packet is received, the peer MUST transmit a CHAP packet with the
Code field set to 2 (Response).
Whenever a Response packet is received, the authenticator compares
Lloyd & Simpson [Page 11]
RFC 1334 PPP Authentication October 1992
the Response Value with its own calculation of the expected value.
Based on this comparison, the authenticator MUST send a Success or
Failure packet (described below).
Implementation Note: Because the Success might be lost, the
authenticator MUST allow repeated Response packets after
completing the Authentication phase. To prevent discovery of
alternative Names and Secrets, any Response packets received
having the current Challenge Identifier MUST return the same
reply Code returned when the Authentication phase completed
(the message portion MAY be different). Any Response packets
received during any other phase MUST be silently discarded.
When the Failure is lost, and the authenticator terminates the
link, the LCP Terminate-Request and Terminate-Ack provide an
alternative indication that authentication failed.
A summary of the Challenge and Response packet format is shown below.
The fields are transmitted from left to right.
 0                   1                   2                   3
 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|     Code      |  Identifier   |            Length             |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Value-Size   |  Value ...
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Name ...
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Code
1 for Challenge;
2 for Response.
Identifier
The Identifier field is one octet. The Identifier field MUST be
changed each time a Challenge is sent.
The Response Identifier MUST be copied from the Identifier field
of the Challenge which caused the Response.
Value-Size
This field is one octet and indicates the length of the Value
field.
Lloyd & Simpson [Page 12]
RFC 1334 PPP Authentication October 1992
Value
The Value field is one or more octets. The most significant octet
is transmitted first.
The Challenge Value is a variable stream of octets. The
importance of the uniqueness of the Challenge Value and its
relationship to the secret is described above. The Challenge
Value MUST be changed each time a Challenge is sent. The length
of the Challenge Value depends upon the method used to generate
the octets, and is independent of the hash algorithm used.
The Response Value is the one-way hash calculated over a stream of
octets consisting of the Identifier, followed by (concatenated
with) the "secret", followed by (concatenated with) the Challenge
Value. The length of the Response Value depends upon the hash
algorithm used (16 octets for MD5).
Name
The Name field is one or more octets representing the
identification of the system transmitting the packet. There are
no limitations on the content of this field. For example, it MAY
contain ASCII character strings or globally unique identifiers in
ASN.1 syntax. The Name should not be NUL or CR/LF terminated.
The size is determined from the Length field.
Since CHAP may be used to authenticate many different systems, the
content of the name field(s) may be used as a key to locate the
proper secret in a database of secrets. This also makes it
possible to support more than one name/secret pair per system.
3.2.2. Success and Failure
Description
If the Value received in a Response is equal to the expected
value, then the implementation MUST transmit a CHAP packet with
the Code field set to 3 (Success).
If the Value received in a Response is not equal to the expected
value, then the implementation MUST transmit a CHAP packet with
the Code field set to 4 (Failure), and SHOULD take action to
terminate the link.
A summary of the Success and Failure packet format is shown below.
The fields are transmitted from left to right.
Lloyd & Simpson [Page 13]
RFC 1334 PPP Authentication October 1992
 0                   1                   2                   3
 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|     Code      |  Identifier   |            Length             |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Message  ...
+-+-+-+-+-+-+-+-+-+-+-+-+-
Code
3 for Success;
4 for Failure.
Identifier
The Identifier field is one octet and aids in matching requests
and replies. The Identifier field MUST be copied from the
Identifier field of the Response which caused this reply.
Message
The Message field is zero or more octets, and its contents are
implementation dependent. It is intended to be human readable,
and MUST NOT affect operation of the protocol. It is recommended
that the message contain displayable ASCII characters 32 through
126 decimal. Mechanisms for extension to other character sets are
the topic of future research. The size is determined from the
Length field.
Security Considerations
Security issues are the primary topic of this RFC.
The interaction of the authentication protocols within PPP is
highly implementation dependent. This is indicated by the use of
SHOULD throughout the document.
For example, upon failure of authentication, some implementations
do not terminate the link. Instead, the implementation limits the
kind of traffic in the Network-Layer Protocols to a filtered
subset, which in turn allows the user opportunity to update
secrets or send mail to the network administrator indicating a
problem.
There is no provision for re-tries of failed authentication.
However, the LCP state machine can renegotiate the authentication
protocol at any time, thus allowing a new attempt. It is
Lloyd & Simpson [Page 14]
RFC 1334 PPP Authentication October 1992
recommended that any counters used for authentication failure not
be reset until after successful authentication, or subsequent
termination of the failed link.
There is no requirement that authentication be full duplex or that
the same protocol be used in both directions. It is perfectly
acceptable for different protocols to be used in each direction.
This will, of course, depend on the specific protocols negotiated.
In practice, within or associated with each PPP server, there is a
database which associates "user" names with authentication
information ("secrets"). It is not anticipated that a particular
named user would be authenticated by multiple methods. This would
make the user vulnerable to attacks which negotiate the least
secure method from among a set (such as PAP rather than CHAP).
Instead, for each named user there should be an indication of
exactly one method used to authenticate that user name. If a user
needs to make use of different authentication methods under
different circumstances, then distinct user names SHOULD be
employed, each of which identifies exactly one authentication
method.
Passwords and other secrets should be stored at the respective
ends such that access to them is as limited as possible. Ideally,
the secrets should only be accessible to the process requiring
access in order to perform the authentication.
The secrets should be distributed with a mechanism that limits the
number of entities that handle (and thus gain knowledge of) the
secret. Ideally, no unauthorized person should ever gain
knowledge of the secrets. It is possible to achieve this with
SNMP Security Protocols [4], but such a mechanism is outside the
scope of this specification.
Other distribution methods are currently undergoing research and
experimentation. The SNMP Security document also has an excellent
overview of threats to network protocols.
References
[1] Simpson, W., "The Point-to-Point Protocol (PPP)", RFC 1331,
Daydreamer, May 1992.
[2] Reynolds, J., and J. Postel, "Assigned Numbers", RFC 1340,
USC/Information Sciences Institute, July 1992.
Lloyd & Simpson [Page 15]
RFC 1334 PPP Authentication October 1992
[3] Rivest, R., and S. Dusse, "The MD5 Message-Digest Algorithm", MIT
Laboratory for Computer Science and RSA Data Security, Inc. RFC
1321, April 1992.
[4] Galvin, J., McCloghrie, K., and J. Davin, "SNMP Security
Protocols", Trusted Information Systems, Inc., Hughes LAN
Systems, Inc., MIT Laboratory for Computer Science, RFC 1352,
July 1992.
Acknowledgments
Some of the text in this document is taken from RFC 1172, by Drew
Perkins of Carnegie Mellon University, and by Russ Hobby of the
University of California at Davis.
Special thanks to Dave Balenson, Steve Crocker, James Galvin, and
Steve Kent, for their extensive explanations and suggestions. Now,
if only we could get them to agree with each other.
Chair's Address
The working group can be contacted via the current chair:
Brian Lloyd
Lloyd & Associates
3420 Sudbury Road
Cameron Park, California 95682
Phone: (916) 676-1147
EMail: brian@lloyd.com
Author's Address
Questions about this memo can also be directed to:
William Allen Simpson
Daydreamer
Computer Systems Consulting Services
P O Box 6205
East Lansing, MI 48826-6205
EMail: Bill.Simpson@um.cc.umich.edu
Lloyd & Simpson [Page 16]

View File

@ -0,0 +1,619 @@
Network Working Group R. Braden
Request for Comments: 1337 ISI
May 1992
TIME-WAIT Assassination Hazards in TCP
Status of This Memo
This memo provides information for the Internet community. It does
not specify an Internet standard. Distribution of this memo is
unlimited.
Abstract
This note describes some theoretically-possible failure modes for TCP
connections and discusses possible remedies. In particular, one very
simple fix is identified.
1. INTRODUCTION
Experiments to validate the recently-proposed TCP extensions [RFC-
1323] have led to the discovery of a new class of TCP failures, which
have been dubbed the "TIME-WAIT Assassination hazards". This note
describes these hazards, gives examples, and discusses possible
prevention measures.
The failures in question all result from old duplicate segments. In
brief, the TCP mechanisms to protect against old duplicate segments
are [RFC-793]:
(1) The 3-way handshake rejects old duplicate initial <SYN>
segments, avoiding the hazard of replaying a connection.
(2) Sequence numbers are used to reject old duplicate data and ACK
segments from the current incarnation of a given connection
(defined by a particular host and port pair). Sequence numbers
are also used to reject old duplicate <SYN,ACK> segments.
For very high-speed connections, Jacobson's PAWS ("Protect
Against Wrapped Sequences") mechanism [RFC-1323] effectively
extends the sequence numbers so wrap-around will not introduce a
hazard within the same incarnation.
(3) There are two mechanisms to avoid hazards due to old duplicate
segments from an earlier instance of the same connection; see
the Appendix to [RFC-1185] for details.
Braden [Page 1]
RFC 1337 TCP TIME-WAIT Hazards May 1992
For "short and slow" connections [RFC-1185], the clock-driven
ISN (initial sequence number) selection prevents the overlap of
the sequence spaces of the old and new incarnations [RFC-793].
(The algorithm used by Berkeley BSD TCP for stepping ISN
complicates the analysis slightly but does not change the
conclusions.)
(4) TIME-WAIT state removes the hazard of old duplicates for "fast"
or "long" connections, in which clock-driven ISN selection is
unable to prevent overlap of the old and new sequence spaces.
The TIME-WAIT delay allows all old duplicate segments time
enough to die in the Internet before the connection is reopened.
(5) After a system crash, the Quiet Time at system startup allows
old duplicates to disappear before any connections are opened.
Our new observation is that (4) is unreliable: TIME-WAIT state can be
prematurely terminated ("assassinated") by an old duplicate data or
ACK segment from the current or an earlier incarnation of the same
connection. We refer to this as "TIME-WAIT Assassination" (TWA).
Figure 1 shows an example of TIME-WAIT assassination. Segments 1-5
are copied exactly from Figure 13 of RFC-793, showing a normal close
handshake. Packets 5.1, 5.2, and 5.3 are an extension to this
sequence, illustrating TWA. Here 5.1 is *any* old segment that is
unacceptable to TCP A. It might be unacceptable because of its
sequence number or because of an old PAWS timestamp. In either case,
TCP A sends an ACK segment 5.2 for its current SND.NXT and RCV.NXT.
Since it has no state for this connection, TCP B reflects this as RST
segment 5.3, which assassinates the TIME-WAIT state at A!
Braden [Page 2]
RFC 1337 TCP TIME-WAIT Hazards May 1992
TCP A TCP B
1. ESTABLISHED ESTABLISHED
(Close)
2. FIN-WAIT-1 --> <SEQ=100><ACK=300><CTL=FIN,ACK> --> CLOSE-WAIT
3. FIN-WAIT-2 <-- <SEQ=300><ACK=101><CTL=ACK> <-- CLOSE-WAIT
(Close)
4. TIME-WAIT <-- <SEQ=300><ACK=101><CTL=FIN,ACK> <-- LAST-ACK
5. TIME-WAIT --> <SEQ=101><ACK=301><CTL=ACK> --> CLOSED
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
5.1. TIME-WAIT <-- <SEQ=255><ACK=33> ... old duplicate
5.2 TIME-WAIT --> <SEQ=101><ACK=301><CTL=ACK> --> ????
5.3 CLOSED <-- <SEQ=301><CTL=RST> <-- ????
(prematurely)
Figure 1. TWA Example
Note that TWA is not at all an unlikely event if there are any
duplicate segments that may be delayed in the network. Furthermore,
TWA cannot be prevented by PAWS timestamps; the event may happen
within the same tick of the timestamp clock. TWA is a consequence of
TCP's half-open connection discovery mechanism (see pp 33-34 of
[RFC-793]), which is designed to clean up after a system crash.
2. The TWA Hazards
2.1 Introduction
If the connection is immediately reopened after a TWA event, the
new incarnation will be exposed to old duplicate segments (except
for the initial <SYN> segment, which is handled by the 3-way
handshake). There are three possible hazards that result:
H1. Old duplicate data may be accepted erroneously.
H2. The new connection may be de-synchronized, with the two ends
in permanent disagreement on the state. Following the spec
of RFC-793, this desynchronization results in an infinite ACK
Braden [Page 3]
RFC 1337 TCP TIME-WAIT Hazards May 1992
loop. (It might be reasonable to change this aspect of RFC-
793 and kill the connection instead.)
This hazard results from acknowledging something that was not
sent. This may result from an old duplicate ACK or as a
side-effect of hazard H1.
H3. The new connection may die.
A duplicate segment (data or ACK) arriving in SYN-SENT state
may kill the new connection after it has apparently opened
successfully.
Each of these hazards requires that the sequence space of the new
connection overlap to some extent with the sequence space of the
previous incarnation. As noted above, this is only possible for
"fast" or "long" connections. Since these hazards all require the
coincidence of an old duplicate falling into a particular range of
new sequence numbers, they are much less probable than TWA itself.
TWA and the three hazards H1, H2, and H3 have been demonstrated on
a stock Sun OS 4.1.1 TCP running in a simulated environment that
massively duplicates segments. This environment is far more
hazardous than most real TCP's must cope with, and the conditions
were carefully tuned to create the necessary conditions for the
failures. However, these demonstrations are in effect an
existence proof for the hazards.
We now present example scenarios for each of these hazards. Each
scenario is assumed to follow immediately after a TWA event
terminated the previous incarnation of the same connection.
2.2 HAZARD H1: Acceptance of erroneous old duplicate data.
Without the protection of the TIME-WAIT delay, it is possible for
erroneous old duplicate data from the earlier incarnation to be
accepted. Figure 2 shows precisely how this might happen.
Braden [Page 4]
RFC 1337 TCP TIME-WAIT Hazards May 1992
TCP A TCP B
1. ESTABL. --> <SEQ=400><ACK=101><DATA=100><CTL=ACK> --> ESTABL.
2. ESTABL. <-- <SEQ=101><ACK=500><CTL=ACK> <-- ESTABL.
3. (old dupl)...<SEQ=560><ACK=101><DATA=80><CTL=ACK> --> ESTABL.
4. ESTABL. <-- <SEQ=101><ACK=500><CTL=ACK> <-- ESTABL.
5. ESTABL. --> <SEQ=500><ACK=101><DATA=100><CTL=ACK> --> ESTABL.
6. ... <SEQ=101><ACK=640><CTL=ACK> <-- ESTABL.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
7a. ESTABL. --> <SEQ=600><ACK=101><DATA=100><CTL=ACK> --> ESTABL.
8a. ESTABL. <-- <SEQ=101><ACK=640><CTL=ACK> ...
9a. ESTABL. --> <SEQ=700><ACK=101><DATA=100><CTL=ACK> --> ESTABL.
Figure 2: Accepting Erroneous Data
The connection has already been successfully reopened after the
assumed TWA event. Segment 1 is a normal data segment and segment
2 is the corresponding ACK segment. Old duplicate data segment 3
from the earlier incarnation happens to fall within the current
receive window, resulting in a duplicate ACK segment #4. The
erroneous data is queued and "lurks" in the TCP reassembly queue
until data segment 5 overlaps it. At that point, either 80 or 40
bytes of erroneous data is delivered to the user B; the choice
depends upon the particulars of the reassembly algorithm, which
may accept the first or the last duplicate data.
As a result, B sends segment 6, an ACK for sequence = 640, which
is 40 beyond any data sent by A. Assume for the present that this
ACK arrives at A *after* A has sent segment 7a, the next full data
segment. In that case, the ACK segment 8a acknowledges data that
has been sent, and the error goes undetected. Another possible
continuation after segment 6 leads to hazard H3, shown below.
2.3 HAZARD H2: De-synchronized Connection
This hazard may result either as a side effect of H1 or directly
from an old duplicate ACK that happens to be acceptable but
acknowledges something that has not been sent.
Braden [Page 5]
RFC 1337 TCP TIME-WAIT Hazards May 1992
Referring to Figure 2 above, suppose that the ACK generated by the
old duplicate data segment arrived before the next data segment
had been sent. The result is an infinite ACK loop, as shown by
the following alternate continuation of Figure 2.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
7b. ESTABL. <-- <SEQ=101><ACK=640><CTL=ACK> ...
(ACK something not yet
sent => send ACK)
8b. ESTABL. --> <SEQ=600><ACK=101><CTL=ACK> --> ESTABL.
(Below window =>
send ACK)
9b. ESTABL. <-- <SEQ=101><ACK=640><CTL=ACK> <-- ESTABL.
(etc.!)
Figure 3: Infinite ACK loop
2.4 HAZARD H3: Connection Failure
An old duplicate ACK segment may lead to an apparent refusal of
TCP A's next connection attempt, as illustrated in Figure 4. Here
<W=...> indicates the TCP window field SEG.WIND.*
TCP A TCP B
1. CLOSED LISTEN
2. SYN-SENT --> <SEQ=100><CTL=SYN> --> SYN-RCVD
3. ... <SEQ=400><ACK=101><CTL=SYN,ACK><W=800> <-- SYN-RCVD
4. SYN-SENT <-- <SEQ=300><ACK=123><CTL=ACK> ... (old duplicate)
5. SYN-SENT --> <SEQ=123><CTL=RST> --> LISTEN
6. ESTABLISHED <-- <SEQ=400><ACK=101><CTL=SYN,ACK><W=900> ...
7. ESTABLISHED --> <SEQ=101><ACK=401><CTL=ACK> --> LISTEN
8. CLOSED <-- <SEQ=401><CTL=RST> <-- LISTEN
Figure 4: Connection Failure from Old Duplicate
Braden [Page 6]
RFC 1337 TCP TIME-WAIT Hazards May 1992
The key to the failure in Figure 4 is that the RST segment 5 is
acceptable to TCP B in SYN-RECEIVED state, because the sequence
space of the earlier connection that produced this old duplicate
overlaps the new connection space. Thus, <SEQ=123> in segment #5
falls within TCP B's receive window [101,900). In experiments,
this failure mode was very easy to demonstrate. (Kurt Matthys has
pointed out that this scenario is time-dependent: if TCP A should
timeout and retransmit the initial SYN after segment 5 arrives and
before segment 6, then the open will complete successfully.)
3. Fixes for TWA Hazards
We discuss three possible fixes to TCP to avoid these hazards.
(F1) Ignore RST segments in TIME-WAIT state.
If the 2 minute MSL is enforced, this fix avoids all three
hazards.
This is the simplest fix. One could also argue that it is
formally the correct thing to do; since allowing time for old
duplicate segments to die is one of TIME-WAIT state's functions,
the state should not be truncated by a RST segment.
(F2) Use PAWS to avoid the hazards.
Suppose that the TCP ignores RST segments in TIME-WAIT state,
but only long enough to guarantee that the timestamp clocks on
both ends have ticked. Then the PAWS mechanism [RFC-1323] will
prevent old duplicate data segments from interfering with the
new incarnation, eliminating hazard H1. For reasons explained
below, however, it may not eliminate all old duplicate ACK
segments, so hazards H2 and H3 will still exist.
In the language of the TCP Extensions RFC [RFC-1323]:
When processing a RST bit in TIME-WAIT state:
If (Snd.TS.OK is off) or (Time.in.TW.state() >= W)
then enter the CLOSED state, delete the TCB,
drop the RST segment, and return.
else simply drop the RST segment and return.
Here "Time.in.TW.state()" is a function returning the elapsed
time since TIME-WAIT state was entered, and W is a constant that
is at least twice the longest possible period for timestamp
clocks, i.e., W = 2 secs [RFC-1323].
Braden [Page 7]
RFC 1337 TCP TIME-WAIT Hazards May 1992
This assumes that the timestamp clock at each end continues to
advance at a constant rate whether or not there are any open
connections. We do not have to consider what happens across a
system crash (e.g., the timestamp clock may jump randomly),
because of the assumed Quiet Time at system startup.
Once this change is in place, the initial timestamps that occur
on the SYN and {SYN,ACK} segments reopening the connection will
be larger than any timestamp on a segment from earlier
incarnations. As a result, the PAWS mechanism operating in the
new connection incarnation will avoid the H1 hazard, i.e.,
acceptance of old duplicate data.
The effectiveness of fix (F2) in preventing acceptance of old
duplicate data segments, i.e., hazard H1, has been demonstrated
in the Sun OS TCP mentioned earlier. Unfortunately, these tests
revealed a somewhat surprising fact: old duplicate ACKs from
the earlier incarnation can still slip past PAWS, so that (F2)
will not prevent failures H2 or H3. What happens is that TIME-
WAIT state effectively regenerates the timestamp of an old
duplicate ACK. That is, when an old duplicate arrives in TIME-
WAIT state, an extended TCP will send out its own ACK with a
timestamp option containing its CURRENT timestamp clock value.
If this happens immediately before the TWA mechanism kills
TIME-WAIT state, the result will be a "new old duplicate"
segment with a current timestamp that may pass the PAWS test on
the reopened connection.
Whether H2 and H3 are critical depends upon how often they
happen and what assumptions the applications make about TCP
semantics. In the case of the H3 hazard, merely trying the open
again is likely to succeed. Furthermore, many production TCPs
have (despite the advice of the researchers who developed TCP)
incorporated a "keep-alive" mechanism, which may kill
connections unnecessarily. The frequency of occurrence of H2
and H3 may well be much lower than keep-alive failures or
transient internet routing failures.
(F3) Use 64-bit Sequence Numbers
O'Malley and Peterson [RFC-1263] have suggested expansion of the
TCP sequence space to 64 bits as an alternative to PAWS for
avoiding the hazard of wrapped sequence numbers within the same
incarnation. It is worthwhile to inquire whether 64-bit
sequence numbers could be used to avoid the TWA hazards as well.
Using 64 bit sequence numbers would not prevent TWA - the early
termination of TIME-WAIT state. However, it appears that a
Braden [Page 8]
RFC 1337 TCP TIME-WAIT Hazards May 1992
combination of 64-bit sequence numbers with an appropriate
modification of the TCP parameters could defeat all of the TWA
hazards H1, H2, and H3. The basis for this is explained in an
appendix to this memo. In summary, it could be arranged that
the same sequence space would be reused only after a very long
period of time, so every connection would be "slow" and "short".
4. Conclusions
Of the three fixes described in the previous section, fix (F1),
ignoring RST segments in TIME-WAIT state, seems like the best short-
term solution. It is certainly the simplest. It would be very
desirable to do an extended test of this change in a production
environment, to ensure there is no unexpected bad effect of ignoring
RSTs in TIME-WAIT state.
Fix (F2) is more complex and is at best a partial fix. (F3), using
64-bit sequence numbers, would be a significant change in the
protocol, and its implications need to be thoroughly understood.
(F3) may turn out to be a long-term fix for the hazards discussed in
this note.
APPENDIX: Using 64-bit Sequence Numbers
This appendix provides a justification of our statement that 64-bit
sequence numbers could prevent the TWA hazards.
The theoretical ISN calculation used by TCP is:
ISN = (R*T) mod 2**n.
where T is the real time in seconds (from an arbitrary origin, fixed
when the system is started), R is a constant, currently 250 KBps, and
n = 32 is the size of the sequence number field.
The limitations of current TCP are established by n, R, and the
maximum segment lifetime MSL = 4 minutes. The shortest time Twrap to
wrap the sequence space is:
Twrap = (2**n)/r
where r is the maximum transfer rate. To avoid old duplicate
segments in the same connection, we require that Twrap > MSL (in
practice, we need Twrap >> MSL).
Braden [Page 9]
RFC 1337 TCP TIME-WAIT Hazards May 1992
The clock-driven ISN numbers wrap in time TwrapISN:
TwrapISN = (2**n)/R
For current TCP, TwrapISN = 4.55 hours.
The cases for old duplicates from previous connections can be divided
into four regions along two dimensions:
* Slow vs. fast connections, corresponding to r < R or r >= R.
* Short vs. long connections, corresponding to duration E <
TwrapISN or E >= TwrapISN.
On short slow connections, the clock-driven ISN selection rejects old
duplicates. For all other cases, the TIME-WAIT delay of 2*MSL is
required so old duplicates can expire before they infect a new
incarnation. This is discussed in detail in the Appendix to [RFC-
1185].
With this background, we can consider the effect of increasing n to
64. We would like to increase both R and TwrapISN far enough that
all connections will be short and slow, i.e., so that the clock-
driven ISN selection will reject all old duplicates. Put another
way, we want every connection to have a unique chunk of the
sequence space. For this purpose, we need R larger than the maximum
foreseeable rate r, and TwrapISN greater than the longest foreseeable
connection duration E.
In fact, this appears feasible with n = 64 bits. Suppose that we use
R = 2**33 Bps; this is approximately 8 gigabytes per second, a
reasonable upper limit on throughput of a single TCP connection.
Then TwrapISN = 68 years, a reasonable upper limit on TCP connection
duration. Note that this particular choice of R corresponds to
incrementing the ISN by 2**32 every 0.5 seconds, as would happen with
the Berkeley BSD implementation of TCP. Then the low-order 32 bits
of a 64-bit ISN would always be exactly zero.
REFERENCES
[RFC-793] Postel, J., "Transmission Control Protocol", RFC-793,
USC/Information Sciences Institute, September 1981.
[RFC-1185] Jacobson, V., Braden, R., and Zhang, L., "TCP
Extension for High-Speed Paths", RFC-1185, Lawrence Berkeley Labs,
USC/Information Sciences Institute, and Xerox Palo Alto Research
Center, October 1990.
Braden [Page 10]
RFC 1337 TCP TIME-WAIT Hazards May 1992
[RFC-1263] O'Malley, S. and L. Peterson, "TCP Extensions
Considered Harmful", RFC-1263, University of Arizona, October
1991.
[RFC-1323] Jacobson, V., Braden, R. and D. Borman "TCP Extensions
for High Performance", RFC-1323, Lawrence Berkeley Labs,
USC/Information Sciences Institute, and Cray Research, May 1992.
Security Considerations
Security issues are not discussed in this memo.
Author's Address:
Bob Braden
University of Southern California
Information Sciences Institute
4676 Admiralty Way
Marina del Rey, CA 90292
Phone: (213) 822-1511
EMail: Braden@ISI.EDU
Braden [Page 11]

View File

@ -0,0 +1,619 @@
Network Working Group K. Sollins
Request For Comments: 1350 MIT
STD: 33 July 1992
Obsoletes: RFC 783
THE TFTP PROTOCOL (REVISION 2)
Status of this Memo
This RFC specifies an IAB standards track protocol for the Internet
community, and requests discussion and suggestions for improvements.
Please refer to the current edition of the "IAB Official Protocol
Standards" for the standardization state and status of this protocol.
Distribution of this memo is unlimited.
Summary
TFTP is a very simple protocol used to transfer files. It is from
this that its name comes, Trivial File Transfer Protocol or TFTP.
Each nonterminal packet is acknowledged separately. This document
describes the protocol and its types of packets. The document also
explains the reasons behind some of the design decisions.
Acknowledgements
The protocol was originally designed by Noel Chiappa, and was
redesigned by him, Bob Baldwin and Dave Clark, with comments from
Steve Szymanski. The current revision of the document includes
modifications stemming from discussions with and suggestions from
Larry Allen, Noel Chiappa, Dave Clark, Geoff Cooper, Mike Greenwald,
Liza Martin, David Reed, Craig Milo Rogers (of USC-ISI), Kathy
Yellick, and the author. The acknowledgement and retransmission
scheme was inspired by TCP, and the error mechanism was suggested by
PARC's EFTP abort message.
The May, 1992 revision to fix the "Sorcerer's Apprentice" protocol
bug [4] and other minor document problems was done by Noel Chiappa.
This research was supported by the Advanced Research Projects Agency
of the Department of Defense and was monitored by the Office of Naval
Research under contract number N00014-75-C-0661.
1. Purpose
TFTP is a simple protocol to transfer files, and therefore was named
the Trivial File Transfer Protocol or TFTP. It has been implemented
on top of the Internet User Datagram protocol (UDP or Datagram) [2]
Sollins [Page 1]
RFC 1350 TFTP Revision 2 July 1992
so it may be used to move files between machines on different
networks implementing UDP. (This should not exclude the possibility
of implementing TFTP on top of other datagram protocols.) It is
designed to be small and easy to implement. Therefore, it lacks most
of the features of a regular FTP. The only thing it can do is read
and write files (or mail) from/to a remote server. It cannot list
directories, and currently has no provisions for user authentication.
In common with other Internet protocols, it passes 8 bit bytes of
data.
Three modes of transfer are currently supported: netascii (This is
ascii as defined in "USA Standard Code for Information Interchange"
[1] with the modifications specified in "Telnet Protocol
Specification" [3].) Note that it is 8 bit ascii. The term
"netascii" will be used throughout this document to mean this
particular version of ascii.); octet (This replaces the "binary" mode
of previous versions of this document.) raw 8 bit bytes; mail,
netascii characters sent to a user rather than a file. (The mail
mode is obsolete and should not be implemented or used.) Additional
modes can be defined by pairs of cooperating hosts.
Reference [4] (section 4.2) should be consulted for further valuable
directives and suggestions on TFTP.
2. Overview of the Protocol
Any transfer begins with a request to read or write a file, which
also serves to request a connection. If the server grants the
request, the connection is opened and the file is sent in fixed
length blocks of 512 bytes. Each data packet contains one block of
data, and must be acknowledged by an acknowledgment packet before the
next packet can be sent. A data packet of less than 512 bytes
signals termination of a transfer. If a packet gets lost in the
network, the intended recipient will timeout and may retransmit his
last packet (which may be data or an acknowledgment), thus causing
the sender of the lost packet to retransmit that lost packet. The
sender has to keep just one packet on hand for retransmission, since
the lock step acknowledgment guarantees that all older packets have
been received. Notice that both machines involved in a transfer are
considered senders and receivers. One sends data and receives
acknowledgments, the other sends acknowledgments and receives data.
Most errors cause termination of the connection. An error is
signalled by sending an error packet. This packet is not
acknowledged, and not retransmitted (i.e., a TFTP server or user may
terminate after sending an error message), so the other end of the
connection may not get it. Therefore timeouts are used to detect
such a termination when the error packet has been lost. Errors are
Sollins [Page 2]
RFC 1350 TFTP Revision 2 July 1992
caused by three types of events: not being able to satisfy the
request (e.g., file not found, access violation, or no such user),
receiving a packet which cannot be explained by a delay or
duplication in the network (e.g., an incorrectly formed packet), and
losing access to a necessary resource (e.g., disk full or access
denied during a transfer).
TFTP recognizes only one error condition that does not cause
termination, the source port of a received packet being incorrect.
In this case, an error packet is sent to the originating host.
This protocol is very restrictive, in order to simplify
implementation. For example, the fixed length blocks make allocation
straight forward, and the lock step acknowledgement provides flow
control and eliminates the need to reorder incoming data packets.
3. Relation to other Protocols
As mentioned TFTP is designed to be implemented on top of the
Datagram protocol (UDP). Since Datagram is implemented on the
Internet protocol, packets will have an Internet header, a Datagram
header, and a TFTP header. Additionally, the packets may have a
header (LNI, ARPA header, etc.) to allow them through the local
transport medium. As shown in Figure 3-1, the order of the contents
of a packet will be: local medium header, if used, Internet header,
Datagram header, TFTP header, followed by the remainder of the TFTP
packet. (This may or may not be data depending on the type of packet
as specified in the TFTP header.) TFTP does not specify any of the
values in the Internet header. On the other hand, the source and
destination port fields of the Datagram header (its format is given
in the appendix) are used by TFTP and the length field reflects the
size of the TFTP packet. The transfer identifiers (TID's) used by
TFTP are passed to the Datagram layer to be used as ports; therefore
they must be between 0 and 65,535. The initialization of TID's is
discussed in the section on initial connection protocol.
The TFTP header consists of a 2 byte opcode field which indicates
the packet's type (e.g., DATA, ERROR, etc.) These opcodes and the
formats of the various types of packets are discussed further in the
section on TFTP packets.
Sollins [Page 3]
RFC 1350 TFTP Revision 2 July 1992
---------------------------------------------------
| Local Medium | Internet | Datagram | TFTP |
---------------------------------------------------
Figure 3-1: Order of Headers
4. Initial Connection Protocol
A transfer is established by sending a request (WRQ to write onto a
foreign file system, or RRQ to read from it), and receiving a
positive reply, an acknowledgment packet for write, or the first data
packet for read. In general an acknowledgment packet will contain
the block number of the data packet being acknowledged. Each data
packet has associated with it a block number; block numbers are
consecutive and begin with one. Since the positive response to a
write request is an acknowledgment packet, in this special case the
block number will be zero. (Normally, since an acknowledgment packet
is acknowledging a data packet, the acknowledgment packet will
contain the block number of the data packet being acknowledged.) If
the reply is an error packet, then the request has been denied.
In order to create a connection, each end of the connection chooses a
TID for itself, to be used for the duration of that connection. The
TID's chosen for a connection should be randomly chosen, so that the
probability that the same number is chosen twice in immediate
succession is very low. Every packet has associated with it the two
TID's of the ends of the connection, the source TID and the
destination TID. These TID's are handed to the supporting UDP (or
other datagram protocol) as the source and destination ports. A
requesting host chooses its source TID as described above, and sends
its initial request to the known TID 69 decimal (105 octal) on the
serving host. The response to the request, under normal operation,
uses a TID chosen by the server as its source TID and the TID chosen
for the previous message by the requestor as its destination TID.
The two chosen TID's are then used for the remainder of the transfer.
As an example, the following shows the steps used to establish a
connection to write a file. Note that WRQ, ACK, and DATA are the
names of the write request, acknowledgment, and data types of packets
respectively. The appendix contains a similar example for reading a
file.
Sollins [Page 4]
RFC 1350 TFTP Revision 2 July 1992
1. Host A sends a "WRQ" to host B with source= A's TID,
destination= 69.
2. Host B sends an "ACK" (with block number= 0) to host A with
source= B's TID, destination= A's TID.
At this point the connection has been established and the first data
packet can be sent by Host A with a sequence number of 1. In the
next step, and in all succeeding steps, the hosts should make sure
that the source TID matches the value that was agreed on in steps 1
and 2. If a source TID does not match, the packet should be
discarded as erroneously sent from somewhere else. An error packet
should be sent to the source of the incorrect packet, while not
disturbing the transfer. This can be done only if the TFTP in fact
receives a packet with an incorrect TID. If the supporting protocols
do not allow it, this particular error condition will not arise.
The following example demonstrates a correct operation of the
protocol in which the above situation can occur. Host A sends a
request to host B. Somewhere in the network, the request packet is
duplicated, and as a result two acknowledgments are returned to host
A, with different TID's chosen on host B in response to the two
requests. When the first response arrives, host A continues the
connection. When the second response to the request arrives, it
should be rejected, but there is no reason to terminate the first
connection. Therefore, if different TID's are chosen for the two
connections on host B and host A checks the source TID's of the
messages it receives, the first connection can be maintained while
the second is rejected by returning an error packet.
5. TFTP Packets
TFTP supports five types of packets, all of which have been mentioned
above:
opcode operation
1 Read request (RRQ)
2 Write request (WRQ)
3 Data (DATA)
4 Acknowledgment (ACK)
5 Error (ERROR)
The TFTP header of a packet contains the opcode associated with
that packet.
Sollins [Page 5]
RFC 1350 TFTP Revision 2 July 1992
2 bytes string 1 byte string 1 byte
------------------------------------------------
| Opcode | Filename | 0 | Mode | 0 |
------------------------------------------------
Figure 5-1: RRQ/WRQ packet
RRQ and WRQ packets (opcodes 1 and 2 respectively) have the format
shown in Figure 5-1. The file name is a sequence of bytes in
netascii terminated by a zero byte. The mode field contains the
string "netascii", "octet", or "mail" (or any combination of upper
and lower case, such as "NETASCII", "NetAscii", etc.) in netascii
indicating the three modes defined in the protocol. A host which
receives netascii mode data must translate the data to its own
format. Octet mode is used to transfer a file that is in the 8-bit
format of the machine from which the file is being transferred. It
is assumed that each type of machine has a single 8-bit format that
is more common, and that that format is chosen. For example, on a
DEC-20, a 36 bit machine, this is four 8-bit bytes to a word with
four bits of breakage. If a host receives an octet file and then
returns it, the returned file must be identical to the original.
Mail mode uses the name of a mail recipient in place of a file and
must begin with a WRQ. Otherwise it is identical to netascii mode.
The mail recipient string should be of the form "username" or
"username@hostname". If the second form is used, it allows the
option of mail forwarding by a relay computer.
The discussion above assumes that both the sender and recipient are
operating in the same mode, but there is no reason that this has to
be the case. For example, one might build a storage server. There
is no reason that such a machine needs to translate netascii into its
own form of text. Rather, the sender might send files in netascii,
but the storage server might simply store them without translation in
8-bit format. Another such situation is a problem that currently
exists on DEC-20 systems. Neither netascii nor octet accesses all
the bits in a word. One might create a special mode for such a
machine which read all the bits in a word, but in which the receiver
stored the information in 8-bit format. When such a file is
retrieved from the storage site, it must be restored to its original
form to be useful, so the reverse mode must also be implemented. The
user site will have to remember some information to achieve this. In
both of these examples, the request packets would specify octet mode
to the foreign host, but the local host would be in some other mode.
No such machine or application specific modes have been specified in
TFTP, but one would be compatible with this specification.
It is also possible to define other modes for cooperating pairs of
Sollins [Page 6]
RFC 1350 TFTP Revision 2 July 1992
hosts, although this must be done with care. There is no requirement
that any other hosts implement these. There is no central authority
that will define these modes or assign them names.
2 bytes 2 bytes n bytes
----------------------------------
| Opcode | Block # | Data |
----------------------------------
Figure 5-2: DATA packet
Data is actually transferred in DATA packets depicted in Figure 5-2.
DATA packets (opcode = 3) have a block number and data field. The
block numbers on data packets begin with one and increase by one for
each new block of data. This restriction allows the program to use a
single number to discriminate between new packets and duplicates.
The data field is from zero to 512 bytes long. If it is 512 bytes
long, the block is not the last block of data; if it is from zero to
511 bytes long, it signals the end of the transfer. (See the section
on Normal Termination for details.)
All packets other than duplicate ACK's and those used for
termination are acknowledged unless a timeout occurs [4]. Sending a
DATA packet is an acknowledgment for the first ACK packet of the
previous DATA packet. The WRQ and DATA packets are acknowledged by
ACK or ERROR packets, while RRQ
2 bytes 2 bytes
---------------------
| Opcode | Block # |
---------------------
Figure 5-3: ACK packet
and ACK packets are acknowledged by DATA or ERROR packets. Figure
5-3 depicts an ACK packet; the opcode is 4. The block number in
an ACK echoes the block number of the DATA packet being
acknowledged. A WRQ is acknowledged with an ACK packet having a
block number of zero.
Sollins [Page 7]
RFC 1350 TFTP Revision 2 July 1992
2 bytes 2 bytes string 1 byte
-----------------------------------------
| Opcode | ErrorCode | ErrMsg | 0 |
-----------------------------------------
Figure 5-4: ERROR packet
An ERROR packet (opcode 5) takes the form depicted in Figure 5-4. An
ERROR packet can be the acknowledgment of any other type of packet.
The error code is an integer indicating the nature of the error. A
table of values and meanings is given in the appendix. (Note that
several error codes have been added to this version of this
document.) The error message is intended for human consumption, and
should be in netascii. Like all other strings, it is terminated with
a zero byte.
6. Normal Termination
The end of a transfer is marked by a DATA packet that contains
between 0 and 511 bytes of data (i.e., Datagram length < 516). This
packet is acknowledged by an ACK packet like all other DATA packets.
The host acknowledging the final DATA packet may terminate its side
of the connection on sending the final ACK. On the other hand,
dallying is encouraged. This means that the host sending the final
ACK will wait for a while before terminating in order to retransmit
the final ACK if it has been lost. The acknowledger will know that
the ACK has been lost if it receives the final DATA packet again.
The host sending the last DATA must retransmit it until the packet is
acknowledged or the sending host times out. If the response is an
ACK, the transmission was completed successfully. If the sender of
the data times out and is not prepared to retransmit any more, the
transfer may still have been completed successfully, after which the
acknowledger or network may have experienced a problem. It is also
possible in this case that the transfer was unsuccessful. In any
case, the connection has been closed.
7. Premature Termination
If a request can not be granted, or some error occurs during the
transfer, then an ERROR packet (opcode 5) is sent. This is only a
courtesy since it will not be retransmitted or acknowledged, so it
may never be received. Timeouts must also be used to detect errors.
Sollins [Page 8]
RFC 1350 TFTP Revision 2 July 1992
I. Appendix
Order of Headers
2 bytes
----------------------------------------------------------
| Local Medium | Internet | Datagram | TFTP Opcode |
----------------------------------------------------------
TFTP Formats
Type Op # Format without header
2 bytes string 1 byte string 1 byte
-----------------------------------------------
RRQ/ | 01/02 | Filename | 0 | Mode | 0 |
WRQ -----------------------------------------------
2 bytes 2 bytes n bytes
---------------------------------
DATA | 03 | Block # | Data |
---------------------------------
2 bytes 2 bytes
-------------------
ACK | 04 | Block # |
--------------------
2 bytes 2 bytes string 1 byte
----------------------------------------
ERROR | 05 | ErrorCode | ErrMsg | 0 |
----------------------------------------
Initial Connection Protocol for reading a file
1. Host A sends a "RRQ" to host B with source= A's TID,
destination= 69.
2. Host B sends a "DATA" (with block number= 1) to host A with
source= B's TID, destination= A's TID.
Sollins [Page 9]
RFC 1350 TFTP Revision 2 July 1992
Error Codes
Value Meaning
0 Not defined, see error message (if any).
1 File not found.
2 Access violation.
3 Disk full or allocation exceeded.
4 Illegal TFTP operation.
5 Unknown transfer ID.
6 File already exists.
7 No such user.
Internet User Datagram Header [2]
(This has been included only for convenience. TFTP need not be
implemented on top of the Internet User Datagram Protocol.)
Format
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Source Port | Destination Port |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Length | Checksum |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Values of Fields
Source Port Picked by originator of packet.
Dest. Port Picked by destination machine (69 for RRQ or WRQ).
Length Number of bytes in UDP packet, including UDP header.
Checksum Reference 2 describes rules for computing checksum.
(The implementor of this should be sure that the
correct algorithm is used here.)
Field contains zero if unused.
Note: TFTP passes transfer identifiers (TID's) to the Internet User
Datagram protocol to be used as the source and destination ports.
Sollins [Page 10]
RFC 1350 TFTP Revision 2 July 1992
References
[1] USA Standard Code for Information Interchange, USASI X3.4-1968.
[2] Postel, J., "User Datagram Protocol," RFC 768, USC/Information
Sciences Institute, 28 August 1980.
[3] Postel, J., "Telnet Protocol Specification," RFC 764,
USC/Information Sciences Institute, June, 1980.
[4] Braden, R., Editor, "Requirements for Internet Hosts --
Application and Support", RFC 1123, USC/Information Sciences
Institute, October 1989.
Security Considerations
Since TFTP includes no login or access control mechanisms, care must
be taken in the rights granted to a TFTP server process so as not to
violate the security of the server hosts file system. TFTP is often
installed with controls such that only files that have public read
access are available via TFTP and writing files via TFTP is
disallowed.
Author's Address
Karen R. Sollins
Massachusetts Institute of Technology
Laboratory for Computer Science
545 Technology Square
Cambridge, MA 02139-1986
Phone: (617) 253-6006
EMail: SOLLINS@LCS.MIT.EDU
Sollins [Page 11]

File diff suppressed because it is too large Load Diff

10755
kernel/picotcp/RFC/rfc1470.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,339 @@
Network Working Group A. Rijsinghani, Editor
Request for Comments: 1624 Digital Equipment Corporation
Updates: 1141 May 1994
Category: Informational
Computation of the Internet Checksum
via Incremental Update
Status of this Memo
This memo provides information for the Internet community. This memo
does not specify an Internet standard of any kind. Distribution of
this memo is unlimited.
Abstract
This memo describes an updated technique for incremental computation
of the standard Internet checksum. It updates the method described
in RFC 1141.
Table of Contents
1. Introduction .......................................... 1
2. Notation and Equations ................................ 2
3. Discussion ............................................ 2
4. Examples .............................................. 3
5. Checksum verification by end systems .................. 4
6. Historical Note ....................................... 4
7. Acknowledgments ....................................... 5
8. Security Considerations ............................... 5
9. Conclusions ........................................... 5
10. Author's Address ..................................... 5
11. References ........................................... 6
1. Introduction
Incremental checksum update is useful in speeding up several
types of operations routinely performed on IP packets, such as
TTL update, IP fragmentation, and source route update.
RFC 1071, on pages 4 and 5, describes a procedure to
incrementally update the standard Internet checksum. The
relevant discussion, though comprehensive, was not complete.
Therefore, RFC 1141 was published to replace this description
on Incremental Update. In particular, RFC 1141 provides a
more detailed exposure to the procedure described in RFC 1071.
However, it computes a result for certain cases that differs
Rijsinghani [Page 1]
RFC 1624 Incremental Internet Checksum May 1994
from the one obtained from scratch (one's complement of one's
complement sum of the original fields).
For the sake of completeness, this memo briefly highlights key
points from RFCs 1071 and 1141. Based on these discussions,
an updated procedure to incrementally compute the standard
Internet checksum is developed and presented.
2. Notation and Equations
Given the following notation:
HC - old checksum in header
C - one's complement sum of old header
HC' - new checksum in header
C' - one's complement sum of new header
m - old value of a 16-bit field
m' - new value of a 16-bit field
RFC 1071 states that C' is:
C' = C + (-m) + m' -- [Eqn. 1]
= C + (m' - m)
As RFC 1141 points out, the equation above is not useful for direct
use in incremental updates since C and C' do not refer to the actual
checksum stored in the header. In addition, it is pointed out that
RFC 1071 did not specify that all arithmetic must be performed using
one's complement arithmetic.
Finally, complementing the above equation to get the actual checksum,
RFC 1141 presents the following:
HC' = ~(C + (-m) + m')
= HC + (m - m')
= HC + m + ~m' -- [Eqn. 2]
3. Discussion
Although this equation appears to work, there are boundary conditions
under which it produces a result which differs from the one obtained
by checksum computation from scratch. This is due to the way zero is
handled in one's complement arithmetic.
In one's complement, there are two representations of zero: the all
zero and the all one bit values, often referred to as +0 and -0.
One's complement addition of non-zero inputs can produce -0 as a
result, but never +0. Since there is guaranteed to be at least one
Rijsinghani [Page 2]
RFC 1624 Incremental Internet Checksum May 1994
non-zero field in the IP header, and the checksum field in the
protocol header is the complement of the sum, the checksum field can
never contain ~(+0), which is -0 (0xFFFF). It can, however, contain
~(-0), which is +0 (0x0000).
RFC 1141 yields an updated header checksum of -0 when it should be
+0. This is because it assumed that one's complement has a
distributive property, which does not hold when the result is 0 (see
derivation of [Eqn. 2]).
The problem is avoided by not assuming this property. The correct
equation is given below:
HC' = ~(C + (-m) + m') -- [Eqn. 3]
= ~(~HC + ~m + m')
4. Examples
Consider an IP packet header in which a 16-bit field m = 0x5555
changes to m' = 0x3285. Also, the one's complement sum of all other
header octets is 0xCD7A.
Then the header checksum would be:
HC = ~(0xCD7A + 0x5555)
= ~0x22D0
= 0xDD2F
The new checksum via recomputation is:
HC' = ~(0xCD7A + 0x3285)
= ~0xFFFF
= 0x0000
Using [Eqn. 2], as specified in RFC 1141, the new checksum is
computed as:
HC' = HC + m + ~m'
= 0xDD2F + 0x5555 + ~0x3285
= 0xFFFF
which does not match that computed from scratch, and moreover can
never obtain for an IP header.
Rijsinghani [Page 3]
RFC 1624 Incremental Internet Checksum May 1994
Applying [Eqn. 3] to the example above, we get the correct result:
HC' = ~(C + (-m) + m')
= ~(0x22D0 + ~0x5555 + 0x3285)
= ~0xFFFF
= 0x0000
5. Checksum verification by end systems
If an end system verifies the checksum by including the checksum
field itself in the one's complement sum and then comparing the
result against -0, as recommended by RFC 1071, it does not matter if
an intermediate system generated a -0 instead of +0 due to the RFC
1141 property described here. In the example above:
0xCD7A + 0x3285 + 0xFFFF = 0xFFFF
0xCD7A + 0x3285 + 0x0000 = 0xFFFF
However, implementations exist which verify the checksum by computing
it and comparing against the header checksum field.
It is recommended that intermediate systems compute incremental
checksum using the method described in this document, and end systems
verify checksum as per the method described in RFC 1071.
The method in [Eqn. 3] is slightly more expensive than the one in RFC
1141. If this is a concern, the two additional instructions can be
eliminated by subtracting complements with borrow [see Sec. 7]. This
would result in the following equation:
HC' = HC - ~m - m' -- [Eqn. 4]
In the example shown above,
HC' = HC - ~m - m'
= 0xDD2F - ~0x5555 - 0x3285
= 0x0000
6. Historical Note
A historical aside: the fact that standard one's complement
arithmetic produces negative zero results is one of its main
drawbacks; it makes for difficulty in interpretation. In the CDC
6000 series computers [4], this problem was avoided by using
subtraction as the primitive in one's complement arithmetic (i.e.,
addition is subtraction of the complement).
Rijsinghani [Page 4]
RFC 1624 Incremental Internet Checksum May 1994
7. Acknowledgments
The contribution of the following individuals to the work that led to
this document is acknowledged:
Manu Kaycee - Ascom Timeplex, Incorporated
Paul Koning - Digital Equipment Corporation
Tracy Mallory - 3Com Corporation
Krishna Narayanaswamy - Digital Equipment Corporation
Atul Pandya - Digital Equipment Corporation
The failure condition was uncovered as a result of IP testing on a
product which implemented the RFC 1141 algorithm. It was analyzed,
and the updated algorithm devised. This algorithm was also verified
using simulation. It was also shown that the failure condition
disappears if the checksum verification is done as per RFC 1071.
8. Security Considerations
Security issues are not discussed in this memo.
9. Conclusions
It is recommended that either [Eqn. 3] or [Eqn. 4] be the
implementation technique used for incremental update of the standard
Internet checksum.
10. Author's Address
Anil Rijsinghani
Digital Equipment Corporation
550 King St
Littleton, MA 01460
Phone: (508) 486-6786
EMail: anil@levers.enet.dec.com
Rijsinghani [Page 5]
RFC 1624 Incremental Internet Checksum May 1994
11. References
[1] Postel, J., "Internet Protocol - DARPA Internet Program Protocol
Specification", STD 5, RFC 791, DARPA, September 1981.
[2] Braden, R., Borman, D., and C. Partridge, "Computing the Internet
Checksum", RFC 1071, ISI, Cray Research, BBN Laboratories,
September 1988.
[3] Mallory, T., and A. Kullberg, "Incremental Updating of the
Internet Checksum", RFC 1141, BBN Communications, January 1990.
[4] Thornton, J., "Design of a Computer -- the Control
Data 6600", Scott, Foresman and Company, 1970.
Rijsinghani [Page 6]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,339 @@
Network Working Group S. Cobb
Request for Comments: 1877 Microsoft
Category: Informational December 1995
PPP Internet Protocol Control Protocol Extensions for
Name Server Addresses
Status of this Memo
This memo provides information for the Internet community. This memo
does not specify an Internet standard of any kind. Distribution of
this memo is unlimited.
Abstract
The Point-to-Point Protocol (PPP) [1] provides a standard method for
transporting multi-protocol datagrams over point-to-point links. PPP
defines an extensible Link Control Protocol and a family of Network
Control Protocols (NCPs) for establishing and configuring different
network-layer protocols.
This document extends the NCP for establishing and configuring the
Internet Protocol over PPP [2], defining the negotiation of primary
and secondary Domain Name System (DNS) [4] and NetBIOS Name Server
(NBNS) [3] addresses.
Table of Contents
1. Additional IPCP Configuration options ................. 1
1.1 Primary DNS Server Address .................... 2
1.2 Primary NBNS Server Address ................... 3
1.3 Secondary DNS Server Address .................. 4
1.4 Secondary NBNS Server Address ................. 5
REFERENCES ................................................... 6
SECURITY CONSIDERATIONS ...................................... 6
CHAIR'S ADDRESS .............................................. 6
AUTHOR'S ADDRESS ............................................. 6
1. Additional IPCP Configuration Options
The four name server address configuration options, 129 to 132,
provide a method of obtaining the addresses of Domain Name System
(DNS) servers and NetBIOS Name Server (NBNS) nodes on the remote
network.
Cobb Informational [Page 1]
RFC 1877 PPP IPCP Extensions December 1995
Primary and secondary addresses are negotiated independently. They
serve identical purposes, except that when both are present an
attempt SHOULD be made to resolve names using the primary address
before using the secondary address.
For implementational convenience, these options are designed to be
identical in format and behavior to option 3 (IP-Address) which is
already present in most IPCP implementations.
Since the usefulness of name server address information is dependent
on the topology of the remote network and local peer's application,
it is suggested that these options not be included in the list of
"IPCP Recommended Options".
1.1. Primary DNS Server Address
Description
This Configuration Option defines a method for negotiating with
the remote peer the address of the primary DNS server to be used
on the local end of the link. If local peer requests an invalid
server address (which it will typically do intentionally) the
remote peer specifies the address by NAKing this option, and
returning the IP address of a valid DNS server.
By default, no primary DNS address is provided.
A summary of the Primary DNS Address Configuration Option format is
shown below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | Primary-DNS-Address
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Primary-DNS-Address (cont) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Type
129
Length
6
Cobb Informational [Page 2]
RFC 1877 PPP IPCP Extensions December 1995
Primary-DNS-Address
The four octet Primary-DNS-Address is the address of the primary
DNS server to be used by the local peer. If all four octets are
set to zero, it indicates an explicit request that the peer
provide the address information in a Config-Nak packet.
Default
No address is provided.
1.2. Primary NBNS Server Address
Description
This Configuration Option defines a method for negotiating with
the remote peer the address of the primary NBNS server to be used
on the local end of the link. If local peer requests an invalid
server address (which it will typically do intentionally) the
remote peer specifies the address by NAKing this option, and
returning the IP address of a valid NBNS server.
By default, no primary NBNS address is provided.
A summary of the Primary NBNS Address Configuration Option format is
shown below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | Primary-NBNS-Address
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Primary-NBNS-Address (cont) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Type
130
Length
6
Primary-NBNS-Address
The four octet Primary-NBNS-Address is the address of the primary
NBNS server to be used by the local peer. If all four octets are
set to zero, it indicates an explicit request that the peer
Cobb Informational [Page 3]
RFC 1877 PPP IPCP Extensions December 1995
provide the address information in a Config-Nak packet.
Default
No address is provided.
1.3. Secondary DNS Server Address
Description
This Configuration Option defines a method for negotiating with
the remote peer the address of the secondary DNS server to be used
on the local end of the link. If local peer requests an invalid
server address (which it will typically do intentionally) the
remote peer specifies the address by NAKing this option, and
returning the IP address of a valid DNS server.
By default, no secondary DNS address is provided.
A summary of the Secondary DNS Address Configuration Option format is
shown below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | Secondary-DNS-Address
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Secondary-DNS-Address (cont) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Type
131
Length
6
Secondary-DNS-Address
The four octet Secondary-DNS-Address is the address of the secondary
DNS server to be used by the local peer. If all four octets are
set to zero, it indicates an explicit request that the peer
provide the address information in a Config-Nak packet.
Default
No address is provided.
Cobb Informational [Page 4]
RFC 1877 PPP IPCP Extensions December 1995
1.4. Secondary NBNS Server Address
Description
This Configuration Option defines a method for negotiating with
the remote peer the address of the secondary NBNS server to be
used on the local end of the link. If local peer requests an
invalid server address (which it will typically do intentionally)
the remote peer specifies the address by NAKing this option, and
returning the IP address of a valid NBNS server.
By default, no secondary NBNS address is provided.
A summary of the Secondary NBNS Address Configuration Option format
is shown below. The fields are transmitted from left to right.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | Secondary-NBNS-Address
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Secondary-NBNS-Address (cont) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Type
132
Length
6
Secondary-NBNS-Address
The four octet Secondary-NBNS-Address is the address of the
secondary NBNS server to be used by the local peer. If all
four octets are set to zero, it indicates an explicit request
that the peer provide the address information in a Config-Nak
packet.
Default
No address is provided.
Cobb Informational [Page 5]
RFC 1877 PPP IPCP Extensions December 1995
References
[1] Simpson, W., Editor, "The Point-to-Point Protocol (PPP)", STD 51,
RFC 1661, Daydreamer, July 1994.
[2] McGregor, G., "PPP Internet Control Protocol", RFC 1332, Merit,
May 1992.
[3] Auerbach, K., and A. Aggarwal, "Protocol Standard for a NetBIOS
Service on a TCP/UDP Transport", STD 19, RFCs 1001 and 1002,
March 1987.
[4] Mockapetris, P., "Domain Names - Concepts and Facilities", STD
13, RFC 1034, USC/Information Sciences Institute, November 1987.
[5] Mockapetris, P., "Domain Names - Implementation and
Specification", STD 13, RFC 1035, USC/Information Sciences
Institute, November 1987.
Security Considerations
Security issues are not discussed in this memo.
Chair's Address
The working group can be contacted via the current chair:
Fred Baker
Cisco Systems
519 Lado Drive
Santa Barbara, California 93111
EMail: fred@cisco.com
Author's Address
Questions about this memo can also be directed to:
Steve Cobb
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052-6399
Phone: (206) 882-8080
EMail: stevec@microsoft.com
Cobb Informational [Page 6]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,339 @@
Network Working Group S. Bellovin
Request for Comments: 1948 AT&T Research
Category: Informational May 1996
Defending Against Sequence Number Attacks
Status of This Memo
This memo provides information for the Internet community. This memo
does not specify an Internet standard of any kind. Distribution of
this memo is unlimited.
Abstract
IP spoofing attacks based on sequence number spoofing have become a
serious threat on the Internet (CERT Advisory CA-95:01). While
ubiquitous cryptographic authentication is the right answer, we propose
a simple modification to TCP implementations that should be a very
substantial block to the current wave of attacks.
Overview and Rationale
In 1985, Morris [1] described a form of attack based on guessing what
sequence numbers TCP [2] will use for new connections. Briefly, the
attacker gags a host trusted by the target, impersonates the IP
address of the trusted host when talking to the target, and completes
the 3-way handshake based on its guess at the next initial sequence
number to be used. An ordinary connection to the target is used to
gather sequence number state information. This entire sequence,
coupled with address-based authentication, allows the attacker to
execute commands on the target host.
Clearly, the proper solution is cryptographic authentication [3,4].
But it will be quite a long time before that is deployed. It has
therefore been necessary for many sites to restrict use of protocols
that rely on address-based authentication, such as rlogin and rsh.
Unfortunately, the prevalence of "sniffer attacks" -- network
eavesdropping (CERT Advisory CA-94:01) -- has rendered ordinary
TELNET [5] very dangerous as well. The Internet is thus left without
a safe, secure mechanism for remote login.
We propose a simple change to TCP implementations that will block
most sequence number guessing attacks. More precisely, such attacks
will remain possible if and only if the Bad Guy already has the
ability to launch even more devastating attacks.
Bellovin Informational [Page 1]
RFC 1948 Sequence Number Attacks May 1996
Details of the Attack
In order to understand the particular case of sequence number
guessing, one must look at the 3-way handshake used in the TCP open
sequence [2]. Suppose client machine A wants to talk to rsh server
B. It sends the following message:
A->B: SYN, ISNa
That is, it sends a packet with the SYN ("synchronize sequence
number") bit set and an initial sequence number ISNa.
B replies with
B->A: SYN, ISNb, ACK(ISNa)
In addition to sending its own initial sequence number, it
acknowledges A's. Note that the actual numeric value ISNa must
appear in the message.
A concludes the handshake by sending
A->B: ACK(ISNb)
The initial sequence numbers are intended to be more or less random.
More precisely, RFC 793 specifies that the 32-bit counter be
incremented by 1 in the low-order position about every 4
microseconds. Instead, Berkeley-derived kernels increment it by a
constant every second, and by another constant for each new
connection. Thus, if you open a connection to a machine, you know to
a very high degree of confidence what sequence number it will use for
its next connection. And therein lies the attack.
The attacker X first opens a real connection to its target B -- say,
to the mail port or the TCP echo port. This gives ISNb. It then
impersonates A and sends
Ax->B: SYN, ISNx
where "Ax" denotes a packet sent by X pretending to be A.
B's response to X's original SYN (so to speak)
B->A: SYN, ISNb', ACK(ISNx)
Bellovin Informational [Page 2]
RFC 1948 Sequence Number Attacks May 1996
goes to the legitimate A, about which more anon. X never sees that
message but can still send
Ax->B: ACK(ISNb')
using the predicted value for ISNb'. If the guess is right -- and
usually it will be -- B's rsh server thinks it has a legitimate
connection with A, when in fact X is sending the packets. X can't
see the output from this session, but it can execute commands as more
or less any user -- and in that case, the game is over and X has won.
There is a minor difficulty here. If A sees B's message, it will
realize that B is acknowledging something it never sent, and will
send a RST packet in response to tear down the connection. There are
a variety of ways to prevent this; the easiest is to wait until the
real A is down (possibly as a result of enemy action, of course). In
actual practice, X can gag A by exploiting a very common
implementation bug; this is described below.
The Fix
The choice of initial sequence numbers for a connection is not
random. Rather, it must be chosen so as to minimize the probability
of old stale packets being accepted by new incarnations of the same
connection [6, Appendix A]. Furthermore, implementations of TCP
derived from 4.2BSD contain special code to deal with such
reincarnations when the server end of the original connection is
still in TIMEWAIT state [7, pp. 945]. Accordingly, simple
randomization, as suggested in [8], will not work well.
But duplicate packets, and hence the restrictions on the initial
sequence number for reincarnations, are peculiar to individual
connections. That is, there is no connection, syntactic or semantic,
between the sequence numbers used for two different connections. We
can prevent sequence number guessing attacks by giving each
connection -- that is, each 4-tuple of <localhost, localport,
remotehost, remoteport> -- a separate sequence number space. Within
each space, the initial sequence number is incremented according to
[2]; however, there is no obvious relationship between the numbering
in different spaces.
The obvious way to do this is to maintain state for dead connections,
and the easiest way to do that is to change the TCP state transition
diagram so that both ends of all connections go to TIMEWAIT state.
That would work, but it's inelegant and consumes storage space.
Instead, we use the current 4 microsecond timer M and set
ISN = M + F(localhost, localport, remotehost, remoteport).
Bellovin Informational [Page 3]
RFC 1948 Sequence Number Attacks May 1996
It is vital that F not be computable from the outside, or an attacker
could still guess at sequence numbers from the initial sequence
number used for some other connection. We therefore suggest that F
be a cryptographic hash function of the connection-id and some secret
data. MD5 [9] is a good choice, since the code is widely available.
The secret data can either be a true random number [10], or it can be
the combination of some per-host secret and the boot time of the
machine. The boot time is included to ensure that the secret is
changed on occasion. Other data, such as the host's IP address and
name, may be included in the hash as well; this eases administration
by permitting a network of workstations to share the same secret data
while still giving them separate sequence number spaces. Our
recommendation, in fact, is to use all three of these items: as
random a number as the hardware can generate, an administratively-
installed pass phrase, and the machine's IP address. This allows for
local choice on how secure the secret is.
Note that the secret cannot easily be changed on a live machine.
Doing so would change the initial sequence numbers used for
reincarnated connections; to maintain safety, either dead connection
state must be kept or a quiet time observed for two maximum segment
lifetimes after such a change.
A Common TCP Bug
As mentioned earlier, attackers using sequence number guessing have
to "gag" the trusted machine first. While a number of strategies are
possible, most of the attacks detected thus far rely on an
implementation bug.
When SYN packets are received for a connection, the receiving system
creates a new TCB in SYN-RCVD state. To avoid overconsumption of
resources, 4.2BSD-derived systems permit only a limited number of
TCBs in this state per connection. Once this limit is reached,
future SYN packets for new connections are discarded; it is assumed
that the client will retransmit them as needed.
When a packet is received, the first thing that must be done is a
search for the TCB for that connection. If no TCB is found, the
kernel searches for a "wild card" TCB used by servers to accept
connections from all clients. Unfortunately, in many kernels this
code is invoked for any incoming packets, not just for initial SYN
packets. If the SYN-RCVD queue is full for the wildcard TCB, any new
packets specifying just that host and port number will be discarded,
even if they aren't SYN packets.
Bellovin Informational [Page 4]
RFC 1948 Sequence Number Attacks May 1996
To gag a host, then, the attacker sends a few dozen SYN packets to
the rlogin port from different port numbers on some non-existent
machine. This fills up the SYN-RCVD queue, while the SYN+ACK packets
go off to the bit bucket. The attack on the target machine then
appears to come from the rlogin port on the trusted machine. The
replies -- the SYN+ACKs from the target -- will be perceived as
packets belonging to a full queue, and will be dropped silently.
This could be avoided if the full queue code checked for the ACK bit,
which cannot legally be on for legitimate open requests. If it is
on, RST should be sent in reply.
Security Considerations
Good sequence numbers are not a replacement for cryptographic
authentication. At best, they're a palliative measure.
An eavesdropper who can observe the initial messages for a connection
can determine its sequence number state, and may still be able to
launch sequence number guessing attacks by impersonating that
connection. However, such an eavesdropper can also hijack existing
connections [11], so the incremental threat isn't that high. Still,
since the offset between a fake connection and a given real
connection will be more or less constant for the lifetime of the
secret, it is important to ensure that attackers can never capture
such packets. Typical attacks that could disclose them include both
eavesdropping and the variety of routing attacks discussed in [8].
If random numbers are used as the sole source of the secret, they
MUST be chosen in accordance with the recommendations given in [10].
Acknowledgments
Matt Blaze and Jim Ellis contributed some crucial ideas to this RFC.
Frank Kastenholz contributed constructive comments to this memo.
References
[1] R.T. Morris, "A Weakness in the 4.2BSD UNIX TCP/IP Software",
CSTR 117, 1985, AT&T Bell Laboratories, Murray Hill, NJ.
[2] Postel, J., "Transmission Control Protocol", STD 7, RFC 793,
September 1981.
[3] Kohl, J., and C. Neuman, "The Kerberos Network Authentication
Service (V5)", RFC 1510, September 1993.
[4] Atkinson, R., "Security Architecture for the Internet
Protocol", RFC 1825, August 1995.
Bellovin Informational [Page 5]
RFC 1948 Sequence Number Attacks May 1996
[5] Postel, J., and J. Reynolds, "Telnet Protocol Specification",
STD 8, RFC 854, May 1983.
[6] Jacobson, V., Braden, R., and L. Zhang, "TCP Extension for
High-Speed Paths", RFC 1185, October 1990.
[7] G.R. Wright, W. R. Stevens, "TCP/IP Illustrated, Volume 2",
1995. Addison-Wesley.
[8] S. Bellovin, "Security Problems in the TCP/IP Protocol Suite",
April 1989, Computer Communications Review, vol. 19, no. 2, pp.
32-48.
[9] Rivest, R., "The MD5 Message-Digest Algorithm", RFC 1321,
April 1992.
[10] Eastlake, D., Crocker, S., and J. Schiller, "Randomness
Recommendations for Security", RFC 1750, December 1994.
[11] L. Joncheray, "A Simple Active Attack Against TCP", 1995, Proc.
Fifth Usenix UNIX Security Symposium.
Author's Address
Steven M. Bellovin
AT&T Research
600 Mountain Avenue
Murray Hill, NJ 07974
Phone: (908) 582-5886
EMail: smb@research.att.com
Bellovin Informational [Page 6]

View File

@ -0,0 +1,732 @@
Network Working Group W. Simpson
Request for Comments: 1994 DayDreamer
Obsoletes: 1334 August 1996
Category: Standards Track
PPP Challenge Handshake Authentication Protocol (CHAP)
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Abstract
The Point-to-Point Protocol (PPP) [1] provides a standard method for
transporting multi-protocol datagrams over point-to-point links.
PPP also defines an extensible Link Control Protocol, which allows
negotiation of an Authentication Protocol for authenticating its peer
before allowing Network Layer protocols to transmit over the link.
This document defines a method for Authentication using PPP, which
uses a random Challenge, with a cryptographically hashed Response
which depends upon the Challenge and a secret key.
Table of Contents
1. Introduction .......................................... 1
1.1 Specification of Requirements ................... 1
1.2 Terminology ..................................... 2
2. Challenge-Handshake Authentication Protocol ........... 2
2.1 Advantages ...................................... 3
2.2 Disadvantages ................................... 3
2.3 Design Requirements ............................. 4
3. Configuration Option Format ........................... 5
4. Packet Format ......................................... 6
4.1 Challenge and Response .......................... 7
4.2 Success and Failure ............................. 9
SECURITY CONSIDERATIONS ...................................... 10
ACKNOWLEDGEMENTS ............................................. 11
REFERENCES ................................................... 12
CONTACTS ..................................................... 12
Simpson [Page i]
RFC 1994 PPP CHAP August 1996
1. Introduction
In order to establish communications over a point-to-point link, each
end of the PPP link must first send LCP packets to configure the data
link during Link Establishment phase. After the link has been
established, PPP provides for an optional Authentication phase before
proceeding to the Network-Layer Protocol phase.
By default, authentication is not mandatory. If authentication of
the link is desired, an implementation MUST specify the
Authentication-Protocol Configuration Option during Link
Establishment phase.
These authentication protocols are intended for use primarily by
hosts and routers that connect to a PPP network server via switched
circuits or dial-up lines, but might be applied to dedicated links as
well. The server can use the identification of the connecting host
or router in the selection of options for network layer negotiations.
This document defines a PPP authentication protocol. The Link
Establishment and Authentication phases, and the Authentication-
Protocol Configuration Option, are defined in The Point-to-Point
Protocol (PPP) [1].
1.1. Specification of Requirements
In this document, several words are used to signify the requirements
of the specification. These words are often capitalized.
MUST This word, or the adjective "required", means that the
definition is an absolute requirement of the specification.
MUST NOT This phrase means that the definition is an absolute
prohibition of the specification.
SHOULD This word, or the adjective "recommended", means that there
may exist valid reasons in particular circumstances to
ignore this item, but the full implications must be
understood and carefully weighed before choosing a
different course.
MAY This word, or the adjective "optional", means that this
item is one of an allowed set of alternatives. An
implementation which does not include this option MUST be
prepared to interoperate with another implementation which
does include the option.
Simpson [Page 1]
RFC 1994 PPP CHAP August 1996
1.2. Terminology
This document frequently uses the following terms:
authenticator
The end of the link requiring the authentication. The
authenticator specifies the authentication protocol to be
used in the Configure-Request during Link Establishment
phase.
peer The other end of the point-to-point link; the end which is
being authenticated by the authenticator.
silently discard
This means the implementation discards the packet without
further processing. The implementation SHOULD provide the
capability of logging the error, including the contents of
the silently discarded packet, and SHOULD record the event
in a statistics counter.
2. Challenge-Handshake Authentication Protocol
The Challenge-Handshake Authentication Protocol (CHAP) is used to
periodically verify the identity of the peer using a 3-way handshake.
This is done upon initial link establishment, and MAY be repeated
anytime after the link has been established.
1. After the Link Establishment phase is complete, the
authenticator sends a "challenge" message to the peer.
2. The peer responds with a value calculated using a "one-way
hash" function.
3. The authenticator checks the response against its own
calculation of the expected hash value. If the values match,
the authentication is acknowledged; otherwise the connection
SHOULD be terminated.
4. At random intervals, the authenticator sends a new challenge to
the peer, and repeats steps 1 to 3.
Simpson [Page 2]
RFC 1994 PPP CHAP August 1996
2.1. Advantages
CHAP provides protection against playback attack by the peer through
the use of an incrementally changing identifier and a variable
challenge value. The use of repeated challenges is intended to limit
the time of exposure to any single attack. The authenticator is in
control of the frequency and timing of the challenges.
This authentication method depends upon a "secret" known only to the
authenticator and that peer. The secret is not sent over the link.
Although the authentication is only one-way, by negotiating CHAP in
both directions the same secret set may easily be used for mutual
authentication.
Since CHAP may be used to authenticate many different systems, name
fields may be used as an index to locate the proper secret in a large
table of secrets. This also makes it possible to support more than
one name/secret pair per system, and to change the secret in use at
any time during the session.
2.2. Disadvantages
CHAP requires that the secret be available in plaintext form.
Irreversibly encrypted password databases commonly available cannot
be used.
It is not as useful for large installations, since every possible
secret is maintained at both ends of the link.
Implementation Note: To avoid sending the secret over other links
in the network, it is recommended that the challenge and response
values be examined at a central server, rather than each network
access server. Otherwise, the secret SHOULD be sent to such
servers in a reversibly encrypted form. Either case requires a
trusted relationship, which is outside the scope of this
specification.
Simpson [Page 3]
RFC 1994 PPP CHAP August 1996
2.3. Design Requirements
The CHAP algorithm requires that the length of the secret MUST be at
least 1 octet. The secret SHOULD be at least as large and
unguessable as a well-chosen password. It is preferred that the
secret be at least the length of the hash value for the hashing
algorithm chosen (16 octets for MD5). This is to ensure a
sufficiently large range for the secret to provide protection against
exhaustive search attacks.
The one-way hash algorithm is chosen such that it is computationally
infeasible to determine the secret from the known challenge and
response values.
Each challenge value SHOULD be unique, since repetition of a
challenge value in conjunction with the same secret would permit an
attacker to reply with a previously intercepted response. Since it
is expected that the same secret MAY be used to authenticate with
servers in disparate geographic regions, the challenge SHOULD exhibit
global and temporal uniqueness.
Each challenge value SHOULD also be unpredictable, lest an attacker
trick a peer into responding to a predicted future challenge, and
then use the response to masquerade as that peer to an authenticator.
Although protocols such as CHAP are incapable of protecting against
realtime active wiretapping attacks, generation of unique
unpredictable challenges can protect against a wide range of active
attacks.
A discussion of sources of uniqueness and probability of divergence
is included in the Magic-Number Configuration Option [1].
Simpson [Page 4]
RFC 1994 PPP CHAP August 1996
3. Configuration Option Format
A summary of the Authentication-Protocol Configuration Option format
to negotiate the Challenge-Handshake Authentication Protocol is shown
below. The fields are transmitted from left to right.
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Type | Length | Authentication-Protocol |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Algorithm |
+-+-+-+-+-+-+-+-+
Type
3
Length
5
Authentication-Protocol
c223 (hex) for Challenge-Handshake Authentication Protocol.
Algorithm
The Algorithm field is one octet and indicates the authentication
method to be used. Up-to-date values are specified in the most
recent "Assigned Numbers" [2]. One value is required to be
implemented:
5 CHAP with MD5 [3]
Simpson [Page 5]
RFC 1994 PPP CHAP August 1996
4. Packet Format
Exactly one Challenge-Handshake Authentication Protocol packet is
encapsulated in the Information field of a PPP Data Link Layer frame
where the protocol field indicates type hex c223 (Challenge-Handshake
Authentication Protocol). A summary of the CHAP packet format is
shown below. The fields are transmitted from left to right.
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Code | Identifier | Length |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Data ...
+-+-+-+-+
Code
The Code field is one octet and identifies the type of CHAP
packet. CHAP Codes are assigned as follows:
1 Challenge
2 Response
3 Success
4 Failure
Identifier
The Identifier field is one octet and aids in matching challenges,
responses and replies.
Length
The Length field is two octets and indicates the length of the
CHAP packet including the Code, Identifier, Length and Data
fields. Octets outside the range of the Length field should be
treated as Data Link Layer padding and should be ignored on
reception.
Data
The Data field is zero or more octets. The format of the Data
field is determined by the Code field.
Simpson [Page 6]
RFC 1994 PPP CHAP August 1996
4.1. Challenge and Response
Description
The Challenge packet is used to begin the Challenge-Handshake
Authentication Protocol. The authenticator MUST transmit a CHAP
packet with the Code field set to 1 (Challenge). Additional
Challenge packets MUST be sent until a valid Response packet is
received, or an optional retry counter expires.
A Challenge packet MAY also be transmitted at any time during the
Network-Layer Protocol phase to ensure that the connection has not
been altered.
The peer SHOULD expect Challenge packets during the Authentication
phase and the Network-Layer Protocol phase. Whenever a Challenge
packet is received, the peer MUST transmit a CHAP packet with the
Code field set to 2 (Response).
Whenever a Response packet is received, the authenticator compares
the Response Value with its own calculation of the expected value.
Based on this comparison, the authenticator MUST send a Success or
Failure packet (described below).
Implementation Notes: Because the Success might be lost, the
authenticator MUST allow repeated Response packets during the
Network-Layer Protocol phase after completing the
Authentication phase. To prevent discovery of alternative
Names and Secrets, any Response packets received having the
current Challenge Identifier MUST return the same reply Code
previously returned for that specific Challenge (the message
portion MAY be different). Any Response packets received
during any other phase MUST be silently discarded.
When the Failure is lost, and the authenticator terminates the
link, the LCP Terminate-Request and Terminate-Ack provide an
alternative indication that authentication failed.
Simpson [Page 7]
RFC 1994 PPP CHAP August 1996
A summary of the Challenge and Response packet format is shown below.
The fields are transmitted from left to right.
 0                   1                   2                   3
 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|     Code      |  Identifier   |            Length             |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Value-Size   |  Value ...
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Name ...
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Code
1 for Challenge;
2 for Response.
Identifier
The Identifier field is one octet. The Identifier field MUST be
changed each time a Challenge is sent.
The Response Identifier MUST be copied from the Identifier field
of the Challenge which caused the Response.
Value-Size
This field is one octet and indicates the length of the Value
field.
Value
The Value field is one or more octets. The most significant octet
is transmitted first.
The Challenge Value is a variable stream of octets. The
importance of the uniqueness of the Challenge Value and its
relationship to the secret is described above. The Challenge
Value MUST be changed each time a Challenge is sent. The length
of the Challenge Value depends upon the method used to generate
the octets, and is independent of the hash algorithm used.
The Response Value is the one-way hash calculated over a stream of
octets consisting of the Identifier, followed by (concatenated
with) the "secret", followed by (concatenated with) the Challenge
Value. The length of the Response Value depends upon the hash
algorithm used (16 octets for MD5).
Simpson [Page 8]
RFC 1994 PPP CHAP August 1996
Name
The Name field is one or more octets representing the
identification of the system transmitting the packet. There are
no limitations on the content of this field. For example, it MAY
contain ASCII character strings or globally unique identifiers in
ASN.1 syntax. The Name should not be NUL or CR/LF terminated.
The size is determined from the Length field.
4.2. Success and Failure
Description
If the Value received in a Response is equal to the expected
value, then the implementation MUST transmit a CHAP packet with
the Code field set to 3 (Success).
If the Value received in a Response is not equal to the expected
value, then the implementation MUST transmit a CHAP packet with
the Code field set to 4 (Failure), and SHOULD take action to
terminate the link.
A summary of the Success and Failure packet format is shown below.
The fields are transmitted from left to right.
 0                   1                   2                   3
 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|     Code      |  Identifier   |            Length             |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|  Message  ...
+-+-+-+-+-+-+-+-+-+-+-+-+-
Code
3 for Success;
4 for Failure.
Identifier
The Identifier field is one octet and aids in matching requests
and replies. The Identifier field MUST be copied from the
Identifier field of the Response which caused this reply.
Simpson [Page 9]
RFC 1994 PPP CHAP August 1996
Message
The Message field is zero or more octets, and its contents are
implementation dependent. It is intended to be human readable,
and MUST NOT affect operation of the protocol. It is recommended
that the message contain displayable ASCII characters 32 through
126 decimal. Mechanisms for extension to other character sets are
the topic of future research. The size is determined from the
Length field.
Security Considerations
Security issues are the primary topic of this RFC.
The interaction of the authentication protocols within PPP is highly
implementation dependent. This is indicated by the use of SHOULD
throughout the document.
For example, upon failure of authentication, some implementations do
not terminate the link. Instead, the implementation limits the kind
of traffic in the Network-Layer Protocols to a filtered subset, which
in turn allows the user opportunity to update secrets or send mail to
the network administrator indicating a problem.
There is no provision for re-tries of failed authentication.
However, the LCP state machine can renegotiate the authentication
protocol at any time, thus allowing a new attempt. It is recommended
that any counters used for authentication failure not be reset until
after successful authentication, or subsequent termination of the
failed link.
There is no requirement that authentication be full duplex or that
the same protocol be used in both directions. It is perfectly
acceptable for different protocols to be used in each direction.
This will, of course, depend on the specific protocols negotiated.
The secret SHOULD NOT be the same in both directions. Using the same
secret in both directions would allow an attacker to replay the peer's
challenge, accept the computed response, and use that response to
authenticate.
In practice, within or associated with each PPP server, there is a
database which associates "user" names with authentication
information ("secrets"). It is not anticipated that a particular
named user would be authenticated by multiple methods. This would
make the user vulnerable to attacks which negotiate the least secure
method from among a set (such as PAP rather than CHAP). If the same
Simpson [Page 10]
RFC 1994 PPP CHAP August 1996
secret was used, PAP would reveal the secret to be used later with
CHAP.
Instead, for each user name there should be an indication of exactly
one method used to authenticate that user name. If a user needs to
make use of different authentication methods under different
circumstances, then distinct user names SHOULD be employed, each of
which identifies exactly one authentication method.
Passwords and other secrets should be stored at the respective ends
such that access to them is as limited as possible. Ideally, the
secrets should only be accessible to the process requiring access in
order to perform the authentication.
The secrets should be distributed with a mechanism that limits the
number of entities that handle (and thus gain knowledge of) the
secret. Ideally, no unauthorized person should ever gain knowledge
of the secrets. Such a mechanism is outside the scope of this
specification.
Acknowledgements
David Kaufman, Frank Heinrich, and Karl Auerbach used a challenge
handshake at SDC when designing one of the protocols for a "secure"
network in the mid-1970s. Tom Bearson built a prototype Sytek
product ("Poloneous"?) on the challenge-response notion in the 1982-
83 timeframe. Another variant is documented in the various IBM SNA
manuals. Yet another variant was implemented by Karl Auerbach in the
Telebit NetBlazer circa 1991.
Kim Toms and Barney Wolff provided useful critiques of earlier
versions of this document.
Special thanks to Dave Balenson, Steve Crocker, James Galvin, and
Steve Kent, for their extensive explanations and suggestions. Now,
if only we could get them to agree with each other.
Simpson [Page 11]
RFC 1994 PPP CHAP August 1996
References
[1] Simpson, W., Editor, "The Point-to-Point Protocol (PPP)", STD
51, RFC 1661, DayDreamer, July 1994.
[2] Reynolds, J., and J. Postel, "Assigned Numbers", STD 2, RFC
1700, USC/Information Sciences Institute, October 1994.
[3] Rivest, R., and S. Dusse, "The MD5 Message-Digest Algorithm",
MIT Laboratory for Computer Science and RSA Data Security,
Inc., RFC 1321, April 1992.
Contacts
Comments should be submitted to the ietf-ppp@merit.edu mailing list.
This document was reviewed by the Point-to-Point Protocol Working
Group of the Internet Engineering Task Force (IETF). The working
group can be contacted via the current chair:
Karl Fox
Ascend Communications
3518 Riverside Drive, Suite 101
Columbus, Ohio 43221
karl@MorningStar.com
karl@Ascend.com
Questions about this memo can also be directed to:
William Allen Simpson
DayDreamer
Computer Systems Consulting Services
1384 Fontaine
Madison Heights, Michigan 48071
wsimpson@UMich.edu
wsimpson@GreenDragon.com (preferred)
Simpson [Page 12]

View File

@ -0,0 +1,563 @@
Network Working Group K. McCloghrie, Editor
Request for Comments: 2012 Cisco Systems
Updates: 1213 November 1996
Category: Standards Track
SNMPv2 Management Information Base
for the Transmission Control Protocol using SMIv2
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
IESG Note:
The IP, UDP, and TCP MIB modules currently support only IPv4. These
three modules use the IpAddress type defined as an OCTET STRING of
length 4 to represent the IPv4 32-bit internet addresses. (See RFC
1902, SMI for SNMPv2.) They do not support the new 128-bit IPv6
internet addresses.
Table of Contents
1. Introduction ................................................ 1
2. Definitions ................................................. 2
2.1 The TCP Group .............................................. 3
2.2 Conformance Information .................................... 8
2.2.1 Compliance Statements .................................... 8
2.2.2 Units of Conformance ..................................... 9
3. Acknowledgements ............................................ 10
4. References .................................................. 10
5. Security Considerations ..................................... 10
6. Editor's Address ............................................ 10
1. Introduction
A management system contains: several (potentially many) nodes, each
with a processing entity, termed an agent, which has access to
management instrumentation; at least one management station; and, a
management protocol, used to convey management information between
the agents and management stations. Operations of the protocol are
carried out under an administrative framework which defines
authentication, authorization, access control, and privacy policies.
McCloghrie Standards Track [Page 1]
RFC 2012 SNMPv2 MIB for TCP November 1996
Management stations execute management applications which monitor and
control managed elements. Managed elements are devices such as
hosts, routers, terminal servers, etc., which are monitored and
controlled via access to their management information.
Management information is viewed as a collection of managed objects,
residing in a virtual information store, termed the Management
Information Base (MIB). Collections of related objects are defined
in MIB modules. These modules are written using a subset of OSI's
Abstract Syntax Notation One (ASN.1) [1], termed the Structure of
Management Information (SMI) [2].
This document is the MIB module which defines managed objects for
managing implementations of the Transmission Control Protocol (TCP)
[3].
The managed objects in this MIB module were originally defined using
the SNMPv1 framework as a part of MIB-II [4]. This document defines
the same objects for TCP using the SNMPv2 framework.
2. Definitions
TCP-MIB DEFINITIONS ::= BEGIN
IMPORTS
MODULE-IDENTITY, OBJECT-TYPE, Integer32, Gauge32,
Counter32, IpAddress, mib-2 FROM SNMPv2-SMI
MODULE-COMPLIANCE, OBJECT-GROUP FROM SNMPv2-CONF;
tcpMIB MODULE-IDENTITY
LAST-UPDATED "9411010000Z"
ORGANIZATION "IETF SNMPv2 Working Group"
CONTACT-INFO
" Keith McCloghrie
Postal: Cisco Systems, Inc.
170 West Tasman Drive
San Jose, CA 95134-1706
US
Phone: +1 408 526 5260
Email: kzm@cisco.com"
McCloghrie Standards Track [Page 2]
RFC 2012 SNMPv2 MIB for TCP November 1996
DESCRIPTION
"The MIB module for managing TCP implementations."
REVISION "9103310000Z"
DESCRIPTION
"The initial revision of this MIB module was part of MIB-
II."
::= { mib-2 49 }
-- the TCP group
tcp OBJECT IDENTIFIER ::= { mib-2 6 }
tcpRtoAlgorithm OBJECT-TYPE
SYNTAX INTEGER {
other(1), -- none of the following
constant(2), -- a constant rto
rsre(3), -- MIL-STD-1778, Appendix B
vanj(4) -- Van Jacobson's algorithm [5]
}
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The algorithm used to determine the timeout value used for
retransmitting unacknowledged octets."
::= { tcp 1 }
tcpRtoMin OBJECT-TYPE
SYNTAX Integer32
UNITS "milliseconds"
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The minimum value permitted by a TCP implementation for the
retransmission timeout, measured in milliseconds. More
refined semantics for objects of this type depend upon the
algorithm used to determine the retransmission timeout. In
particular, when the timeout algorithm is rsre(3), an object
of this type has the semantics of the LBOUND quantity
described in RFC 793."
::= { tcp 2 }
tcpRtoMax OBJECT-TYPE
SYNTAX Integer32
UNITS "milliseconds"
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The maximum value permitted by a TCP implementation for the
McCloghrie Standards Track [Page 3]
RFC 2012 SNMPv2 MIB for TCP November 1996
retransmission timeout, measured in milliseconds. More
refined semantics for objects of this type depend upon the
algorithm used to determine the retransmission timeout. In
particular, when the timeout algorithm is rsre(3), an object
of this type has the semantics of the UBOUND quantity
described in RFC 793."
::= { tcp 3 }
tcpMaxConn OBJECT-TYPE
SYNTAX Integer32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The limit on the total number of TCP connections the entity
can support. In entities where the maximum number of
connections is dynamic, this object should contain the value
-1."
::= { tcp 4 }
tcpActiveOpens OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The number of times TCP connections have made a direct
transition to the SYN-SENT state from the CLOSED state."
::= { tcp 5 }
tcpPassiveOpens OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The number of times TCP connections have made a direct
transition to the SYN-RCVD state from the LISTEN state."
::= { tcp 6 }
tcpAttemptFails OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The number of times TCP connections have made a direct
transition to the CLOSED state from either the SYN-SENT
state or the SYN-RCVD state, plus the number of times TCP
connections have made a direct transition to the LISTEN
state from the SYN-RCVD state."
::= { tcp 7 }
McCloghrie Standards Track [Page 4]
RFC 2012 SNMPv2 MIB for TCP November 1996
tcpEstabResets OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The number of times TCP connections have made a direct
transition to the CLOSED state from either the ESTABLISHED
state or the CLOSE-WAIT state."
::= { tcp 8 }
tcpCurrEstab OBJECT-TYPE
SYNTAX Gauge32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The number of TCP connections for which the current state
is either ESTABLISHED or CLOSE-WAIT."
::= { tcp 9 }
tcpInSegs OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The total number of segments received, including those
received in error. This count includes segments received on
currently established connections."
::= { tcp 10 }
tcpOutSegs OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The total number of segments sent, including those on
current connections but excluding those containing only
retransmitted octets."
::= { tcp 11 }
tcpRetransSegs OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The total number of segments retransmitted - that is, the
number of TCP segments transmitted containing one or more
previously transmitted octets."
McCloghrie Standards Track [Page 5]
RFC 2012 SNMPv2 MIB for TCP November 1996
::= { tcp 12 }
-- the TCP Connection table
-- The TCP connection table contains information about this
-- entity's existing TCP connections.
tcpConnTable OBJECT-TYPE
SYNTAX SEQUENCE OF TcpConnEntry
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"A table containing TCP connection-specific information."
::= { tcp 13 }
tcpConnEntry OBJECT-TYPE
SYNTAX TcpConnEntry
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"A conceptual row of the tcpConnTable containing information
about a particular current TCP connection. Each row of this
table is transient, in that it ceases to exist when (or soon
after) the connection makes the transition to the CLOSED
state."
INDEX { tcpConnLocalAddress,
tcpConnLocalPort,
tcpConnRemAddress,
tcpConnRemPort }
::= { tcpConnTable 1 }
TcpConnEntry ::= SEQUENCE {
tcpConnState INTEGER,
tcpConnLocalAddress IpAddress,
tcpConnLocalPort INTEGER,
tcpConnRemAddress IpAddress,
tcpConnRemPort INTEGER
}
tcpConnState OBJECT-TYPE
SYNTAX INTEGER {
closed(1),
listen(2),
synSent(3),
synReceived(4),
established(5),
finWait1(6),
McCloghrie Standards Track [Page 6]
RFC 2012 SNMPv2 MIB for TCP November 1996
finWait2(7),
closeWait(8),
lastAck(9),
closing(10),
timeWait(11),
deleteTCB(12)
}
MAX-ACCESS read-write
STATUS current
DESCRIPTION
"The state of this TCP connection.
The only value which may be set by a management station is
deleteTCB(12). Accordingly, it is appropriate for an agent
to return a `badValue' response if a management station
attempts to set this object to any other value.
If a management station sets this object to the value
deleteTCB(12), then this has the effect of deleting the TCB
(as defined in RFC 793) of the corresponding connection on
the managed node, resulting in immediate termination of the
connection.
As an implementation-specific option, a RST segment may be
sent from the managed node to the other TCP endpoint (note
however that RST segments are not sent reliably)."
::= { tcpConnEntry 1 }
tcpConnLocalAddress OBJECT-TYPE
SYNTAX IpAddress
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The local IP address for this TCP connection. In the case
of a connection in the listen state which is willing to
accept connections for any IP interface associated with the
node, the value 0.0.0.0 is used."
::= { tcpConnEntry 2 }
tcpConnLocalPort OBJECT-TYPE
SYNTAX INTEGER (0..65535)
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The local port number for this TCP connection."
::= { tcpConnEntry 3 }
tcpConnRemAddress OBJECT-TYPE
McCloghrie Standards Track [Page 7]
RFC 2012 SNMPv2 MIB for TCP November 1996
SYNTAX IpAddress
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The remote IP address for this TCP connection."
::= { tcpConnEntry 4 }
tcpConnRemPort OBJECT-TYPE
SYNTAX INTEGER (0..65535)
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The remote port number for this TCP connection."
::= { tcpConnEntry 5 }
tcpInErrs OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The total number of segments received in error (e.g., bad
TCP checksums)."
::= { tcp 14 }
tcpOutRsts OBJECT-TYPE
SYNTAX Counter32
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"The number of TCP segments sent containing the RST flag."
::= { tcp 15 }
-- conformance information
tcpMIBConformance OBJECT IDENTIFIER ::= { tcpMIB 2 }
tcpMIBCompliances OBJECT IDENTIFIER ::= { tcpMIBConformance 1 }
tcpMIBGroups OBJECT IDENTIFIER ::= { tcpMIBConformance 2 }
-- compliance statements
tcpMIBCompliance MODULE-COMPLIANCE
STATUS current
DESCRIPTION
"The compliance statement for SNMPv2 entities which
implement TCP."
MODULE -- this module
McCloghrie Standards Track [Page 8]
RFC 2012 SNMPv2 MIB for TCP November 1996
MANDATORY-GROUPS { tcpGroup
}
::= { tcpMIBCompliances 1 }
-- units of conformance
tcpGroup OBJECT-GROUP
OBJECTS { tcpRtoAlgorithm, tcpRtoMin, tcpRtoMax,
tcpMaxConn, tcpActiveOpens,
tcpPassiveOpens, tcpAttemptFails,
tcpEstabResets, tcpCurrEstab, tcpInSegs,
tcpOutSegs, tcpRetransSegs, tcpConnState,
tcpConnLocalAddress, tcpConnLocalPort,
tcpConnRemAddress, tcpConnRemPort,
tcpInErrs, tcpOutRsts }
STATUS current
DESCRIPTION
"The tcp group of objects providing for management of TCP
entities."
::= { tcpMIBGroups 1 }
END
McCloghrie Standards Track [Page 9]
RFC 2012 SNMPv2 MIB for TCP November 1996
3. Acknowledgements
This document contains a modified subset of RFC 1213.
4. References
[1] Information processing systems - Open Systems Interconnection -
Specification of Abstract Syntax Notation One (ASN.1),
International Organization for Standardization. International
Standard 8824, (December, 1987).
[2] McCloghrie, K., Editor, "Structure of Management Information
for version 2 of the Simple Network Management Protocol
(SNMPv2)", RFC 1902, Cisco Systems, January 1996.
[3] Postel, J., "Transmission Control Protocol - DARPA Internet
Program Protocol Specification", STD 7, RFC 793, DARPA,
September 1981.
[4] McCloghrie, K., and M. Rose, "Management Information Base for
Network Management of TCP/IP-based internets: MIB-II", STD 17,
RFC 1213, March 1991.
[5] Jacobson, V., "Congestion Avoidance and Control", SIGCOMM 1988,
Stanford, California.
5. Security Considerations
Security issues are not discussed in this memo.
6. Editor's Address
Keith McCloghrie
Cisco Systems, Inc.
170 West Tasman Drive
San Jose, CA 95134-1706
US
Phone: +1 408 526 5260
EMail: kzm@cisco.com
McCloghrie Standards Track [Page 10]

View File

@ -0,0 +1,675 @@
Network Working Group M. Mathis
Request for Comments: 2018 J. Mahdavi
Category: Standards Track PSC
S. Floyd
LBNL
A. Romanow
Sun Microsystems
October 1996
TCP Selective Acknowledgment Options
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Abstract
TCP may experience poor performance when multiple packets are lost
from one window of data. With the limited information available
from cumulative acknowledgments, a TCP sender can only learn about a
single lost packet per round trip time. An aggressive sender could
choose to retransmit packets early, but such retransmitted segments
may have already been successfully received.
A Selective Acknowledgment (SACK) mechanism, combined with a
selective repeat retransmission policy, can help to overcome these
limitations. The receiving TCP sends back SACK packets to the sender
informing the sender of data that has been received. The sender can
then retransmit only the missing data segments.
This memo proposes an implementation of SACK and discusses its
performance and related issues.
Acknowledgements
Much of the text in this document is taken directly from RFC1072 "TCP
Extensions for Long-Delay Paths" by Bob Braden and Van Jacobson. The
authors would like to thank Kevin Fall (LBNL), Christian Huitema
(INRIA), Van Jacobson (LBNL), Greg Miller (MITRE), Greg Minshall
(Ipsilon), Lixia Zhang (XEROX PARC and UCLA), Dave Borman (BSDI),
Allison Mankin (ISI) and others for their review and constructive
comments.
Mathis, et. al. Standards Track [Page 1]
RFC 2018 TCP Selective Acknowledgement Options October 1996
1. Introduction
Multiple packet losses from a window of data can have a catastrophic
effect on TCP throughput. TCP [Postel81] uses a cumulative
acknowledgment scheme in which received segments that are not at the
left edge of the receive window are not acknowledged. This forces
the sender to either wait a roundtrip time to find out about each
lost packet, or to unnecessarily retransmit segments which have been
correctly received [Fall95]. With the cumulative acknowledgment
scheme, multiple dropped segments generally cause TCP to lose its
ACK-based clock, reducing overall throughput.
Selective Acknowledgment (SACK) is a strategy which corrects this
behavior in the face of multiple dropped segments. With selective
acknowledgments, the data receiver can inform the sender about all
segments that have arrived successfully, so the sender need
retransmit only the segments that have actually been lost.
Several transport protocols, including NETBLT [Clark87], XTP
[Strayer92], RDP [Velten84], NADIR [Huitema81], and VMTP [Cheriton88]
have used selective acknowledgment. There is some empirical evidence
in favor of selective acknowledgments -- simple experiments with RDP
have shown that disabling the selective acknowledgment facility
greatly increases the number of retransmitted segments over a lossy,
high-delay Internet path [Partridge87]. A recent simulation study by
Kevin Fall and Sally Floyd [Fall95], demonstrates the strength of TCP
with SACK over the non-SACK Tahoe and Reno TCP implementations.
RFC1072 [VJ88] describes one possible implementation of SACK options
for TCP. Unfortunately, it has never been deployed in the Internet,
as there was disagreement about how SACK options should be used in
conjunction with the TCP window shift option (initially described in
RFC1072 and revised in [Jacobson92]).
We propose slight modifications to the SACK options as proposed in
RFC1072. Specifically, sending a selective acknowledgment for the
most recently received data reduces the need for long SACK options
[Keshav94, Mathis95]. In addition, the SACK option now carries full
32 bit sequence numbers. These two modifications represent the only
changes to the proposal in RFC1072. They make SACK easier to
implement and address concerns about robustness.
The selective acknowledgment extension uses two TCP options. The
first is an enabling option, "SACK-permitted", which may be sent in a
SYN segment to indicate that the SACK option can be used once the
connection is established. The other is the SACK option itself,
which may be sent over an established connection once permission has
been given by SACK-permitted.
Mathis, et. al. Standards Track [Page 2]
RFC 2018 TCP Selective Acknowledgement Options October 1996
The SACK option is to be included in a segment sent from a TCP that
is receiving data to the TCP that is sending that data; we will refer
to these TCP's as the data receiver and the data sender,
respectively. We will consider a particular simplex data flow; any
data flowing in the reverse direction over the same connection can be
treated independently.
2. Sack-Permitted Option
This two-byte option may be sent in a SYN by a TCP that has been
extended to receive (and presumably process) the SACK option once the
connection has opened. It MUST NOT be sent on non-SYN segments.
TCP Sack-Permitted Option:
Kind: 4
+---------+---------+
| Kind=4  | Length=2|
+---------+---------+
3. Sack Option Format
The SACK option is to be used to convey extended acknowledgment
information from the receiver to the sender over an established TCP
connection.
TCP SACK Option:
Kind: 5
Length: Variable
                  +--------+--------+
                  | Kind=5 | Length |
+--------+--------+--------+--------+
|      Left Edge of 1st Block       |
+--------+--------+--------+--------+
|      Right Edge of 1st Block      |
+--------+--------+--------+--------+
|                                   |
/            . . .                  /
|                                   |
+--------+--------+--------+--------+
|      Left Edge of nth Block       |
+--------+--------+--------+--------+
|      Right Edge of nth Block      |
+--------+--------+--------+--------+
Mathis, et. al. Standards Track [Page 3]
RFC 2018 TCP Selective Acknowledgement Options October 1996
The SACK option is to be sent by a data receiver to inform the data
sender of non-contiguous blocks of data that have been received and
queued. The data receiver awaits the receipt of data (perhaps by
means of retransmissions) to fill the gaps in sequence space between
received blocks. When missing segments are received, the data
receiver acknowledges the data normally by advancing the left window
edge in the Acknowledgement Number Field of the TCP header. The SACK
option does not change the meaning of the Acknowledgement Number
field.
This option contains a list of some of the blocks of contiguous
sequence space occupied by data that has been received and queued
within the window.
Each contiguous block of data queued at the data receiver is defined
in the SACK option by two 32-bit unsigned integers in network byte
order:
* Left Edge of Block
This is the first sequence number of this block.
* Right Edge of Block
This is the sequence number immediately following the last
sequence number of this block.
Each block represents received bytes of data that are contiguous and
isolated; that is, the bytes just below the block, (Left Edge of
Block - 1), and just above the block, (Right Edge of Block), have not
been received.
A SACK option that specifies n blocks will have a length of 8*n+2
bytes, so the 40 bytes available for TCP options can specify a
maximum of 4 blocks. It is expected that SACK will often be used in
conjunction with the Timestamp option used for RTTM [Jacobson92],
which takes an additional 10 bytes (plus two bytes of padding); thus
a maximum of 3 SACK blocks will be allowed in this case.
The SACK option is advisory, in that, while it notifies the data
sender that the data receiver has received the indicated segments,
the data receiver is permitted to later discard data which have been
reported in a SACK option. A discussion appears below in Section 8
of the consequences of advisory SACK, in particular that the data
receiver may renege, or drop already SACKed data.
Mathis, et. al. Standards Track [Page 4]
RFC 2018 TCP Selective Acknowledgement Options October 1996
4. Generating Sack Options: Data Receiver Behavior
If the data receiver has received a SACK-Permitted option on the SYN
for this connection, the data receiver MAY elect to generate SACK
options as described below. If the data receiver generates SACK
options under any circumstance, it SHOULD generate them under all
permitted circumstances. If the data receiver has not received a
SACK-Permitted option for a given connection, it MUST NOT send SACK
options on that connection.
If sent at all, SACK options SHOULD be included in all ACKs which do
not ACK the highest sequence number in the data receiver's queue. In
this situation the network has lost or mis-ordered data, such that
the receiver holds non-contiguous data in its queue. RFC 1122,
Section 4.2.2.21, discusses the reasons for the receiver to send ACKs
in response to additional segments received in this state. The
receiver SHOULD send an ACK for every valid segment that arrives
containing new data, and each of these "duplicate" ACKs SHOULD bear a
SACK option.
If the data receiver chooses to send a SACK option, the following
rules apply:
* The first SACK block (i.e., the one immediately following the
kind and length fields in the option) MUST specify the contiguous
block of data containing the segment which triggered this ACK,
unless that segment advanced the Acknowledgment Number field in
the header. This assures that the ACK with the SACK option
reflects the most recent change in the data receiver's buffer
queue.
* The data receiver SHOULD include as many distinct SACK blocks as
possible in the SACK option. Note that the maximum available
option space may not be sufficient to report all blocks present in
the receiver's queue.
* The SACK option SHOULD be filled out by repeating the most
recently reported SACK blocks (based on first SACK blocks in
previous SACK options) that are not subsets of a SACK block
already included in the SACK option being constructed. This
assures that in normal operation, any segment remaining part of a
non-contiguous block of data held by the data receiver is reported
in at least three successive SACK options, even for large-window
TCP implementations [RFC1323]. After the first SACK block, the
following SACK blocks in the SACK option may be listed in
arbitrary order.
Mathis, et. al. Standards Track [Page 5]
RFC 2018 TCP Selective Acknowledgement Options October 1996
It is very important that the SACK option always reports the block
containing the most recently received segment, because this provides
the sender with the most up-to-date information about the state of
the network and the data receiver's queue.
5. Interpreting the Sack Option and Retransmission Strategy: Data
Sender Behavior
When receiving an ACK containing a SACK option, the data sender
SHOULD record the selective acknowledgment for future reference. The
data sender is assumed to have a retransmission queue that contains
the segments that have been transmitted but not yet acknowledged, in
sequence-number order. If the data sender performs re-packetization
before retransmission, the block boundaries in a SACK option that it
receives may not fall on boundaries of segments in the retransmission
queue; however, this does not pose a serious difficulty for the
sender.
One possible implementation of the sender's behavior is as follows.
Let us suppose that for each segment in the retransmission queue
there is a (new) flag bit "SACKed", to be used to indicate that this
particular segment has been reported in a SACK option.
When an acknowledgment segment arrives containing a SACK option, the
data sender will turn on the SACKed bits for segments that have been
selectively acknowledged. More specifically, for each block in the
SACK option, the data sender will turn on the SACKed flags for all
segments in the retransmission queue that are wholly contained within
that block. This requires straightforward sequence number
comparisons.
After the SACKed bit is turned on (as the result of processing a
received SACK option), the data sender will skip that segment during
any later retransmission. Any segment that has the SACKed bit turned
off and is less than the highest SACKed segment is available for
retransmission.
After a retransmit timeout the data sender SHOULD turn off all of the
SACKed bits, since the timeout might indicate that the data receiver
has reneged. The data sender MUST retransmit the segment at the left
edge of the window after a retransmit timeout, whether or not the
SACKed bit is on for that segment. A segment will not be dequeued
and its buffer freed until the left window edge is advanced over it.
Mathis, et. al. Standards Track [Page 6]
RFC 2018 TCP Selective Acknowledgement Options October 1996
5.1 Congestion Control Issues
This document does not attempt to specify in detail the congestion
control algorithms for implementations of TCP with SACK. However,
the congestion control algorithms present in the de facto standard
TCP implementations MUST be preserved [Stevens94]. In particular, to
preserve robustness in the presence of packets reordered by the
network, recovery is not triggered by a single ACK reporting out-of-
order packets at the receiver. Further, during recovery the data
sender limits the number of segments sent in response to each ACK.
Existing implementations limit the data sender to sending one segment
during Reno-style fast recovery, or to two segments during slow-start
[Jacobson88]. Other aspects of congestion control, such as reducing
the congestion window in response to congestion, must similarly be
preserved.
The use of time-outs as a fall-back mechanism for detecting dropped
packets is unchanged by the SACK option. Because the data receiver
is allowed to discard SACKed data, when a retransmit timeout occurs
the data sender MUST ignore prior SACK information in determining
which data to retransmit.
Future research into congestion control algorithms may take advantage
of the additional information provided by SACK. One such area for
future research concerns modifications to TCP for a wireless or
satellite environment where packet loss is not necessarily an
indication of congestion.
6. Efficiency and Worst Case Behavior
If the return path carrying ACKs and SACK options were lossless, one
block per SACK option packet would always be sufficient. Every
segment arriving while the data receiver holds discontinuous data
would cause the data receiver to send an ACK with a SACK option
containing the one altered block in the receiver's queue. The data
sender is thus able to construct a precise replica of the receiver's
queue by taking the union of all the first SACK blocks.
Mathis, et. al. Standards Track [Page 7]
RFC 2018 TCP Selective Acknowledgement Options October 1996
Since the return path is not lossless, the SACK option is defined to
include more than one SACK block in a single packet. The redundant
blocks in the SACK option packet increase the robustness of SACK
delivery in the presence of lost ACKs. For a receiver that is also
using the time stamp option [Jacobson92], the SACK option has room to
include three SACK blocks. Thus each SACK block will generally be
repeated at least three times, if necessary, once in each of three
successive ACK packets. However, if all of the ACK packets reporting
a particular SACK block are dropped, then the sender might assume
that the data in that SACK block has not been received, and
unnecessarily retransmit those segments.
The deployment of other TCP options may reduce the number of
available SACK blocks to 2 or even to 1. This will reduce the
redundancy of SACK delivery in the presence of lost ACKs. Even so,
the exposure of TCP SACK in regard to the unnecessary retransmission
of packets is strictly less than the exposure of current
implementations of TCP. The worst-case conditions necessary for the
sender to needlessly retransmit data are discussed in more detail in a
separate document [Floyd96].
Older TCP implementations which do not have the SACK option will not
be unfairly disadvantaged when competing against SACK-capable TCPs.
This issue is discussed in more detail in [Floyd96].
7. Sack Option Examples
The following examples attempt to demonstrate the proper behavior of
SACK generation by the data receiver.
Assume the left window edge is 5000 and that the data transmitter
sends a burst of 8 segments, each containing 500 data bytes.
Case 1: The first 4 segments are received but the last 4 are
dropped.
The data receiver will return a normal TCP ACK segment
acknowledging sequence number 7000, with no SACK option.
Mathis, et. al. Standards Track [Page 8]
RFC 2018 TCP Selective Acknowledgement Options October 1996
Case 2: The first segment is dropped but the remaining 7 are
received.
Upon receiving each of the last seven packets, the data
receiver will return a TCP ACK segment that acknowledges
sequence number 5000 and contains a SACK option specifying
one block of queued data:
Triggering    ACK      Left Edge   Right Edge
Segment
5000          (lost)
5500          5000     5500        6000
6000          5000     5500        6500
6500          5000     5500        7000
7000          5000     5500        7500
7500          5000     5500        8000
8000          5000     5500        8500
8500          5000     5500        9000
Case 3: The 2nd, 4th, 6th, and 8th (last) segments are
dropped.
The data receiver ACKs the first packet normally. The
third, fifth, and seventh packets trigger SACK options as
follows:
Triggering    ACK      First Block   2nd Block     3rd Block
Segment                Left   Right  Left   Right  Left   Right
                       Edge   Edge   Edge   Edge   Edge   Edge
5000          5500
5500          (lost)
6000          5500     6000   6500
6500          (lost)
7000          5500     7000   7500   6000   6500
7500          (lost)
8000          5500     8000   8500   7000   7500   6000   6500
8500          (lost)
Mathis, et. al. Standards Track [Page 9]
RFC 2018 TCP Selective Acknowledgement Options October 1996
Suppose at this point, the 4th packet is received out of order.
(This could either be because the data was badly misordered in the
network, or because the 2nd packet was retransmitted and lost, and
then the 4th packet was retransmitted). At this point the data
receiver has only two SACK blocks to report. The data receiver
replies with the following Selective Acknowledgment:
Triggering ACK First Block 2nd Block 3rd Block
Segment Left Right Left Right Left Right
Edge Edge Edge Edge Edge Edge
6500 5500 6000 7500 8000 8500
Suppose at this point, the 2nd segment is received. The data
receiver then replies with the following Selective Acknowledgment:
Triggering ACK First Block 2nd Block 3rd Block
Segment Left Right Left Right Left Right
Edge Edge Edge Edge Edge Edge
5500 7500 8000 8500
8. Data Receiver Reneging
Note that the data receiver is permitted to discard data in its queue
that has not been acknowledged to the data sender, even if the data
has already been reported in a SACK option. Such discarding of
SACKed packets is discouraged, but may be used if the receiver runs
out of buffer space.
The data receiver MAY elect not to keep data which it has reported in
a SACK option. In this case, the receiver SACK generation is
additionally qualified:
* The first SACK block MUST reflect the newest segment. Even if
the newest segment is going to be discarded and the receiver has
already discarded adjacent segments, the first SACK block MUST
report, at a minimum, the left and right edges of the newest
segment.
* Except for the newest segment, all SACK blocks MUST NOT report
any old data which is no longer actually held by the receiver.
Since the data receiver may later discard data reported in a SACK
option, the sender MUST NOT discard data before it is acknowledged by
the Acknowledgment Number field in the TCP header.
Mathis, et. al. Standards Track [Page 10]
RFC 2018 TCP Selective Acknowledgement Options October 1996
9. Security Considerations
This document neither strengthens nor weakens TCP's current security
properties.
10. References
[Cheriton88] Cheriton, D., "VMTP: Versatile Message Transaction
Protocol", RFC 1045, Stanford University, February 1988.
[Clark87] Clark, D., Lambert, M., and L. Zhang, "NETBLT: A Bulk Data
Transfer Protocol", RFC 998, MIT, March 1987.
[Fall95] Fall, K. and Floyd, S., "Comparisons of Tahoe, Reno, and
Sack TCP", ftp://ftp.ee.lbl.gov/papers/sacks.ps.Z, December 1995.
[Floyd96] Floyd, S., "Issues of TCP with SACK",
ftp://ftp.ee.lbl.gov/papers/issues_sa.ps.Z, January 1996.
[Huitema81] Huitema, C., and Valet, I., An Experiment on High Speed
File Transfer using Satellite Links, 7th Data Communication
Symposium, Mexico, October 1981.
[Jacobson88] Jacobson, V., "Congestion Avoidance and Control",
Proceedings of SIGCOMM '88, Stanford, CA., August 1988.
[Jacobson88] Jacobson, V. and R. Braden, "TCP Extensions for Long-
Delay Paths", RFC 1072, October 1988.
[Jacobson92] Jacobson, V., Braden, R., and D. Borman, "TCP Extensions
for High Performance", RFC 1323, May 1992.
[Keshav94] Keshav, presentation to the Internet End-to-End Research
Group, November 1994.
[Mathis95] Mathis, M., and Mahdavi, J., TCP Forward Acknowledgment
Option, presentation to the Internet End-to-End Research Group, June
1995.
[Partridge87] Partridge, C., "Private Communication", February 1987.
[Postel81] Postel, J., "Transmission Control Protocol - DARPA
Internet Program Protocol Specification", RFC 793, DARPA, September
1981.
[Stevens94] Stevens, W., TCP/IP Illustrated, Volume 1: The Protocols,
Addison-Wesley, 1994.
Mathis, et. al. Standards Track [Page 11]
RFC 2018 TCP Selective Acknowledgement Options October 1996
[Strayer92] Strayer, T., Dempsey, B., and Weaver, A., XTP -- the
xpress transfer protocol. Addison-Wesley Publishing Company, 1992.
[Velten84] Velten, D., Hinden, R., and J. Sax, "Reliable Data
Protocol", RFC 908, BBN, July 1984.
11. Authors' Addresses
Matt Mathis and Jamshid Mahdavi
Pittsburgh Supercomputing Center
4400 Fifth Ave
Pittsburgh, PA 15213
mathis@psc.edu
mahdavi@psc.edu
Sally Floyd
Lawrence Berkeley National Laboratory
One Cyclotron Road
Berkeley, CA 94720
floyd@ee.lbl.gov
Allyn Romanow
Sun Microsystems, Inc.
2550 Garcia Ave., MPK17-202
Mountain View, CA 94043
allyn@eng.sun.com
Mathis, et. al. Standards Track [Page 12]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,619 @@
Network Working Group J. Touch
Request for Comments: 2140 ISI
Category: Informational April 1997
TCP Control Block Interdependence
Status of this Memo
This memo provides information for the Internet community. This memo
does not specify an Internet standard of any kind. Distribution of
this memo is unlimited.
Abstract
This memo makes the case for interdependent TCP control blocks, where
part of the TCP state is shared among similar concurrent connections,
or across similar connection instances. TCP state includes a
combination of parameters, such as connection state, current round-
trip time estimates, congestion control information, and process
information. This state is currently maintained on a per-connection
basis in the TCP control block, but should be shared across
connections to the same host. The goal is to improve transient
transport performance, while maintaining backward-compatibility with
existing implementations.
This document is a product of the LSAM project at ISI.
Introduction
TCP is a connection-oriented reliable transport protocol layered over
IP [9]. Each TCP connection maintains state, usually in a data
structure called the TCP Control Block (TCB). The TCB contains
information about the connection state, its associated local process,
and feedback parameters about the connection's transmission
properties. As originally specified and usually implemented, the TCB
is maintained on a per-connection basis. This document discusses the
implications of that decision, and argues for an alternate
implementation that shares some of this state across similar
connection instances and among similar simultaneous connections. The
resulting implementation can have better transient performance,
especially for numerous short-lived and simultaneous connections, as
often used in the World-Wide Web [1]. These changes affect only the
TCB initialization, and so have no effect on the long-term behavior
of TCP after a connection has been established.
Touch Informational [Page 1]
RFC 2140 TCP Control Block Interdependence April 1997
The TCP Control Block (TCB)
A TCB is associated with each connection, i.e., with each association
of a pair of applications across the network. The TCB can be
summarized as containing [9]:
Local process state
pointers to send and receive buffers
pointers to retransmission queue and current segment
pointers to Internet Protocol (IP) PCB
Per-connection shared state
macro-state
connection state
timers
flags
local and remote host numbers and ports
micro-state
send and receive window state (size*, current number)
round-trip time and variance
cong. window size*
cong. window size threshold*
max windows seen*
MSS#
round-trip time and variance#
The per-connection information is shown as split into macro-state and
micro-state, terminology borrowed from [5]. Macro-state describes the
finite state machine; we include the endpoint numbers and components
(timers, flags) used to help maintain that state. This includes the
protocol for establishing and maintaining shared state about the
connection. Micro-state describes the protocol after a connection has
been established, to maintain the reliability and congestion control
of the data transferred in the connection.
We further distinguish two other classes of shared micro-state that
are associated more with host-pairs than with application pairs. One
class is clearly host-pair dependent (#, e.g., MSS, RTT), and the
other is host-pair dependent in its aggregate (*, e.g., cong. window
info., curr. window sizes).
Touch Informational [Page 2]
RFC 2140 TCP Control Block Interdependence April 1997
TCB Interdependence
The observation that some TCB state is host-pair specific rather than
application-pair dependent is not new, and is a common engineering
decision in layered protocol implementations. A discussion of sharing
RTT information among protocols layered over IP, including UDP and
TCP, occurred in [8]. T/TCP uses caches to maintain TCB information
across instances, e.g., smoothed RTT, RTT variance, congestion
avoidance threshold, and MSS [3]. These values are in addition to
connection counts used by T/TCP to accelerate data delivery prior to
the full three-way handshake during an OPEN. The goal is to aggregate
TCB components where they reflect one association - that of the
host-pair, rather than artificially separating those components by
connection.
At least one current T/TCP implementation saves the MSS and
aggregates the RTT parameters across multiple connections, but omits
caching the congestion window information [4], as originally
specified in [2]. There may be other values that may be cached, such
as current window size, to permit new connections full access to
accumulated channel resources.
We observe that there are two cases of TCB interdependence. Temporal
sharing occurs when the TCB of an earlier (now CLOSED) connection to
a host is used to initialize some parameters of a new connection to
that same host. Ensemble sharing occurs when a currently active
connection to a host is used to initialize another (concurrent)
connection to that host. T/TCP documents considered the temporal
case; we consider both.
An Example of Temporal Sharing
Temporal sharing of cached TCB data has been implemented in the SunOS
4.1.3 T/TCP extensions [4] and the FreeBSD port of same [7]. As
mentioned before, only the MSS and RTT parameters are cached, as
originally specified in [2]. Later discussion of T/TCP suggested
including congestion control parameters in this cache [3].
The cache is accessed in two ways: it is read to initialize new TCBs,
and written when more current per-host state is available. New TCBs
are initialized as follows; snd_cwnd reuse is not yet implemented,
although discussed in the T/TCP concepts [2]:
Touch Informational [Page 3]
RFC 2140 TCP Control Block Interdependence April 1997
TEMPORAL SHARING - TCB Initialization
Cached TCB New TCB
----------------------------------------
old-MSS old-MSS
old-RTT old-RTT
old-RTTvar old-RTTvar
old-snd_cwnd old-snd_cwnd (not yet impl.)
Most cached TCB values are updated when a connection closes. An
exception is MSS, which is updated whenever the MSS option is
received in a TCP header.
TEMPORAL SHARING - Cache Updates
Cached TCB Current TCB when? New Cached TCB
---------------------------------------------------------------
old-MSS curr-MSS MSSopt curr-MSS
old-RTT curr-RTT CLOSE old += (curr - old) >> 2
old-RTTvar curr-RTTvar CLOSE old += (curr - old) >> 2
old-snd_cwnd curr-snd_cwnd CLOSE curr-snd_cwnd (not yet impl.)
MSS caching is trivial; reported values are cached, and the most
recent value is used. The cache is updated when the MSS option is
received, so the cache always has the most recent MSS value from any
connection. The cache is consulted only at connection establishment,
and not otherwise updated, which means that MSS options do not affect
current connections. The default MSS is never saved; only reported
MSS values update the cache, so an explicit override is required to
reduce the MSS.
RTT values are updated by a more complicated mechanism [3], [8].
Dynamic RTT estimation requires a sequence of RTT measurements, even
though a single T/TCP transaction may not accumulate enough samples.
As a result, the cached RTT (and its variance) is an average of its
previous value with the contents of the currently active TCB for that
host, when a TCB is closed. RTT values are updated only when a
connection is closed. Further, the method for averaging the RTT
values is not the same as the method for computing the RTT values
within a connection, so that the cached value may not be appropriate.
Touch Informational [Page 4]
RFC 2140 TCP Control Block Interdependence April 1997
For temporal sharing, the cache requires updating only when a
connection closes, because the cached values will not yet be used to
initialize a new TCB. For the ensemble sharing, this is not the case,
as discussed below.
Other TCB variables may also be cached between sequential instances,
such as the congestion control window information. Old cache values
can be overwritten with the current TCB estimates, or a MAX or MIN
function can be used to merge the results, depending on the optimism
or pessimism of the reused values. For example, the congestion window
can be reused if there are no concurrent connections.
An Example of Ensemble Sharing
Sharing cached TCB data across concurrent connections requires
attention to the aggregate nature of some of the shared state.
Although MSS and RTT values can be shared by copying, it may not be
appropriate to copy congestion window information. At this point, we
present only the MSS and RTT rules:
ENSEMBLE SHARING - TCB Initialization
Cached TCB New TCB
----------------------------------
old-MSS old-MSS
old-RTT old-RTT
old-RTTvar old-RTTvar
ENSEMBLE SHARING - Cache Updates
Cached TCB Current TCB when? New Cached TCB
-----------------------------------------------------------
old-MSS curr-MSS MSSopt curr-MSS
old-RTT curr-RTT update rtt_update(old,curr)
old-RTTvar curr-RTTvar update rtt_update(old,curr)
For ensemble sharing, TCB information should be cached as early as
possible, sometimes before a connection is closed. Otherwise, opening
multiple concurrent connections may not result in TCB data sharing if
no connection closes before others open. An optimistic solution would
Touch Informational [Page 5]
RFC 2140 TCP Control Block Interdependence April 1997
be to update cached data as early as possible, rather than only when
a connection is closing. Some T/TCP implementations do this for MSS
when the TCP MSS header option is received [4], although it is not
addressed specifically in the concepts or functional specification
[2][3].
In current T/TCP, RTT values are updated only after a CLOSE, which
does not benefit concurrent sessions. As mentioned in the temporal
case, averaging values between concurrent connections requires
incorporating new RTT measurements. The amount of work involved in
updating the aggregate average should be minimized, but the resulting
value should be equivalent to having all values measured within a
single connection. The function "rtt_update" in the ensemble sharing
table indicates this operation, which occurs whenever the RTT would
have been updated in the individual TCP connection. As a result, the
cache contains the shared RTT variables, which no longer need to
reside in the TCB [8].
Congestion window size aggregation is more complicated in the
concurrent case. When there is an ensemble of connections, we need
to decide how that ensemble would have shared the congestion window,
in order to derive initial values for new TCBs. Because concurrent
connections between two hosts share network paths (usually), they
also share whatever capacity exists along that path. With regard to
congestion, the set of connections might behave as if it were
multiplexed prior to TCP, as if all data were part of a single
connection. As a result, the current window sizes would maintain a
constant sum, presuming sufficient offered load. This would go beyond
caching to truly sharing state, as in the RTT case.
We pause to note that any assumption of this sharing can be
incorrect, including this one. In current implementations, new
congestion windows are set at an initial value of one segment, so
that the sum of the current windows is increased for any new
connection. This can have detrimental consequences where several
connections share a highly congested link, such as in trans-Atlantic
Web access.
There are several ways to initialize the congestion window in a new
TCB among an ensemble of current connections to a host, as shown
below. Current TCP implementations initialize it to one segment [9],
and T/TCP hinted that it should be initialized to the old window size
[3]. In the former, the assumption is that new connections should
behave as conservatively as possible. In the latter, no accommodation
is made to concurrent aggregate behavior.
In either case, the sum of window sizes can increase, rather than
remain constant. Another solution is to give each pending connection
Touch Informational [Page 6]
RFC 2140 TCP Control Block Interdependence April 1997
its "fair share" of the available congestion window, and let the
connections balance from there. The assumption we make here is that
new connections are implicit requests for an equal share of available
link bandwidth which should be granted at the expense of current
connections. This may or may not be the appropriate function; we
propose that it be examined further.
ENSEMBLE SHARING - TCB Initialization
Some Options for Sharing Window-size
Cached TCB New TCB
-----------------------------------------------------------------
old-snd_cwnd (current) one segment
(T/TCP hint) old-snd_cwnd
(proposed) old-snd_cwnd/(N+1)
subtract old-snd_cwnd/(N+1)/N
from each concurrent
ENSEMBLE SHARING - Cache Updates
Cached TCB Current TCB when? New Cached TCB
----------------------------------------------------------------
old-snd_cwnd curr-snd_cwnd update (adjust sum as appropriate)
Compatibility Issues
Current TCP implementations do not use TCB caching, with the
exception of T/TCP variants [4][7]. New connections use the default
initial values of all non-instantiated TCB variables. As a result,
each connection calculates its own RTT measurements, MSS value, and
congestion information. Eventually these values are updated for each
connection.
For the congestion and current window information, the initial values
may not be consistent with the long-term aggregate behavior of a set
of concurrent connections. If a single connection has a window of 4
segments, new connections assume initial windows of 1 segment (the
minimum), although the current connection's window doesn't decrease
to accommodate this additional load. As a result, connections can
mutually interfere. One example of this has been seen on trans-
Atlantic links, where concurrent connections supporting Web traffic
can collide because their initial windows are too large, even when
set at one segment.
Touch Informational [Page 7]
RFC 2140 TCP Control Block Interdependence April 1997
Because this proposal attempts to anticipate the aggregate steady-
state values of TCB state among a group or over time, it should avoid
the transient effects of new connections. In addition, because it
considers the ensemble and temporal properties of those aggregates,
it should also prevent the transients of short-lived or multiple
concurrent connections from adversely affecting the overall network
performance. We are performing analysis and experiments to validate
these assumptions.
Performance Considerations
Here we attempt to optimize transient behavior of TCP without
modifying its long-term properties. The predominant expense is in
maintaining the cached values, or in using per-host state rather than
per-connection state. In cases where performance is affected,
however, we note that the per-host information can be kept in per-
connection copies (as done now), because with higher performance
should come less interference between concurrent connections.
Sharing TCB state can occur only at connection establishment and
close (to update the cache), to minimize overhead, optimize transient
behavior, and minimize the effect on the steady-state. It is possible
that sharing state during a connection, as in the RTT or window-size
variables, may be of benefit, provided its implementation cost is not
high.
Implications
There are several implications to incorporating TCB interdependence
in TCP implementations. First, it may prevent the need for
application-layer multiplexing for performance enhancement [6].
Protocols like persistent-HTTP avoid connection reestablishment costs
by serializing or multiplexing a set of per-host connections across a
single TCP connection. This avoids TCP's per-connection OPEN
handshake, and also avoids recomputing MSS, RTT, and congestion
windows. By avoiding the so-called, "slow-start restart," performance
can be optimized. Our proposal provides the MSS, RTT, and OPEN
handshake avoidance of T/TCP, and the "slow-start restart avoidance"
of multiplexing, without requiring a multiplexing mechanism at the
application layer. This multiplexing will be complicated when
quality-of-service mechanisms (e.g., "integrated services
scheduling") are provided later.
Second, we are attempting to push some of the TCP implementation from
the traditional transport layer (in the ISO model [10]), to the
network layer. This acknowledges that some state currently maintained
as per-connection is in fact per-path, which we simplify as per-
host-pair. Transport protocols typically manage per-application-pair
Touch Informational [Page 8]
RFC 2140 TCP Control Block Interdependence April 1997
associations (per stream), and network protocols manage per-path
associations (routing). Round-trip time, MSS, and congestion
information is more appropriately handled in a network-layer fashion,
aggregated among concurrent connections, and shared across connection
instances.
An earlier version of RTT sharing suggested implementing RTT state at
the IP layer, rather than at the TCP layer [8]. Our observations are
for sharing state among TCP connections, which avoids some of the
difficulties in an IP-layer solution. One such problem is determining
the associated prior outgoing packet for an incoming packet, to infer
RTT from the exchange. Because RTTs are still determined inside the
TCP layer, this is simpler than at the IP layer. This is a case where
information should be computed at the transport layer, but shared at
the network layer.
We also note that per-host-pair associations are not the limit of
these techniques. It is possible that TCBs could be similarly shared
between hosts on a LAN, because the predominant path can be LAN-LAN,
rather than host-host.
There may be other information that can be shared between concurrent
connections. For example, knowing that another connection has just
tried to expand its window size and failed, a connection may not
attempt to do the same for some period. The idea is that existing TCP
implementations infer the behavior of all competing connections,
including those within the same host or LAN. One possible
optimization is to make that implicit feedback explicit, via extended
information in the per-host TCP area.
Security Considerations
These suggested implementation enhancements do not have additional
ramifications for direct attacks. These enhancements may be
susceptible to denial-of-service attacks if not otherwise secured.
For example, an application can open a connection and set its window
size to 0, denying service to any other subsequent connection between
those hosts.
TCB sharing may be susceptible to denial-of-service attacks, wherever
the TCB is shared, between connections in a single host, or between
hosts if TCB sharing is implemented on the LAN (see Implications
section). Some shared TCB parameters are used only to create new
TCBs, others are shared among the TCBs of ongoing connections. New
connections can join the ongoing set, e.g., to optimize send window
size among a set of connections to the same host.
Touch Informational [Page 9]
RFC 2140 TCP Control Block Interdependence April 1997
Attacks on parameters used only for initialization affect only the
transient performance of a TCP connection. For short connections,
the performance ramification can approach that of a denial-of-service
attack. E.g., if an application changes its TCB to have a false and
small window size, subsequent connections would experience
performance degradation until their window grew appropriately.
The solution is to limit the effect of compromised TCB values. TCBs
are compromised when they are modified directly by an application or
transmitted between hosts via unauthenticated means (e.g., by using a
dirty flag). TCBs that are not compromised by application
modification do not have any unique security ramifications. Note that
the proposed parameters for TCB sharing are not currently modifiable
by an application.
All shared TCBs MUST be validated against default minimum parameters
before being used for new connections. This validation would not impact
performance, because it occurs only at TCB initialization. This
limits the effect of attacks on new connections, to reducing the
benefit of TCB sharing, resulting in the current default TCP
performance. For ongoing connections, the effect of incoming packets
on shared information should be both limited and validated against
constraints before use. This is a beneficial precaution for existing
TCP implementations as well.
TCBs modified by an application SHOULD NOT be shared, unless the new
connection sharing the compromised information has been given
explicit permission to use such information by the connection API. No
mechanism for that indication currently exists, but it could be
supported by an augmented API. This sharing restriction SHOULD be
implemented in both the host and the LAN. Sharing on a LAN SHOULD
utilize authentication to prevent undetected tampering of shared TCB
parameters. These restrictions limit the security impact of modified
TCBs both for connection initialization and for ongoing connections.
Finally, shared values MUST be limited to performance factors only.
Other information, such as TCP sequence numbers, when shared, is
already known to compromise security.
Acknowledgements
The author would like to thank the members of the High-Performance
Computing and Communications Division at ISI, notably Bill Manning,
Bob Braden, Jon Postel, Ted Faber, and Cliff Neuman for their
assistance in the development of this memo.
Touch Informational [Page 10]
RFC 2140 TCP Control Block Interdependence April 1997
References
[1] Berners-Lee, T., et al., "The World-Wide Web," Communications of
the ACM, V37, Aug. 1994, pp. 76-82.
[2] Braden, R., "Transaction TCP -- Concepts," RFC-1379,
USC/Information Sciences Institute, September 1992.
[3] Braden, R., "T/TCP -- TCP Extensions for Transactions Functional
Specification," RFC-1644, USC/Information Sciences Institute,
July 1994.
[4] Braden, B., "T/TCP -- Transaction TCP: Source Changes for Sun OS
4.1.3," Release 1.0, USC/ISI, September 14, 1994.
[5] Comer, D., and Stevens, D., Internetworking with TCP/IP, V2,
Prentice-Hall, NJ, 1991.
[6] Fielding, R., et al., "Hypertext Transfer Protocol -- HTTP/1.1,"
Work in Progress.
[7] FreeBSD source code, Release 2.10, <http://www.freebsd.org/>.
[8] Jacobson, V., (mail to public list "tcp-ip", no archive found),
1986.
[9] Postel, Jon, "Transmission Control Protocol," Network Working
Group RFC-793/STD-7, ISI, Sept. 1981.
[10] Tanenbaum, A., Computer Networks, Prentice-Hall, NJ, 1988.
Author's Address
Joe Touch
University of Southern California/Information Sciences Institute
4676 Admiralty Way
Marina del Rey, CA 90292-6695
USA
Phone: +1 310-822-1511 x151
Fax: +1 310-823-6714
URL: http://www.isi.edu/~touch
Email: touch@isi.edu
Touch Informational [Page 11]

View File

@ -0,0 +1,395 @@
Network Working Group G. Malkin
Request for Comments: 2347 Bay Networks
Updates: 1350 A. Harkin
Obsoletes: 1782 Hewlett Packard Co.
Category: Standards Track May 1998
TFTP Option Extension
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1998). All Rights Reserved.
Abstract
The Trivial File Transfer Protocol [1] is a simple, lock-step, file
transfer protocol which allows a client to get or put a file onto a
remote host. This document describes a simple extension to TFTP to
allow option negotiation prior to the file transfer.
Introduction
The option negotiation mechanism proposed in this document is a
backward-compatible extension to the TFTP protocol. It allows file
transfer options to be negotiated prior to the transfer using a
mechanism which is consistent with TFTP's Request Packet format. The
mechanism is kept simple by enforcing a request-respond-acknowledge
sequence, similar to the lock-step approach taken by TFTP itself.
While the option negotiation mechanism is general purpose, in that
many types of options may be negotiated, it was created to support
the Blocksize option defined in [2]. Additional options are defined
in [3].
Packet Formats
TFTP options are appended to the Read Request and Write Request
packets. A new type of TFTP packet, the Option Acknowledgment
(OACK), is used to acknowledge a client's option negotiation request.
A new error code, 8, is hereby defined to indicate that a transfer
Malkin & Harkin Standards Track [Page 1]
RFC 2347 TFTP Option Extension May 1998
should be terminated due to option negotiation.
Options are appended to a TFTP Read Request or Write Request packet
as follows:
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+-->
| opc |filename| 0 | mode | 0 | opt1 | 0 | value1 | 0 | <
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+-->
>-------+---+---~~---+---+
< optN | 0 | valueN | 0 |
>-------+---+---~~---+---+
opc
The opcode field contains either a 1, for Read Requests, or 2,
for Write Requests, as defined in [1].
filename
The name of the file to be read or written, as defined in [1].
This is a NULL-terminated field.
mode
The mode of the file transfer: "netascii", "octet", or "mail",
as defined in [1]. This is a NULL-terminated field.
opt1
The first option, in case-insensitive ASCII (e.g., blksize).
This is a NULL-terminated field.
value1
The value associated with the first option, in case-
insensitive ASCII. This is a NULL-terminated field.
optN, valueN
The final option/value pair. Each NULL-terminated field is
specified in case-insensitive ASCII.
The options and values are all NULL-terminated, in keeping with the
original request format. If multiple options are to be negotiated,
they are appended to each other. The order in which options are
specified is not significant. The maximum size of a request packet
is 512 octets.
The OACK packet has the following format:
Malkin & Harkin Standards Track [Page 2]
RFC 2347 TFTP Option Extension May 1998
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+
| opc | opt1 | 0 | value1 | 0 | optN | 0 | valueN | 0 |
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+
opc
The opcode field contains a 6, for Option Acknowledgment.
opt1
The first option acknowledgment, copied from the original
request.
value1
The acknowledged value associated with the first option. If
and how this value may differ from the original request is
detailed in the specification for the option.
optN, valueN
The final option/value acknowledgment pair.
Negotiation Protocol
The client appends options at the end of the Read Request or Write
request packet, as shown above. Any number of options may be
specified; however, an option may only be specified once. The order
of the options is not significant.
If the server supports option negotiation, and it recognizes one or
more of the options specified in the request packet, the server may
respond with an Options Acknowledgment (OACK). Each option the
server recognizes, and accepts the value for, is included in the
OACK. Some options may allow alternate values to be proposed, but
this is an option specific feature. The server must not include in
the OACK any option which had not been specifically requested by the
client; that is, only the client may initiate option negotiation.
Options which the server does not support should be omitted from the
OACK; they should not cause an ERROR packet to be generated. If the
value of a supported option is invalid, the specification for that
option will indicate whether the server should simply omit the option
from the OACK, respond with an alternate value, or send an ERROR
packet, with error code 8, to terminate the transfer.
An option not acknowledged by the server must be ignored by the
client and server as if it were never requested. If multiple options
were requested, the client must use those options which were
acknowledged by the server and must not use those options which were
not acknowledged by the server.
Malkin & Harkin Standards Track [Page 3]
RFC 2347 TFTP Option Extension May 1998
When the client appends options to the end of a Read Request packet,
three possible responses may be returned by the server:
OACK - acknowledge of Read Request and the options;
DATA - acknowledge of Read Request, but not the options;
ERROR - the request has been denied.
When the client appends options to the end of a Write Request packet,
three possible responses may be returned by the server:
OACK - acknowledge of Write Request and the options;
ACK - acknowledge of Write Request, but not the options;
ERROR - the request has been denied.
If a server implementation does not support option negotiation, it
will likely ignore any options appended to the client's request. In
this case, the server will return a DATA packet for a Read Request
and an ACK packet for a Write Request establishing normal TFTP data
transfer. In the event that a server returns an error for a request
which carries an option, the client may attempt to repeat the request
without appending any options. This implementation option would
handle servers which consider extraneous data in the request packet
to be erroneous.
Depending on the original transfer request there are two ways for a
client to confirm acceptance of a server's OACK. If the transfer was
initiated with a Read Request, then an ACK (with the data block
number set to 0) is sent by the client to confirm the values in the
server's OACK packet. If the transfer was initiated with a Write
Request, then the client begins the transfer with the first DATA
packet, using the negotiated values. If the client rejects the OACK,
then it sends an ERROR packet, with error code 8, to the server and
the transfer is terminated.
Once a client acknowledges an OACK, with an appropriate non-error
response, that client has agreed to use only the options and values
returned by the server. Remember that the server cannot request an
option; it can only respond to them. If the client receives an OACK
containing an unrequested option, it should respond with an ERROR
packet, with error code 8, and terminate the transfer.
Malkin & Harkin Standards Track [Page 4]
RFC 2347 TFTP Option Extension May 1998
Examples
Read Request
client server
-------------------------------------------------------
|1|foofile|0|octet|0|blksize|0|1432|0| --> RRQ
<-- |6|blksize|0|1432|0| OACK
|4|0| --> ACK
<-- |3|1| 1432 octets of data | DATA
|4|1| --> ACK
<-- |3|2| 1432 octets of data | DATA
|4|2| --> ACK
<-- |3|3|<1432 octets of data | DATA
|4|3| --> ACK
Write Request
client server
-------------------------------------------------------
|2|barfile|0|octet|0|blksize|0|2048|0| --> WRQ
<-- |6|blksize|0|2048|0| OACK
|3|1| 2048 octets of data | --> DATA
<-- |4|1| ACK
|3|2| 2048 octets of data | --> DATA
<-- |4|2| ACK
|3|3|<2048 octets of data | --> DATA
<-- |4|3| ACK
Security Considerations
The basic TFTP protocol has no security mechanism. This is why it
has no rename, delete, or file overwrite capabilities. This document
does not add any security to TFTP; however, the specified extensions
do not add any additional security risks.
References
[1] Sollins, K., "The TFTP Protocol (Revision 2)", STD 33, RFC 1350,
October 1992.
[2] Malkin, G., and A. Harkin, "TFTP Blocksize Option", RFC 2348,
May 1998.
[3] Malkin, G., and A. Harkin, "TFTP Timeout Interval and Transfer
Size Options", RFC 2349, May 1998.
Malkin & Harkin Standards Track [Page 5]
RFC 2347 TFTP Option Extension May 1998
Authors' Addresses
Gary Scott Malkin
Bay Networks
8 Federal Street
Billerica, MA 01821
Phone: (978) 916-4237
EMail: gmalkin@baynetworks.com
Art Harkin
Internet Services Project
Information Networks Division
19420 Homestead Road MS 43LN
Cupertino, CA 95014
Phone: (408) 447-3755
EMail: ash@cup.hp.com
Malkin & Harkin Standards Track [Page 6]
RFC 2347 TFTP Option Extension May 1998
Full Copyright Statement
Copyright (C) The Internet Society (1998). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Malkin & Harkin Standards Track [Page 7]

View File

@ -0,0 +1,283 @@
Network Working Group G. Malkin
Request for Comments: 2349 Bay Networks
Updates: 1350 A. Harkin
Obsoletes: 1784 Hewlett Packard Co.
Category: Standards Track May 1998
TFTP Timeout Interval and Transfer Size Options
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1998). All Rights Reserved.
Abstract
The Trivial File Transfer Protocol [1] is a simple, lock-step, file
transfer protocol which allows a client to get or put a file onto a
remote host.
This document describes two TFTP options. The first allows the client
and server to negotiate the Timeout Interval. The second allows the
side receiving the file to determine the ultimate size of the
transfer before it begins. The TFTP Option Extension mechanism is
described in [2].
Timeout Interval Option Specification
The TFTP Read Request or Write Request packet is modified to include
the timeout option as follows:
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+
| opc |filename| 0 | mode | 0 | timeout| 0 | #secs | 0 |
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+
opc
The opcode field contains either a 1, for Read Requests, or 2,
for Write Requests, as defined in [1].
Malkin & Harkin Standards Track [Page 1]
RFC 2349 TFTP Timeout Interval and Transfer Size Options May 1998
filename
The name of the file to be read or written, as defined in [1].
This is a NULL-terminated field.
mode
The mode of the file transfer: "netascii", "octet", or "mail",
as defined in [1]. This is a NULL-terminated field.
timeout
The Timeout Interval option, "timeout" (case-insensitive).
This is a NULL-terminated field.
#secs
The number of seconds to wait before retransmitting, specified
in ASCII. Valid values range between "1" and "255" seconds,
inclusive. This is a NULL-terminated field.
For example:
+-------+--------+---+--------+---+--------+---+-------+---+
| 1 | foobar | 0 | octet | 0 | timeout| 0 | 1 | 0 |
+-------+--------+---+--------+---+--------+---+-------+---+
is a Read Request, for the file named "foobar", in octet (binary)
transfer mode, with a timeout interval of 1 second.
If the server is willing to accept the timeout option, it sends an
Option Acknowledgment (OACK) to the client. The specified timeout
value must match the value specified by the client.
Transfer Size Option Specification
The TFTP Read Request or Write Request packet is modified to include
the tsize option as follows:
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+
| opc |filename| 0 | mode | 0 | tsize | 0 | size | 0 |
+-------+---~~---+---+---~~---+---+---~~---+---+---~~---+---+
opc
The opcode field contains either a 1, for Read Requests, or 2,
for Write Requests, as defined in [1].
filename
The name of the file to be read or written, as defined in [1].
This is a NULL-terminated field.
Malkin & Harkin Standards Track [Page 2]
RFC 2349 TFTP Timeout Interval and Transfer Size Options May 1998
mode
The mode of the file transfer: "netascii", "octet", or "mail",
as defined in [1]. This is a NULL-terminated field.
tsize
The Transfer Size option, "tsize" (case-insensitive). This is
a NULL-terminated field.
size
The size of the file to be transferred. This is a NULL-
terminated field.
For example:
+-------+--------+---+--------+---+--------+---+--------+---+
| 2 | foobar | 0 | octet | 0 | tsize | 0 | 673312 | 0 |
+-------+--------+---+--------+---+--------+---+--------+---+
is a Write Request, with the 673312-octet file named "foobar", in
octet (binary) transfer mode.
In Read Request packets, a size of "0" is specified in the request
and the size of the file, in octets, is returned in the OACK. If the
file is too large for the client to handle, it may abort the transfer
with an Error packet (error code 3). In Write Request packets, the
size of the file, in octets, is specified in the request and echoed
back in the OACK. If the file is too large for the server to handle,
it may abort the transfer with an Error packet (error code 3).
Security Considerations
The basic TFTP protocol has no security mechanism. This is why it
has no rename, delete, or file overwrite capabilities. This document
does not add any security to TFTP; however, the specified extensions
do not add any additional security risks.
References
[1] Sollins, K., "The TFTP Protocol (Revision 2)", STD 33, RFC 1350,
October 1992.
[2] Malkin, G., and A. Harkin, "TFTP Option Extension", RFC 2347,
May 1998.
Malkin & Harkin Standards Track [Page 3]
RFC 2349 TFTP Timeout Interval and Transfer Size Options May 1998
Authors' Addresses
Gary Scott Malkin
Bay Networks
8 Federal Street
Billerica, MA 01821
Phone: (978) 916-4237
EMail: gmalkin@baynetworks.com
Art Harkin
Internet Services Project
Information Networks Division
19420 Homestead Road MS 43LN
Cupertino, CA 95014
Phone: (408) 447-3755
EMail: ash@cup.hp.com
Malkin & Harkin Standards Track [Page 4]
RFC 2349 TFTP Timeout Interval and Transfer Size Options May 1998
Full Copyright Statement
Copyright (C) The Internet Society (1998). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Malkin & Harkin Standards Track [Page 5]

View File

@ -0,0 +1,339 @@
Network Working Group A. Heffernan
Request for Comments: 2385 cisco Systems
Category: Standards Track August 1998
Protection of BGP Sessions via the TCP MD5 Signature Option
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1998). All Rights Reserved.
IESG Note
This document describes current existing practice for securing BGP
against certain simple attacks. It is understood to have security
weaknesses against concerted attacks.
Abstract
This memo describes a TCP extension to enhance security for BGP. It
defines a new TCP option for carrying an MD5 [RFC1321] digest in a
TCP segment. This digest acts like a signature for that segment,
incorporating information known only to the connection end points.
Since BGP uses TCP as its transport, using this option in the way
described in this paper significantly reduces the danger from certain
security attacks on BGP.
1.0 Introduction
The primary motivation for this option is to allow BGP to protect
itself against the introduction of spoofed TCP segments into the
connection stream. Of particular concern are TCP resets.
To spoof a connection using the scheme described in this paper, an
attacker would not only have to guess TCP sequence numbers, but would
also have had to obtain the password included in the MD5 digest.
This password never appears in the connection stream, and the actual
form of the password is up to the application. It could even change
Heffernan Standards Track [Page 1]
RFC 2385 TCP MD5 Signature Option August 1998
during the lifetime of a particular connection so long as this change
was synchronized on both ends (although retransmission can become
problematical in some TCP implementations with changing passwords).
Finally, there is no negotiation for the use of this option in a
connection, rather it is purely a matter of site policy whether or
not its connections use the option.
2.0 Proposal
Every segment sent on a TCP connection to be protected against
spoofing will contain the 16-byte MD5 digest produced by applying the
MD5 algorithm to these items in the following order:
1. the TCP pseudo-header (in the order: source IP address,
destination IP address, zero-padded protocol number, and
segment length)
2. the TCP header, excluding options, and assuming a checksum of
zero
3. the TCP segment data (if any)
4. an independently-specified key or password, known to both TCPs
and presumably connection-specific
The header and pseudo-header are in network byte order. The nature
of the key is deliberately left unspecified, but it must be known by
both ends of the connection. A particular TCP implementation will
determine what the application may specify as the key.
Upon receiving a signed segment, the receiver must validate it by
calculating its own digest from the same data (using its own key) and
comparing the two digests. A failing comparison must result in the
segment being dropped and must not produce any response back to the
sender. Logging the failure is probably advisable.
Unlike other TCP extensions (e.g., the Window Scale option
[RFC1323]), the absence of the option in the SYN,ACK segment must not
cause the sender to disable its sending of signatures. This
negotiation is typically done to prevent some TCP implementations
from misbehaving upon receiving options in non-SYN segments. This is
not a problem for this option, since the SYN,ACK sent during
connection negotiation will not be signed and will thus be ignored.
The connection will never be made, and non-SYN segments with options
will never be sent. More importantly, the sending of signatures must
be under the complete control of the application, not at the mercy of
the remote host not understanding the option.
Heffernan Standards Track [Page 2]
RFC 2385 TCP MD5 Signature Option August 1998
3.0 Syntax
The proposed option has the following format:
+---------+---------+-------------------+
| Kind=19 |Length=18| MD5 digest... |
+---------+---------+-------------------+
| |
+---------------------------------------+
| |
+---------------------------------------+
| |
+-------------------+-------------------+
| |
+-------------------+
The MD5 digest is always 16 bytes in length, and the option would
appear in every segment of a connection.
4.0 Some Implications
4.1 Connectionless Resets
A connectionless reset will be ignored by the receiver of the reset,
since the originator of that reset does not know the key, and so
cannot generate the proper signature for the segment. This means,
for example, that connection attempts by a TCP which is generating
signatures to a port with no listener will time out instead of being
refused. Similarly, resets generated by a TCP in response to
segments sent on a stale connection will also be ignored.
Operationally this can be a problem since resets help BGP recover
quickly from peer crashes.
4.2 Performance
The performance hit in calculating digests may inhibit the use of
this option. Some measurements of a sample implementation showed
that on a 100 MHz R4600, generating a signature for simple ACK
segment took an average of 0.0268 ms, while generating a signature
for a data segment carrying 4096 bytes of data took 0.8776 ms on
average. These times would be applied to both the input and output
paths, with the input path also bearing the cost of a 16-byte
compare.
Heffernan Standards Track [Page 3]
RFC 2385 TCP MD5 Signature Option August 1998
4.3 TCP Header Size
As with other options that are added to every segment, the size of
the MD5 option must be factored into the MSS offered to the other
side during connection negotiation. Specifically, the size of the
header to subtract from the MTU (whether it is the MTU of the
outgoing interface or IP's minimal MTU of 576 bytes) is now at least
18 bytes larger.
The total header size is also an issue. The TCP header specifies
where segment data starts with a 4-bit field which gives the total
size of the header (including options) in 32-bit words. This means
that the total size of the header plus option must be less than or
equal to 60 bytes -- this leaves 40 bytes for options.
As a concrete example, 4.4BSD defaults to sending window-scaling and
timestamp information for connections it initiates. The most loaded
segment will be the initial SYN packet to start the connection. With
MD5 signatures, the SYN packet will contain the following:
-- 4 bytes MSS option
-- 4 bytes window scale option (3 bytes padded to 4 in 4.4BSD)
-- 12 bytes for timestamp (4.4BSD pads the option as recommended
in RFC 1323 Appendix A)
-- 18 bytes for MD5 digest
-- 2 bytes for end-of-option-list, to pad to a 32-bit boundary.
This sums to 40 bytes, which just makes it.
4.4 MD5 as a Hashing Algorithm
Since this memo was first issued (under a different title), the MD5
algorithm has been found to be vulnerable to collision search attacks
[Dobb], and is considered by some to be insufficiently strong for
this type of application.
This memo still specifies the MD5 algorithm, however, since the
option has already been deployed operationally, and there was no
"algorithm type" field defined to allow an upgrade using the same
option number. The original document did not specify a type field
since this would require at least one more byte, and it was felt at
the time that taking 19 bytes for the complete option (which would
probably be padded to 20 bytes in TCP implementations) would be too
much of a waste of the already limited option space.
Heffernan Standards Track [Page 4]
RFC 2385 TCP MD5 Signature Option August 1998
This does not prevent the deployment of another similar option which
uses another hashing algorithm (like SHA-1). Also, if most
implementations pad the 18 byte option as defined to 20 bytes anyway,
it would be just as well to define a new option which contains an
algorithm type field.
This would need to be addressed in another document, however.
4.5 Key configuration
It should be noted that the key configuration mechanism of routers
may restrict the possible keys that may be used between peers. It is
strongly recommended that an implementation be able to support at
minimum a key composed of a string of printable ASCII of 80 bytes or
less, as this is current practice.
5.0 Security Considerations
This document defines a weak but currently practiced security
mechanism for BGP. It is anticipated that future work will provide
different stronger mechanisms for dealing with these issues.
6.0 References
[RFC1321] Rivest, R., "The MD5 Message-Digest Algorithm," RFC 1321,
April 1992.
[RFC1323] Jacobson, V., Braden, R., and D. Borman, "TCP Extensions
for High Performance", RFC 1323, May 1992.
[Dobb] H. Dobbertin, "The Status of MD5 After a Recent Attack", RSA
Labs' CryptoBytes, Vol. 2 No. 2, Summer 1996.
http://www.rsa.com/rsalabs/pubs/cryptobytes.html
Author's Address
Andy Heffernan
cisco Systems
170 West Tasman Drive
San Jose, CA 95134 USA
Phone: +1 408 526-8115
EMail: ahh@cisco.com
Heffernan Standards Track [Page 5]
RFC 2385 TCP MD5 Signature Option August 1998
Full Copyright Statement
Copyright (C) The Internet Society (1998). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Heffernan Standards Track [Page 6]

View File

@ -0,0 +1,843 @@
Network Working Group S. Parker
Request for Comments: 2398 C. Schmechel
FYI: 33 Sun Microsystems, Inc.
Category: Informational August 1998
Some Testing Tools for TCP Implementors
Status of this Memo
This memo provides information for the Internet community. It does
not specify an Internet standard of any kind. Distribution of this
memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1998). All Rights Reserved.
1. Introduction
Available tools for testing TCP implementations are catalogued by
this memo. Hopefully disseminating this information will encourage
those responsible for building and maintaining TCP to make the best
use of available tests. The type of testing the tool provides, the
type of tests it is capable of doing, and its availability is
enumerated. This document lists only tools which can evaluate one or
more TCP implementations, or which can provide some specific results
which describe or evaluate the TCP being tested. A number of these
tools produce time-sequence plots, see
Tim Shepard's thesis [She91] for a general discussion of these plots.
Each tool is defined as follows:
Name
The name associated with the testing tool.
Category
One or more categories of tests which the tools are capable of
providing. Categories used are: functional correctness, performance,
stress. Functional correctness tests how stringent a TCP
implementation is to the RFC specifications. Performance tests how
Parker & Schmechel Informational [Page 1]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
quickly a TCP implementation can send and receive data, etc. Stress
tests how a TCP implementation is affected under high load
conditions.
Description
A description of the tool's construction, and the implementation
methodology of the tests.
Automation
What steps are required to complete the test? What human
intervention is required?
Availability
How do you retrieve this tool and get more information about it?
Required Environment
Compilers, OS version, etc. required to build and/or run the
associated tool.
References
A list of publications relating to the tool, if any.
2. Tools
2.1. Dbs
Author
Yukio Murayama
Category
Performance / Stress
Description
Dbs is a tool which allows multiple data transfers to be coordinated,
and the resulting TCP behavior to be reviewed. Results are presented
as ASCII log files.
Automation
Command of execution is driven by a script file.
Parker & Schmechel Informational [Page 2]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
Availability
See http://www.ai3.net/products/dbs for details of precise OS
versions supported, and for download of the source code. Current
implementation supports BSDI BSD/OS, Linux, mkLinux, SunOS, IRIX,
Ultrix, NEWS OS, HP-UX. Other environments are likely easy to add.
Required Environment
C language compiler, UNIX-style socket API support.
2.2. Dummynet
Author
Luigi Rizzo
Category
Functional Correctness / Performance
Description
Dummynet is a tool which simulates the presence of finite size
queues, bandwidth limitations, and communication delays. Dummynet
inserts between two layers of the protocol stack (in the current
implementation between TCP and IP), simulating the above effects in
an operational system. This way experiments can be done using real
protocol implementations and real applications, even running on the
same host (dummynet also intercepts communications on the loopback
interface). Reconfiguration of dummynet parameters (delay, queue
size, bandwidth) can be done on the fly by using a sysctl call. The
overhead of dummynet is extremely low.
Automation
Requires merging diff files with kernel source code. Command-line
driven through the sysctl command to modify kernel variables.
Availability
See http://www.iet.unipi.it/~luigi/research.html or e-mail Luigi
Rizzo (l.rizzo@iet.unipi.it). Source code is available for FreeBSD
2.1 and FreeBSD 2.2 (easily adaptable to other BSD-derived systems).
Required Environment
C language compiler, BSD-derived system, kernel source code.
References
[Riz97]
Parker & Schmechel Informational [Page 3]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
2.3. Netperf
Author
Rick Jones
Category
Performance
Description
Single connection bandwidth or latency tests for TCP, UDP, and DLPI.
Includes provisions for CPU utilization measurement.
Automation
Requires compilation (K&R C sufficient for all but -DHISTOGRAM, may
require ANSI C in the future) if starting from source. Execution as
child of inetd requires editing of /etc/services and /etc/inetd.conf.
Scripts are provided for a quick look (snapshot_script), bulk
throughput of TCP and UDP, and latency for TCP and UDP. It is
command-line driven.
Availability
See http://www.cup.hp.com/netperf/NetperfPage.html or e-mail Rick
Jones (raj@cup.hp.com). Binaries are available here for HP/UX, Irix,
Solaris, and Win32.
Required Environment
C language compiler, POSIX.1, sockets.
2.4. NIST Net
Author
Mark Carson
Category
Functional Correctness / Performance
Description
NIST Net is a network emulator. The tool is packaged as a Linux
kernel patch, a kernel module, a set of programming APIs, and
command-line and X-based user interfaces.
NIST Net works by turning the system into a "selectively bad" router
- incoming packets may be delayed, dropped, duplicated, bandwidth-
constrained, etc. Packet delays may be fixed or randomly
distributed, with loadable probability distributions. Packet loss
may be uniformly distributed (constant loss probability) or
congestion-dependent (probability of loss increases with packet queue
lengths). Explicit congestion notifications may optionally be sent
Parker & Schmechel Informational [Page 4]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
in place of congestion-dependent loss.
Automation
To control the operation of the emulator, there is an interactive
user interface, a non-interactive command-line interface, and a set
of APIs. Any or all of these may be used in concert. The
interactive interface is suitable for simple, spur-of-the-moment
testing, while the command-line or APIs may be used to create
scripted, non-interactive tests.
Availability
NIST Net is available for public download from the NIST Net web site,
http://www.antd.nist.gov/itg/nistnet/. The web site also has
installation instructions and documentation.
Required Environment
NIST Net requires a Linux installation, with kernel version 2.0.27 -
2.0.33. A kernel source tree and build tools are required to build
and install the NIST Net components. Building the X interface
requires a version of XFree86 (Current Version is 3.3.2). An
Athena-replacement widget set such as neXtaw
(http://www.inf.ufrgs.br/~kojima/nextaw/) is also desirable for an
improved user interface.
NIST Net should run on any i386-compatible machine capable of running
Linux, with one or more interfaces.
2.5. Orchestra
Author
Scott Dawson, Farnam Jahanian, and Todd Mitton
Category
Functional Correctness / Performance
Description
This tool is a library which provides the user with an ability to
build a protocol layer capable of performing fault injection on
protocols. Several fault injection layers have been built using this
library, one of which has been used to test different vendor
implementations of TCP. This is accomplished by probing the vendor
implementation from one machine containing a protocol stack that has
been instrumented with Orchestra. A connection is opened from the
vendor TCP implementation to the machine which has been instrumented.
Faults may then be injected at the Orchestra side of the connection
and the vendor TCP's response may be monitored. The most recent
version of Orchestra runs inside the X-kernel protocol stack on the
OSF MK operating system.
Parker & Schmechel Informational [Page 5]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
When using Orchestra to test a protocol, the fault injection layer is
placed below the target protocol in the protocol stack. This can
either be done on one machine on the network, if protocol stacks on
the other machines cannot be modified (as in the case of testing
TCP), or can be done on all machines on the network (as in the case
of testing a protocol under development). Once the fault injection
layer is in the protocol stack, all messages sent by and destined for
the target protocol pass through it on their way to/from the network.
The Orchestra fault injection layer can manipulate these messages.
In particular, it can drop, delay, re-order, duplicate, or modify
messages. It can also introduce new messages into the system if
desired.
The actions of the Orchestra fault injection layer on each message
are determined by a script, written in Tcl. This script is
interpreted by the fault injection layer when the message enters the
layer. The script has access to the header information about the
message, and can make decisions based on header values. It can also
keep information about previous messages, counters, or any other data
which the script writer deems useful. Users of Orchestra may also
define their own actions to be taken on messages, written in C, that
may be called from the fault injection scripts.
Automation
Scripts can be specified either using a graphical user interface
which generates Tcl, or by writing Tcl directly. At this time,
post-analysis of the results of the test must also be performed by
the user. Essentially this consists of looking at a packet trace
that Orchestra generates for (in)correct behavior. Must compile and
link fault generated layer with the protocol stack.
Availability
See http://www.eecs.umich.edu/RTCL/projects/orchestra/ or e-mail
Scott Dawson (sdawson@eecs.umich.edu).
Required Environment OSF MK operating system, or X-kernel like network
architecture, or adapted to network stack.
References
[DJ94], [DJM96a], [DJM96b]
Parker & Schmechel Informational [Page 6]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
2.6. Packet Shell
Author
Steve Parker and Chris Schmechel
Category
Functional Correctness / Performance
Description
An extensible Tcl/Tk based software toolset for protocol development
and testing. Tcl (Tool Command Language) is an embeddable scripting
language and Tk is a graphical user interface toolkit based on Tcl.
The Packet Shell creates Tcl commands that allow you to create,
modify, send, and receive packets on networks. The operations for
each protocol are supplied by a dynamic linked library called a
protocol library. These libraries are silently linked in from a
special directory when the Packet Shell begins execution. The current
protocol libraries are: IP, IPv6, IPv6 extensions, ICMP, ICMPv6,
Ethernet layer, data layer, file layer (snoop and tcpdump support),
socket layer, TCP, TLI.
It includes harness, which is a Tk based graphical user interface for
creating test scripts within the Packet Shell. It includes tests for
no initial slow start, and retain out of sequence data as TCP test
cases mentioned in [PADHV98].
It includes tcpgraph, which is used with a snoop or tcpdump capture
file to produce a TCP time-sequence plot using xplot.
Automation
Command-line driven through Tcl commands, or graphical user interface
models are available through the harness format.
Availability
See http://playground.sun.com/psh/ or e-mail owner-packet-
shell@sunroof.eng.sun.com.
Required Environment
Solaris 2.4 or higher. Porting required for other operating systems.
Parker & Schmechel Informational [Page 7]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
2.7. Tcpanaly
Author
Vern Paxson
Category
Functional Correctness / Performance
Description
This is a tool for automatically analyzing a TCP implementation's
behavior by inspecting packet traces of the TCP's activity. It does
so through packet filter traces produced by tcpdump. It has coded
within it knowledge of a large number of TCP implementations. Using
this, it can determine whether a given trace appears consistent with
a given implementation, and, if so, exactly why the TCP chose to
transmit each packet at the time it did. If a trace is found
inconsistent with a TCP, tcpanaly either diagnoses a likely
measurement error present in the trace, or indicates exactly whether
the activity in the trace deviates from that of the TCP, which can
greatly aid in determining how the traced implementation behaves.
Tcpanaly's category is somewhat difficult to classify, since it
attempts to profile the behavior of an implementation, rather than to
explicitly test specific correctness or performance issues. However,
this profile identifies correctness and performance problems.
Adding new implementations of TCP behavior is possible with tcpanaly
through the use of C++ classes.
Automation
Command-line driven and only the traces of the TCP sending and
receiving bulk data transfers are needed as input.
Availability
Contact Vern Paxson (vern@ee.lbl.gov).
Required Environment
C++ compiler.
References
[Pax97a]
Parker & Schmechel Informational [Page 8]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
2.8. Tcptrace
Author
Shawn Ostermann
Category
Functional Correctness / Performance
Description
This is a TCP trace file analysis tool. It reads output trace files
in the formats of: tcpdump, snoop, etherpeek, and netm.
For each connection, it keeps track of elapsed time, bytes/segments
sent and received, retransmissions, round trip times, window
advertisements, throughput, etc., from simple to very detailed output.
It can also produce three different types of graphs:
Time Sequence Graph (shows the segments sent and ACKs returned as a
function of time)
Instantaneous Throughput (shows the instantaneous, averaged over a
few segments, throughput of the connection as a function of time).
Round Trip Times (shows the round trip times for the ACKs as a
function of time)
Automation
Command-line driven, and uses the xplot program to view the graphs.
Availability
Source code is available, and Solaris binary along with sample
traces. See http://jarok.cs.ohiou.edu/software/tcptrace/tcptrace.html
or e-mail Shawn Ostermann (ostermann@cs.ohiou.edu).
Required Environment
C compiler, Solaris, FreeBSD, NetBSD, HPUX, Linux.
Parker & Schmechel Informational [Page 9]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
2.9. Tracelook
Author
Greg Minshall
Category
Functional Correctness / Performance
Description
This is a Tcl/Tk program for graphically viewing the contents of
tcpdump trace files. When plotting a connection, a user can select
various variables to be plotted. In each direction of the connection,
the user can plot the advertised window in each packet, the highest
sequence number in each packet, the lowest sequence number in each
packet, and the acknowledgement number in each packet.
Automation
Command-line driven with a graphical user interface for the graph.
Availability
See http://www.ipsilon.com/~minshall/sw/tracelook/tracelook.html or
e-mail Greg Minshall (minshall@ipsilon.com).
Required Environment
A modern version of awk, and Tcl/Tk (Tk version 3.6 or higher). The
program xgraph is required to view the graphs under X11.
2.10. TReno
Author
Matt Mathis and Jamshid Mahdavi
Category
Performance
Description
This is a TCP throughput measurement tool based on sending UDP or
ICMP packets in patterns that are controlled at the user-level so
that their timing reflects what would be sent by a TCP that observes
proper congestion control (and implements SACK). This allows it to
measure throughput independent of the TCP implementation of end hosts
and serve as a useful platform for prototyping TCP changes.
Automation
Command-line driven. No "server" is required, and it only requires a
single argument of the machine to run the test to.
Parker & Schmechel Informational [Page 10]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
Availability
See http://www.psc.edu/networking/treno_info.html or e-mail Matt
Mathis (mathis@psc.edu) or Jamshid Mahdavi (mahdavi@psc.edu).
Required Environment
C compiler, POSIX.1, raw sockets.
2.11. Ttcp
Author
Unknown
Category
Performance
Description
Originally written to move files around, ttcp became the classic
throughput benchmark or load generator, with the addition of support
for sourcing to/from memory. It can also be used as a traffic
absorber. It has spawned many variants, recent ones include support
for UDP, data pattern generation, page alignment, and even alignment
offset control.
Automation
Command-line driven.
Availability
See ftp://ftp.arl.mil/pub/ttcp/ or e-mail ARL (ftp@arl.mil) which
includes the most common variants available.
Required Environment
C compiler, BSD sockets.
2.12. Xplot
Author
Tim Shepard
Category
Functional Correctness / Performance
Description
This is a fairly conventional graphing/plotting tool (xplot itself),
a script to turn tcpdump output into xplot input, and some sample
code to generate xplot commands to plot the TCP time-sequence graph.
Automation
Command-line driven with a graphical user interface for the plot.
Parker & Schmechel Informational [Page 11]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
Availability
See ftp://mercury.lcs.mit.edu/pub/shep/xplot.tar.gz or e-mail Tim
Shepard (shep@lcs.mit.edu).
Required Environment
C compiler, X11.
References
[She91]
3. Summary
This memo lists all TCP tests and testing tools reported to the
authors as part of TCP Implementer's working group and is not
exhaustive. These tools have been verified as available by the
authors.
4. Security Considerations
Network analysis tools are improving at a steady pace. The
continuing improvement in these tools such as the ones described make
security concerns significant.
Some of the tools could be used to create rogue packets or denial-
of-service attacks against other hosts. Also, some of the tools
require changes to the kernel (foreign code) and might require root
privileges to execute. So you are trusting code that you have
fetched from some perhaps untrustworthy remote site. This code could
contain malicious code that could present any kind of attack.
None of the listed tools evaluate security in any way or form.
There are privacy concerns when grabbing packets from the network in
that you are now able to read other people's mail, files, etc. This
impacts more than just the host running the tool but all traffic
crossing the host's physical network.
5. References
[DJ94] Scott Dawson and Farnam Jahanian, "Probing and Fault
Injection of Distributed Protocol Implementations",
University of Michigan Technical Report CSE-TR-217-94, EECS
Department.
[DJM96a] Scott Dawson, Farnam Jahanian, and Todd Mitton, "ORCHESTRA:
A Fault Injection Environment for Distributed Systems",
University of Michigan Technical Report CSE-TR-318-96, EECS
Department.
Parker & Schmechel Informational [Page 12]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
[DJM96b] Scott Dawson, Farnam Jahanian, and Todd Mitton,
"Experiments on Six Commercial TCP Implementations Using a
Software Fault Injection Tool", University of Michigan
Technical Report CSE-TR-298-96, EECS Department.
[Pax97a] Vern Paxson, "Automated Packet Trace Analysis of TCP
Implementations", ACM SIGCOMM '97, September 1997, Cannes,
France.
[PADHV98] Paxson, V., Allman, M., Dawson, S., Heavens, I., and B.
Volz, "Known TCP Implementation Problems", Work In
Progress.
[Riz97] Luigi Rizzo, "Dummynet: a simple approach to the evaluation
of network protocols", ACM Computer Communication Review,
Vol. 27, N. 1, January 1997, pp. 31-41.
[She91] Tim Shepard, "TCP Packet Trace Analysis", MIT Laboratory
for Computer Science MIT-LCS-TR-494, February, 1991.
Parker & Schmechel Informational [Page 13]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
6. Authors' Addresses
Steve Parker
Sun Microsystems, Inc.
901 San Antonio Road, UMPK17-202
Palo Alto, CA 94043
USA
Phone: (650) 786-5176
EMail: sparker@eng.sun.com
Chris Schmechel
Sun Microsystems, Inc.
901 San Antonio Road, UMPK17-202
Palo Alto, CA, 94043
USA
Phone: (650) 786-4053
EMail: cschmec@eng.sun.com
Parker & Schmechel Informational [Page 14]
RFC 2398 Some Testing Tools for TCP Implementors August 1998
7. Full Copyright Statement
Copyright (C) The Internet Society (1998). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Parker & Schmechel Informational [Page 15]

View File

@ -0,0 +1,619 @@
Network Working Group K. Poduri
Request for Comments: 2415 K. Nichols
Category: Informational Bay Networks
September 1998
Simulation Studies of Increased Initial TCP Window Size
Status of this Memo
This memo provides information for the Internet community. It does
not specify an Internet standard of any kind. Distribution of this
memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1998). All Rights Reserved.
Abstract
An increase in the permissible initial window size of a TCP
connection, from one segment to three or four segments, has been
under discussion in the tcp-impl working group. This document covers
some simulation studies of the effects of increasing the initial
window size of TCP. Both long-lived TCP connections (file transfers)
and short-lived web-browsing style connections were modeled. The
simulations were performed using the publicly available ns-2
simulator and our custom models and files are also available.
1. Introduction
We present results from a set of simulations with increased TCP
initial window (IW). The main objectives were to explore the
conditions under which the larger IW was a "win" and to determine the
effects, if any, the larger IW might have on other traffic flows
using an IW of one segment.
This study was inspired by discussions at the Munich IETF tcp-impl
and tcp-sat meetings. A proposal to increase the IW size to about 4K
bytes (4380 bytes in the case of 1460 byte segments) was discussed.
Concerns about both the utility of the increase and its effect on
other traffic were raised. Some studies were presented showing the
positive effects of increased IW on individual connections, but no
studies were shown with a wide variety of simultaneous traffic flows.
It appeared that some of the questions being raised could be
addressed in an ns-2 simulation. Early results from our simulations
were previously posted to the tcp-impl mailing list and presented at
the tcp-impl WG meeting at the December 1997 IETF.
Poduri & Nichols Informational [Page 1]
RFC 2415 TCP Window Size September 1998
2. Model and Assumptions
We simulated a network topology with a bottleneck link as shown:
10Mb, 10Mb,
(all 4 links) (all 4 links)
C n2_________ ______ n6 S
l n3_________\ /______ n7 e
i \\ 1.5Mb, 50ms // r
e n0 ------------------------ n1 v
n n4__________// \ \_____ n8 e
t n5__________/ \______ n9 r
s s
URLs --> <--- FTP & Web data
File downloading and web-browsing clients are attached to the nodes
(n2-n5) on the left-hand side. These clients are served by the FTP
and Web servers attached to the nodes (n6-n9) on the right-hand side.
The links to and from those nodes are at 10 Mbps. The bottleneck link
is between n1 and n0. All links are bi-directional, but only ACKs,
SYNs, FINs, and URLs are flowing from left to right. Some simulations
were also performed with data traffic flowing from right to left
simultaneously, but it had no effect on the results.
In the simulations we assumed that all ftps transferred 1-MB files
and that all web pages had exactly three embedded URLs. The web
clients are browsing quite aggressively, requesting a new page after
a random delay uniformly distributed between 1 and 5 seconds. This is
not meant to realistically model a single user's web-browsing
pattern, but to create a reasonably heavy traffic load whose
individual tcp connections accurately reflect real web traffic. Some
discussion of these models as used in earlier studies is available in
references [3] and [4].
The maximum tcp window was set to 11 packets, maximum packet (or
segment) size to 1460 bytes, and buffer sizes were set at 25 packets.
(The ns-2 TCPs require setting window sizes and buffer sizes in
number of packets. In our tcp-full code some of the internal
parameters have been set to be byte-oriented, but external values
must still be set in number of packets.) In our simulations, we
varied the number of data segments sent into a new TCP connection (or
initial window) from one to four, keeping all segments at 1460 bytes.
A dropped packet causes a restart window of one segment to be used,
just as in current practice.
Poduri & Nichols Informational [Page 2]
RFC 2415 TCP Window Size September 1998
For ns-2 users: The tcp-full code was modified to use an
"application" class and three application client-server pairs were
written: a simple file transfer (ftp), a model of http1.0 style web
connection and a very rough model of http1.1 style web connection.
The required files and scripts for these simulations are available
under the contributed code section on the ns-simulator web page at
the sites ftp://ftp.ee.lbl.gov/IW.{tar, tar.Z} or http://www-
nrg.ee.lbl.gov/floyd/tcp_init_win.html.
Simulations were run with 8, 16, 32 web clients and a number of ftp
clients ranging from 0 to 3. The IW was varied from 1 to 4, though
the 4-packet case lies beyond what is currently recommended. The
figures of merit used were goodput, the median page delay seen by the
web clients and the median file transfer delay seen by the ftp
clients. The simulated run time was rather large, 360 seconds, to
ensure an adequate sample. (Median values remained the same for
simulations with larger run times and can be considered stable)
3. Results
In our simulations, we varied the number of file transfer clients in
order to change the congestion of the link. Recall that our ftp
clients continuously request 1 Mbyte transfers, so the link
utilization is over 90% when even a single ftp client is present.
When three file transfer clients are running simultaneously, the
resultant congestion is somewhat pathological, making the values
recorded stable. Though all connections use the same initial window,
the effect of increasing the IW on a 1 Mbyte file transfer is not
detectable, thus we focus on the web browsing connections. (In the
tables, we use "webs" to indicate number of web clients and "ftps" to
indicate the number of file transfer clients attached.) Table 1 shows
the median delays experienced by the web transfers with an increase
in the TCP IW. There is clearly an improvement in transfer delays
for the web connections with increase in the IW, in many cases on the
order of 30%. The steepness of the performance improvement going
from an IW of 1 to an IW of 2 is mainly due to the distribution of
files fetched by each URL (see references [1] and [2]); the median
size of both primary and in-line URLs fits completely into two
packets. If file distributions change, the shape of this curve may
also change.
Poduri & Nichols Informational [Page 3]
RFC 2415 TCP Window Size September 1998
Table 1. Median web page delay
#Webs #FTPs IW=1 IW=2 IW=3 IW=4
(s) (% decrease)
----------------------------------------------
8 0 0.56 14.3 17.9 16.1
8 1 1.06 18.9 25.5 32.1
8 2 1.18 16.1 17.1 28.9
8 3 1.26 11.9 19.0 27.0
16 0 0.64 11.0 15.6 18.8
16 1 1.04 17.3 24.0 35.6
16 2 1.22 17.2 20.5 25.4
16 3 1.31 10.7 21.4 22.1
32 0 0.92 17.6 28.6 21.0
32 1 1.19 19.6 25.0 26.1
32 2 1.43 23.8 35.0 33.6
32 3 1.56 19.2 29.5 33.3
Table 2 shows the bottleneck link utilization and packet drop
percentage of the same experiment. Packet drop rates did increase
with IW, but in all cases except that of the single most pathological
overload, the increase in drop percentage was less than 1%. A
decrease in packet drop percentage is observed in some overloaded
situations, specifically when ftp transfers consumed most of the link
bandwidth and a large number of web transfers shared the remaining
bandwidth of the link. In this case, the web transfers experience
severe packet loss and some of the IW=4 web clients suffer multiple
packet losses from the same window, resulting in longer recovery
times than when there is a single packet loss in a window. During the
recovery time, the connections are inactive which alleviates
congestion and thus results in a decrease in the packet drop
percentage. It should be noted that such observations were made only
in extremely overloaded scenarios.
Poduri & Nichols Informational [Page 4]
RFC 2415 TCP Window Size September 1998
Table 2. Link utilization and packet drop rates
Percentage Link Utilization | Packet drop rate
#Webs #FTPs IW=1 IW=2 IW=3 IW=4 |IW=1 IW=2 IW=3 IW=4
-----------------------------------------------------------------------
8 0 34 37 38 39 | 0.0 0.0 0.0 0.0
8 1 95 92 93 92 | 0.6 1.2 1.4 1.3
8 2 98 97 97 96 | 1.8 2.3 2.3 2.7
8 3 98 98 98 98 | 2.6 3.0 3.5 3.5
-----------------------------------------------------------------------
16 0 67 69 69 67 | 0.1 0.5 0.8 1.0
16 1 96 95 93 92 | 2.1 2.6 2.9 2.9
16 2 98 98 97 96 | 3.5 3.6 4.2 4.5
16 3 99 99 98 98 | 4.5 4.7 5.2 4.9
-----------------------------------------------------------------------
32 0 92 87 85 84 | 0.1 0.5 0.8 1.0
32 1 98 97 96 96 | 2.1 2.6 2.9 2.9
32 2 99 99 98 98 | 3.5 3.6 4.2 4.5
32 3 100 99 99 98 | 9.3 8.4 7.7 7.6
To get a more complete picture of performance, we computed the
network power, goodput divided by median delay (in Mbytes/ms), and
plotted it against IW for all scenarios. (Each scenario is uniquely
identified by its number of webs and number of file transfers.) We
plot these values in Figure 1 (in the pdf version), illustrating a
general advantage to increasing IW. When a large number of web
clients is combined with ftps, particularly multiple ftps,
pathological cases result from the extreme congestion. In these
cases, there appears to be no particular trend to the results of
increasing the IW, in fact simulation results are not particularly
stable.
To get a clearer picture of what is happening across all the tested
scenarios, we normalized the network power values for the non-
pathological scenario by the network power for that scenario at IW of
one. These results are plotted in Figure 2. As IW is increased from
one to four, network power increased by at least 15%, even in a
congested scenario dominated by bulk transfer traffic. In simulations
where web traffic has a dominant share of the available bandwidth,
the increase in network power was up to 60%.
The increase in network power at higher initial window sizes is due
to an increase in throughput and a decrease in the delay. Since the
(slightly) increased drop rates were accompanied by better
performance, drop rate is clearly not an indicator of user level
performance.
Poduri & Nichols Informational [Page 5]
RFC 2415 TCP Window Size September 1998
The gains in performance seen by the web clients need to be balanced
against the performance the file transfers are seeing. We computed
ftp network power and show this in Table 3. It appears that the
improvement in network power seen by the web connections has
negligible effect on the concurrent file transfers. It can be
observed from the table that there is a small variation in the
network power of file transfers with an increase in the size of IW
but no particular trend can be seen. It can be concluded that the
network power of file transfers essentially remained the same.
However, it should be noted that a larger IW does allow web transfers
to gain slightly more bandwidth than with a smaller IW. This could
mean fewer bytes transferred for FTP applications or a slight
decrease in network power as computed by us.
Table 3. Network power of file transfers with an increase in the TCP
IW size
#Webs #FTPs IW=1 IW=2 IW=3 IW=4
--------------------------------------------
8 1 4.7 4.2 4.2 4.2
8 2 3.0 2.8 3.0 2.8
8 3 2.2 2.2 2.2 2.2
16 1 2.3 2.4 2.4 2.5
16 2 1.8 2.0 1.8 1.9
16 3 1.4 1.6 1.5 1.7
32 1 0.7 0.9 1.3 0.9
32 2 0.8 1.0 1.3 1.1
32 3 0.7 1.0 1.2 1.0
The above simulations all used http1.0 style web connections, thus, a
natural question is to ask how results are affected by migration to
http1.1. A rough model of this behavior was simulated by using one
connection to send all of the information from both the primary URL
and the three embedded, or in-line, URLs. Since the transfer size is
now made up of four web files, the steep improvement in performance
between an IW of 1 and an IW of two, noted in the previous results,
has been smoothed. Results are shown in Tables 4 & 5 and Figs. 3 & 4.
Occasionally an increase in IW from 3 to 4 decreases the network
power owing to a non-increase or a slight decrease in the throughput.
TCP connections opening up with a higher window size into a very
congested network might experience some packet drops and consequently
a slight decrease in the throughput. This indicates that increase of
the initial window sizes to further higher values (>4) may not always
result in a favorable network performance. This can be seen clearly
in Figure 4 where the network power shows a decrease for the two
highly congested cases.
Poduri & Nichols Informational [Page 6]
RFC 2415 TCP Window Size September 1998
Table 4. Median web page delay for http1.1
#Webs #FTPs IW=1 IW=2 IW=3 IW=4
(s) (% decrease)
----------------------------------------------
8 0 0.47 14.9 19.1 21.3
8 1 0.84 17.9 19.0 25.0
8 2 0.99 11.5 17.3 23.0
8 3 1.04 12.1 20.2 28.3
16 0 0.54 07.4 14.8 20.4
16 1 0.89 14.6 21.3 27.0
16 2 1.02 14.7 19.6 25.5
16 3 1.11 09.0 17.0 18.9
32 0 0.94 16.0 29.8 36.2
32 1 1.23 12.2 28.5 21.1
32 2 1.39 06.5 13.7 12.2
32 3 1.46 04.0 11.0 15.0
Table 5. Network power of file transfers with an increase in the
TCP IW size
#Webs #FTPs IW=1 IW=2 IW=3 IW=4
--------------------------------------------
8 1 4.2 4.2 4.2 3.7
8 2 2.7 2.5 2.6 2.3
8 3 2.1 1.9 2.0 2.0
16 1 1.8 1.8 1.5 1.4
16 2 1.5 1.2 1.1 1.5
16 3 1.0 1.0 1.0 1.0
32 1 0.3 0.3 0.5 0.3
32 2 0.4 0.3 0.4 0.4
32 3 0.4 0.3 0.4 0.5
For further insight, we returned to the http1.0 model and mixed some
web-browsing connections with IWs of one with those using IWs of
three. In this experiment, we first simulated a total of 16 web-
browsing connections, all using IW of one. Then the clients were
split into two groups of 8 each, one of which uses IW=1 and the other
used IW=3.
We repeated the simulations for a total of 32 and 64 web-browsing
clients, splitting those into groups of 16 and 32 respectively. Table
6 shows these results. We report the goodput (in Mbytes), the web
page delays (in milli seconds), the percent utilization of the link
and the percent of packets dropped.
Poduri & Nichols Informational [Page 7]
RFC 2415 TCP Window Size September 1998
Table 6. Results for half-and-half scenario
Median Page Delays and Goodput (MB) | Link Utilization (%) & Drops (%)
#Webs IW=1 | IW=3 | IW=1 | IW=3
G.put dly | G.put dly | L.util Drops| L.util Drops
------------------|-------------------|---------------|---------------
16 35.5 0.64| 36.4 0.54 | 67 0.1 | 69 0.7
8/8 16.9 0.67| 18.9 0.52 | 68 0.5 |
------------------|-------------------|---------------|---------------
32 48.9 0.91| 44.7 0.68 | 92 3.5 | 85 4.3
16/16 22.8 0.94| 22.9 0.71 | 89 4.6 |
------------------|-------------------|---------------|----------------
64 51.9 1.50| 47.6 0.86 | 98 13.0 | 91 8.6
32/32 29.0 1.40| 22.0 1.20 | 98 12.0 |
Unsurprisingly, the non-split experiments are consistent with our
earlier results, clients with IW=3 outperform clients with IW=1. The
results of running the 8/8 and 16/16 splits show that running a
mixture of IW=3 and IW=1 has no negative effect on the IW=1
conversations, while IW=3 conversations maintain their performance.
However, the 32/32 split shows that web-browsing connections with
IW=3 are adversely affected. We believe this is due to the
pathological dynamics of this extremely congested scenario. Since
embedded URLs open their connections simultaneously, a very large
number of TCP connections are arriving at the bottleneck link
resulting in multiple packet losses for the IW=3 conversations. The
myriad problems of this simultaneous opening strategy are, of course,
part of the motivation for the development of http1.1.
4. Discussion
The indications from these results are that increasing the initial
window size to 3 packets (or 4380 bytes) helps to improve perceived
performance. Many further variations on these simulation scenarios
are possible and we've made our simulation models and scripts
available in order to facilitate others' experiments.
We also used the RED queue management included with ns-2 to perform
some other simulation studies. We have not reported on those results
here since we don't consider the studies complete. We found that by
adding RED to the bottleneck link, we achieved similar performance
gains (with an IW of 1) to those we found with increased IWs without
RED. Others may wish to investigate this further.
Although the simulation sets were run for a T1 link, several
scenarios with varying levels of congestion and varying number of web
and ftp clients were analyzed. It is reasonable to expect that the
results would scale for links with higher bandwidth. However,
Poduri & Nichols Informational [Page 8]
RFC 2415 TCP Window Size September 1998
interested readers could investigate this aspect further.
We also used the RED queue management included with ns-2 to perform
some other simulation studies. We have not reported on those results
here since we don't consider the studies complete. We found that by
adding RED to the bottleneck link, we achieved similar performance
gains (with an IW of 1) to those we found with increased IWs without
RED. Others may wish to investigate this further.
5. References
[1] B. Mah, "An Empirical Model of HTTP Network Traffic", Proceedings
of INFOCOM '97, Kobe, Japan, April 7-11, 1997.
[2] C.R. Cunha, A. Bestavros, M.E. Crovella, "Characteristics of WWW
Client-based Traces", Boston University Computer Science
Technical Report BU-CS-95-010, July 18, 1995.
[3] K.M. Nichols and M. Laubach, "Tiers of Service for Data Access in
a HFC Architecture", Proceedings of SCTE Convergence Conference,
January, 1997.
[4] K.M. Nichols, "Improving Network Simulation with Feedback",
available from knichols@baynetworks.com
6. Acknowledgements
This work benefited from discussions with and comments from Van
Jacobson.
7. Security Considerations
This document discusses a simulation study of the effects of a
proposed change to TCP. Consequently, there are no security
considerations directly related to the document. There are also no
known security considerations associated with the proposed change.
Poduri & Nichols Informational [Page 9]
RFC 2415 TCP Window Size September 1998
8. Authors' Addresses
Kedarnath Poduri
Bay Networks
4401 Great America Parkway
SC01-04
Santa Clara, CA 95052-8185
Phone: +1-408-495-2463
Fax: +1-408-495-1299
EMail: kpoduri@Baynetworks.com
Kathleen Nichols
Bay Networks
4401 Great America Parkway
SC01-04
Santa Clara, CA 95052-8185
EMail: knichols@baynetworks.com
Poduri & Nichols Informational [Page 10]
RFC 2415 TCP Window Size September 1998
Full Copyright Statement
Copyright (C) The Internet Society (1998). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Poduri & Nichols Informational [Page 11]

View File

@ -0,0 +1,395 @@
Network Working Group T. Shepard
Request for Comments: 2416 C. Partridge
Category: Informational BBN Technologies
September 1998
When TCP Starts Up With Four Packets Into Only Three Buffers
Status of this Memo
This memo provides information for the Internet community. It does
not specify an Internet standard of any kind. Distribution of this
memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1998). All Rights Reserved.
Abstract
This memo is to document a simple experiment. The experiment showed
that in the case of a TCP receiver behind a 9600 bps modem link at
the edge of a fast Internet where there are only 3 buffers before the
modem (and the fourth packet of a four-packet start will surely be
dropped), no significant degradation in performance is experienced by
a TCP sending with a four-packet start when compared with a normal
slow start (which starts with just one packet).
Background
Sally Floyd has proposed that TCPs start their initial slow start by
sending as many as four packets (instead of the usual one packet) as
a means of getting TCP up-to-speed faster. (Slow starts instigated
due to timeouts would still start with just one packet.) Starting
with more than one packet might reduce the start-up latency over
long-fat pipes by two round-trip times. This proposal is documented
further in [1], [2], and in [3] and we assume the reader is familiar
with the details of this proposal.
On the end2end-interest mailing list, concern was raised that in the
(allegedly common) case where a slow modem is served by a router
which only allocates three buffers per modem (one buffer being
transmitted while two packets are waiting), that starting with four
packets would not be good because the fourth packet is sure to be
dropped.
Shepard & Partridge Informational [Page 1]
RFC 2416 TCP with Four Packets into Three Buffers September 1998
Vern Paxson replied with the comment (among other things) that the
four-packet start is no worse than what happens after two round trip
times in normal slow start, hence no new problem is introduced by
starting with as many as four packets. If there is a problem with a
four-packet start, then the problem already exists in a normal slow-
start startup after two round trip times when the slow-start
algorithm will release into the net four closely spaced packets.
The experiment reported here confirmed Vern Paxson's reasoning.
Scenario and experimental setup
+--------+ 100 Mbps +---+ 1.5 Mbps +---+ 9600 bps +----------+
| source +------------+ R +-------------+ R +--------------+ receiver |
+--------+ no delay +---+ 25 ms delay +---+ 150 ms delay +----------+
| |
| |
(we spy here) (this router has only 3 buffers
to hold packets going into the
9600 bps link)
The scenario studied and simulated consists of three links between
the source and sink. The first link is a 100 Mbps link with no
delay. It connects the sender to a router. (It was included to have
a means of logging the returning ACKs at the time they would be seen
by the sender.) The second link is a 1.5 Mbps link with a 25 ms
one-way delay. (This link was included to roughly model traversing
an un-congested, intra-continental piece of the terrestrial
Internet.) The third link is a 9600 bps link with a 150 ms one-way
delay. It connects the edge of the net to a receiver which is behind
the 9600 bps link.
The queue limits for the queues at each end of the first two links
were set to 100 (a value sufficiently large that this limit was never
a factor). The queue limits at each end of the 9600 bps link were
set to 3 packets (which can hold at most two packets while one is
being sent).
Version 1.2a2 of the NS simulator (available from LBL) was used
to simulate both one-packet and four-packet starts for each of the
available TCP algorithms (tahoe, reno, sack, fack) and the conclusion
reported here is independent of which TCP algorithm is used (in
general, we believe). In this memo, the "tahoe" module will be used
to illustrate what happens. In the 4-packet start cases, the
"window-init" variable was set to 4, and the TCP implementations were
modified to use the value of the window-init variable only on
Shepard & Partridge Informational [Page 2]
RFC 2416 TCP with Four Packets into Three Buffers September 1998
connection start, but to set cwnd to 1 on other instances of a slow-
start. (The tcp.cc module as shipped with ns-1.2a2 would use the
window-init value in all cases.)
The packets in simulation are 1024 bytes long for purposes of
determining the time it takes to transmit them through the links.
(The TCP modules included with the LBL NS simulator do not simulate
the TCP sequence number mechanisms. They use just packet numbers.)
Observations are made of all packets and acknowledgements crossing
the 100 Mbps no-delay link, near the sender. (All descriptions below
are from this point of view.)
What happens with normal slow start
At time 0.0 packet number 1 is sent.
At time 1.222 an ack is received covering packet number 1, and
packets 2 and 3 are sent.
At time 2.444 an ack is received covering packet number 2, and
packets 4 and 5 are sent.
At time 3.278 an ack is received covering packet number 3, and
packets 6 and 7 are sent.
At time 4.111 an ack is received covering packet number 4, and
packets 8 and 9 are sent.
At time 4.944 an ack is received covering packet number 5, and
packets 10 and 11 are sent.
At time 5.778 an ack is received covering packet number 6, and
packets 12 and 13 are sent.
At time 6.111 a duplicate ack is received (covering packet number 6).
At time 7.444 another duplicate ack is received (covering packet
number 6).
At time 8.278 a third duplicate ack is received (covering packet
number 6) and packet number 7 is retransmitted.
(And the trace continues...)
What happens with a four-packet start
At time 0.0, packets 1, 2, 3, and 4 are sent.
Shepard & Partridge Informational [Page 3]
RFC 2416 TCP with Four Packets into Three Buffers September 1998
At time 1.222 an ack is received covering packet number 1, and
packets 5 and 6 are sent.
At time 2.055 an ack is received covering packet number 2, and
packets 7 and 8 are sent.
At time 2.889 an ack is received covering packet number 3, and
packets 9 and 10 are sent.
At time 3.722 a duplicate ack is received (covering packet number 3).
At time 4.555 another duplicate ack is received (covering packet
number 3).
At time 5.389 a third duplicate ack is received (covering packet
number 3) and packet number 4 is retransmitted.
(And the trace continues...)
Discussion
At the point left off in the two traces above, the two different
systems are in almost identical states. The two traces from that
point on are almost the same, modulo a shift in time of (8.278 -
5.389) = 2.889 seconds and a shift of three packets. If the normal
TCP (with the one-packet start) will deliver packet N at time T, then
the TCP with the four-packet start will deliver packet N - 3 at time
T - 2.889 (seconds).
Note that the time to send three 1024-byte TCP segments through a
9600 bps modem is 2.66 seconds. So at what time does the four-
packet-start TCP deliver packet N? At time T - 2.889 + 2.66 = T -
0.229 in most cases, and in some cases earlier, in some cases later,
because different packets (by number) experience loss in the two
traces.
Thus the four-packet-start TCP is in some sense 0.229 seconds (or
about one fifth of a packet) ahead of where the one-packet-start TCP
would be. (This is due to the extra time the modem sits idle while
waiting for the dally timer to go off in the receiver in the case of
the one-packet-start TCP.)
The states of the two systems are not exactly identical. They differ
slightly in the round-trip-time estimators because the behavior at
the start is not identical. (The observed round trip times may differ
by a small amount due to dally timers and because the one-packet
start experiences more round trip times before the first loss.) In
the cases where a retransmit timer did later go off, the additional
Shepard & Partridge Informational [Page 4]
RFC 2416 TCP with Four Packets into Three Buffers September 1998
difference in timing was much smaller than the 0.229 second
difference described above.
Conclusion
In this particular case, the four-packet start is not harmful.
Non-conclusions, opinions, and future work
A four-packet start would be very helpful in situations where a
long-delay link is involved (as it would reduce transfer times for
moderately-sized transfers by as much as two round-trip times). But
it remains (in the authors' opinions at this time) an open question
whether or not the four-packet start would be safe for the network.
It would be nice to see if this result could be duplicated with real
TCPs, real modems, and real three-buffer limits.
Security Considerations
This document discusses a simulation study of the effects of a
proposed change to TCP. Consequently, there are no security
considerations directly related to the document. There are also no
known security considerations associated with the proposed change.
References
1. S. Floyd, Increasing TCP's Initial Window (January 29, 1997).
URL ftp://ftp.ee.lbl.gov/papers/draft.jan29.
2. S. Floyd and M. Allman, Increasing TCP's Initial Window (July,
1997). URL http://gigahertz.lerc.nasa.gov/~mallman/share/draft-
ss.txt
3. Allman, M., Floyd, S., and C. Partridge, "Increasing TCP's
Initial Window", RFC 2414, September 1998.
Shepard & Partridge Informational [Page 5]
RFC 2416 TCP with Four Packets into Three Buffers September 1998
Authors' Addresses
Tim Shepard
BBN Technologies
10 Moulton Street
Cambridge, MA 02138
EMail: shep@alum.mit.edu
Craig Partridge
BBN Technologies
10 Moulton Street
Cambridge, MA 02138
EMail: craig@bbn.com
Shepard & Partridge Informational [Page 6]
RFC 2416 TCP with Four Packets into Three Buffers September 1998
Full Copyright Statement
Copyright (C) The Internet Society (1998). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Shepard & Partridge Informational [Page 7]

View File

@ -0,0 +1,563 @@
Network Working Group M. Daniele
Request for Comments: 2452 Compaq Computer Corporation
Category: Standards Track December 1998
IP Version 6 Management Information Base
for the Transmission Control Protocol
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1998). All Rights Reserved.
Abstract
This document is one in the series of documents that define various
MIB objects for IPv6. Specifically, this document is the MIB module
which defines managed objects for implementations of the Transmission
Control Protocol (TCP) over IP Version 6 (IPv6).
This document also recommends a specific policy with respect to the
applicability of RFC 2012 for implementations of IPv6. Namely, that
most of the managed objects defined in RFC 2012 are independent of which
IP versions underlie TCP, and only the TCP connection information is
IP version-specific.
This memo defines an experimental portion of the Management
Information Base (MIB) for use with network management protocols in
IPv6-based internets.
1. Introduction
A management system contains: several (potentially many) nodes, each
with a processing entity, termed an agent, which has access to
management instrumentation; at least one management station; and, a
management protocol, used to convey management information between
the agents and management stations. Operations of the protocol are
carried out under an administrative framework which defines
authentication, authorization, access control, and privacy policies.
Daniele Standards Track [Page 1]
RFC 2452 TCP MIB for IPv6 December 1998
Management stations execute management applications which monitor and
control managed elements. Managed elements are devices such as
hosts, routers, terminal servers, etc., which are monitored and
controlled via access to their management information.
Management information is viewed as a collection of managed objects,
residing in a virtual information store, termed the Management
Information Base (MIB). Collections of related objects are defined
in MIB modules. These modules are written using a subset of OSI's
Abstract Syntax Notation One (ASN.1) [1], termed the Structure of
Management Information (SMI) [2].
2. Overview
This document is one in the series of documents that define various
MIB objects, and statements of conformance, for IPv6. This document
defines the required instrumentation for implementations of TCP over
IPv6.
3. Transparency of IP versions to TCP
The fact that a particular TCP connection uses IPv6 as opposed to
IPv4, is largely invisible to a TCP implementation. A "TCPng" did
not need to be defined, implementations simply need to support IPv6
addresses.
As such, the managed objects already defined in [TCP MIB] are
sufficient for managing TCP in the presence of IPv6. These objects
are equally applicable whether the managed node supports IPv4 only,
IPv6 only, or both IPv4 and IPv6.
For example, tcpActiveOpens counts "The number of times TCP
connections have made a direct transition to the SYN-SENT state from
the CLOSED state", regardless of which version of IP is used between
the connection endpoints.
Stated differently, TCP implementations don't need separate counters
for IPv4 and for IPv6.
4. Representing TCP Connections
The exception to the statements in section 3 is the tcpConnTable.
Since IPv6 addresses cannot be represented with the IpAddress syntax,
not all TCP connections can be represented in the tcpConnTable
defined in [TCP MIB].
Daniele Standards Track [Page 2]
RFC 2452 TCP MIB for IPv6 December 1998
This memo defines a new, separate table to represent only those TCP
connections between IPv6 endpoints. TCP connections between IPv4
endpoints continue to be represented in tcpConnTable [TCP MIB]. (It
is not possible to establish a TCP connection between an IPv4
endpoint and an IPv6 endpoint.)
A different approach would have been to define a new table to
represent all TCP connections regardless of IP version. This would
require changes to [TCP MIB] and hence to existing (IPv4-only) TCP
implementations. The approach suggested in this memo has the
advantage of leaving IPv4-only implementations intact.
It is assumed that the objects defined in this memo will eventually
be defined in an update to [TCP MIB]. For this reason, the module
identity is assigned under the experimental portion of the MIB.
5. Conformance
This memo contains conformance statements to define conformance to
this MIB for TCP over IPv6 implementations.
6. Definitions
IPV6-TCP-MIB DEFINITIONS ::= BEGIN
IMPORTS
MODULE-COMPLIANCE, OBJECT-GROUP FROM SNMPv2-CONF
MODULE-IDENTITY, OBJECT-TYPE,
mib-2, experimental FROM SNMPv2-SMI
Ipv6Address, Ipv6IfIndexOrZero FROM IPV6-TC;
ipv6TcpMIB MODULE-IDENTITY
LAST-UPDATED "9801290000Z"
ORGANIZATION "IETF IPv6 MIB Working Group"
CONTACT-INFO
" Mike Daniele
Postal: Compaq Computer Corporation
110 Spitbrook Rd
Nashua, NH 03062.
US
Phone: +1 603 884 1423
Email: daniele@zk3.dec.com"
DESCRIPTION
"The MIB module for entities implementing TCP over IPv6."
::= { experimental 86 }
Daniele Standards Track [Page 3]
RFC 2452 TCP MIB for IPv6 December 1998
-- objects specific to TCP for IPv6
tcp OBJECT IDENTIFIER ::= { mib-2 6 }
-- the TCP over IPv6 Connection table
-- This connection table contains information about this
-- entity's existing TCP connections between IPv6 endpoints.
-- Only connections between IPv6 addresses are contained in
-- this table. This entity's connections between IPv4
-- endpoints are contained in tcpConnTable.
ipv6TcpConnTable OBJECT-TYPE
SYNTAX SEQUENCE OF Ipv6TcpConnEntry
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"A table containing TCP connection-specific information,
for only those connections whose endpoints are IPv6 addresses."
::= { tcp 16 }
ipv6TcpConnEntry OBJECT-TYPE
SYNTAX Ipv6TcpConnEntry
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"A conceptual row of the ipv6TcpConnTable containing
information about a particular current TCP connection.
Each row of this table is transient, in that it ceases to
exist when (or soon after) the connection makes the transition
to the CLOSED state.
Note that conceptual rows in this table require an additional
index object compared to tcpConnTable, since IPv6 addresses
are not guaranteed to be unique on the managed node."
INDEX { ipv6TcpConnLocalAddress,
ipv6TcpConnLocalPort,
ipv6TcpConnRemAddress,
ipv6TcpConnRemPort,
ipv6TcpConnIfIndex }
::= { ipv6TcpConnTable 1 }
Ipv6TcpConnEntry ::=
SEQUENCE { ipv6TcpConnLocalAddress Ipv6Address,
ipv6TcpConnLocalPort INTEGER (0..65535),
ipv6TcpConnRemAddress Ipv6Address,
ipv6TcpConnRemPort INTEGER (0..65535),
ipv6TcpConnIfIndex Ipv6IfIndexOrZero,
Daniele Standards Track [Page 4]
RFC 2452 TCP MIB for IPv6 December 1998
ipv6TcpConnState INTEGER }
ipv6TcpConnLocalAddress OBJECT-TYPE
SYNTAX Ipv6Address
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"The local IPv6 address for this TCP connection. In
the case of a connection in the listen state which
is willing to accept connections for any IPv6
address associated with the managed node, the value
::0 is used."
::= { ipv6TcpConnEntry 1 }
ipv6TcpConnLocalPort OBJECT-TYPE
SYNTAX INTEGER (0..65535)
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"The local port number for this TCP connection."
::= { ipv6TcpConnEntry 2 }
ipv6TcpConnRemAddress OBJECT-TYPE
SYNTAX Ipv6Address
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"The remote IPv6 address for this TCP connection."
::= { ipv6TcpConnEntry 3 }
ipv6TcpConnRemPort OBJECT-TYPE
SYNTAX INTEGER (0..65535)
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"The remote port number for this TCP connection."
::= { ipv6TcpConnEntry 4 }
ipv6TcpConnIfIndex OBJECT-TYPE
SYNTAX Ipv6IfIndexOrZero
MAX-ACCESS not-accessible
STATUS current
DESCRIPTION
"An index object used to disambiguate conceptual rows in
the table, since the connection 4-tuple may not be unique.
If the connection's remote address (ipv6TcpConnRemAddress)
is a link-local address and the connection's local address
Daniele Standards Track [Page 5]
RFC 2452 TCP MIB for IPv6 December 1998
(ipv6TcpConnLocalAddress) is not a link-local address, this
object identifies a local interface on the same link as
the connection's remote link-local address.
Otherwise, this object identifies the local interface that
is associated with the ipv6TcpConnLocalAddress for this
TCP connection. If such a local interface cannot be determined,
this object should take on the value 0. (A possible example
of this would be if the value of ipv6TcpConnLocalAddress is ::0.)
The interface identified by a particular non-0 value of this
index is the same interface as identified by the same value
of ipv6IfIndex.
The value of this object must remain constant during the life
of the TCP connection."
::= { ipv6TcpConnEntry 5 }
ipv6TcpConnState OBJECT-TYPE
SYNTAX INTEGER {
closed(1),
listen(2),
synSent(3),
synReceived(4),
established(5),
finWait1(6),
finWait2(7),
closeWait(8),
lastAck(9),
closing(10),
timeWait(11),
deleteTCB(12) }
MAX-ACCESS read-write
STATUS current
DESCRIPTION
"The state of this TCP connection.
The only value which may be set by a management station is
deleteTCB(12). Accordingly, it is appropriate for an agent
to return an error response (`badValue' for SNMPv1, 'wrongValue'
for SNMPv2) if a management station attempts to set this
object to any other value.
If a management station sets this object to the value
deleteTCB(12), then this has the effect of deleting the TCB
(as defined in RFC 793) of the corresponding connection on
the managed node, resulting in immediate termination of the
connection.
Daniele Standards Track [Page 6]
RFC 2452 TCP MIB for IPv6 December 1998
As an implementation-specific option, a RST segment may be
sent from the managed node to the other TCP endpoint (note
however that RST segments are not sent reliably)."
::= { ipv6TcpConnEntry 6 }
--
-- conformance information
--
ipv6TcpConformance OBJECT IDENTIFIER ::= { ipv6TcpMIB 2 }
ipv6TcpCompliances OBJECT IDENTIFIER ::= { ipv6TcpConformance 1 }
ipv6TcpGroups OBJECT IDENTIFIER ::= { ipv6TcpConformance 2 }
-- compliance statements
ipv6TcpCompliance MODULE-COMPLIANCE
STATUS current
DESCRIPTION
"The compliance statement for SNMPv2 entities which
implement TCP over IPv6."
MODULE -- this module
MANDATORY-GROUPS { ipv6TcpGroup }
::= { ipv6TcpCompliances 1 }
ipv6TcpGroup OBJECT-GROUP
OBJECTS { -- these are defined in this module
-- ipv6TcpConnLocalAddress (not-accessible)
-- ipv6TcpConnLocalPort (not-accessible)
-- ipv6TcpConnRemAddress (not-accessible)
-- ipv6TcpConnRemPort (not-accessible)
-- ipv6TcpConnIfIndex (not-accessible)
ipv6TcpConnState }
STATUS current
DESCRIPTION
"The group of objects providing management of
TCP over IPv6."
::= { ipv6TcpGroups 1 }
END
Daniele Standards Track [Page 7]
RFC 2452 TCP MIB for IPv6 December 1998
7. Acknowledgments
This memo is a product of the IPng work group, and benefited
especially from the contributions of the following working group
members:
Dimitry Haskin Bay Networks
Margaret Forsythe Epilogue
Tim Hartrick Mentat
Frank Solensky FTP
Jack McCann DEC
8. References
[1] Information processing systems - Open Systems
Interconnection - Specification of Abstract Syntax
Notation One (ASN.1), International Organization for
Standardization. International Standard 8824,
(December, 1987).
[2] McCloghrie, K., Editor, "Structure of Management
Information for version 2 of the Simple Network
Management Protocol (SNMPv2)", RFC 1902, January 1996.
[TCP MIB] SNMPv2 Working Group, McCloghrie, K., Editor, "SNMPv2
Management Information Base for the Transmission
Control Protocol using SMIv2", RFC 2012, November 1996.
[IPV6 MIB TC] Haskin, D., and S. Onishi, "Management Information
Base for IP Version 6: Textual Conventions and General
Group", RFC 2465, December 1998.
[IPV6] Deering, S., and R. Hinden, "Internet Protocol, Version
6 (IPv6) Specification", RFC 2460, December 1998.
[RFC2274] Blumenthal, U., and B. Wijnen, "The User-Based Security
Model for Version 3 of the Simple Network Management
Protocol (SNMPv3)", RFC 2274, January 1998.
[RFC2275] Wijnen, B., Presuhn, R., and K. McCloghrie, "View-based
Access Control Model for the Simple Network Management
Protocol (SNMP)", RFC 2275, January 1998.
9. Security Considerations
This MIB contains a management object that has a MAX-ACCESS clause of
read-write and/or read-create. In particular, it is possible to
delete individual TCP control blocks (i.e., connections).
Daniele Standards Track [Page 8]
RFC 2452 TCP MIB for IPv6 December 1998
Consequently, anyone having the ability to issue a SET on this object
can impact the operation of the node.
There are a number of managed objects in this MIB that may be
considered to contain sensitive information in some environments.
For example, the MIB identifies the active TCP connections on the
node. Although this information might be considered sensitive in
some environments (i.e., to identify ports on which to launch
denial-of-service or other attacks), there are already other ways of
obtaining similar information. For example, sending a random TCP
packet to an unused port prompts the generation of a TCP reset
message.
Therefore, it may be important in some environments to control read
and/or write access to these objects and possibly to even encrypt the
values of these objects when sending them over the network via SNMP.
Not all versions of SNMP provide features for such a secure
environment. SNMPv1 by itself does not provide encryption or strong
authentication.
It is recommended that the implementors consider the security
features as provided by the SNMPv3 framework. Specifically, the use
of the User-based Security Model [RFC2274] and the View-based Access
Control Model [RFC2275] is recommended.
It is then a customer/user responsibility to ensure that the SNMP
entity giving access to an instance of this MIB, is properly
configured to give access to those objects only to those principals
(users) that have legitimate rights to access them.
10. Author's Address
Mike Daniele
Compaq Computer Corporation
110 Spit Brook Rd
Nashua, NH 03062
Phone: +1-603-884-1423
EMail: daniele@zk3.dec.com
Daniele Standards Track [Page 9]
RFC 2452 TCP MIB for IPv6 December 1998
11. Full Copyright Statement
Copyright (C) The Internet Society (1998). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Daniele Standards Track [Page 10]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,787 @@
Network Working Group M. Allman
Request for Comments: 2581 NASA Glenn/Sterling Software
Obsoletes: 2001 V. Paxson
Category: Standards Track ACIRI / ICSI
W. Stevens
Consultant
April 1999
TCP Congestion Control
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1999). All Rights Reserved.
Abstract
This document defines TCP's four intertwined congestion control
algorithms: slow start, congestion avoidance, fast retransmit, and
fast recovery. In addition, the document specifies how TCP should
begin transmission after a relatively long idle period, as well as
discussing various acknowledgment generation methods.
1. Introduction
This document specifies four TCP [Pos81] congestion control
algorithms: slow start, congestion avoidance, fast retransmit and
fast recovery. These algorithms were devised in [Jac88] and [Jac90].
Their use with TCP is standardized in [Bra89].
This document is an update of [Ste97]. In addition to specifying the
congestion control algorithms, this document specifies what TCP
connections should do after a relatively long idle period, as well as
specifying and clarifying some of the issues pertaining to TCP ACK
generation.
Note that [Ste94] provides examples of these algorithms in action and
[WS95] provides an explanation of the source code for the BSD
implementation of these algorithms.
Allman, et. al. Standards Track [Page 1]
RFC 2581 TCP Congestion Control April 1999
This document is organized as follows. Section 2 provides various
definitions which will be used throughout the document. Section 3
provides a specification of the congestion control algorithms.
Section 4 outlines concerns related to the congestion control
algorithms and finally, section 5 outlines security considerations.
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
document are to be interpreted as described in [Bra97].
2. Definitions
This section provides the definition of several terms that will be
used throughout the remainder of this document.
SEGMENT:
A segment is ANY TCP/IP data or acknowledgment packet (or both).
SENDER MAXIMUM SEGMENT SIZE (SMSS): The SMSS is the size of the
largest segment that the sender can transmit. This value can be
based on the maximum transmission unit of the network, the path
MTU discovery [MD90] algorithm, RMSS (see next item), or other
factors. The size does not include the TCP/IP headers and
options.
RECEIVER MAXIMUM SEGMENT SIZE (RMSS): The RMSS is the size of the
largest segment the receiver is willing to accept. This is the
value specified in the MSS option sent by the receiver during
connection startup. Or, if the MSS option is not used, 536 bytes
[Bra89]. The size does not include the TCP/IP headers and
options.
FULL-SIZED SEGMENT: A segment that contains the maximum number of
data bytes permitted (i.e., a segment containing SMSS bytes of
data).
RECEIVER WINDOW (rwnd): The most recently advertised receiver window.
CONGESTION WINDOW (cwnd): A TCP state variable that limits the
amount of data a TCP can send. At any given time, a TCP MUST NOT
send data with a sequence number higher than the sum of the
highest acknowledged sequence number and the minimum of cwnd and
rwnd.
INITIAL WINDOW (IW): The initial window is the size of the sender's
congestion window after the three-way handshake is completed.
Allman, et. al. Standards Track [Page 2]
RFC 2581 TCP Congestion Control April 1999
LOSS WINDOW (LW): The loss window is the size of the congestion
window after a TCP sender detects loss using its retransmission
timer.
RESTART WINDOW (RW): The restart window is the size of the
congestion window after a TCP restarts transmission after an idle
period (if the slow start algorithm is used; see section 4.1 for
more discussion).
FLIGHT SIZE: The amount of data that has been sent but not yet
acknowledged.
3. Congestion Control Algorithms
This section defines the four congestion control algorithms: slow
start, congestion avoidance, fast retransmit and fast recovery,
developed in [Jac88] and [Jac90]. In some situations it may be
beneficial for a TCP sender to be more conservative than the
algorithms allow, however a TCP MUST NOT be more aggressive than the
following algorithms allow (that is, MUST NOT send data when the
value of cwnd computed by the following algorithms would not allow
the data to be sent).
3.1 Slow Start and Congestion Avoidance
The slow start and congestion avoidance algorithms MUST be used by a
TCP sender to control the amount of outstanding data being injected
into the network. To implement these algorithms, two variables are
added to the TCP per-connection state. The congestion window (cwnd)
is a sender-side limit on the amount of data the sender can transmit
into the network before receiving an acknowledgment (ACK), while the
receiver's advertised window (rwnd) is a receiver-side limit on the
amount of outstanding data. The minimum of cwnd and rwnd governs
data transmission.
Another state variable, the slow start threshold (ssthresh), is used
to determine whether the slow start or congestion avoidance algorithm
is used to control data transmission, as discussed below.
Beginning transmission into a network with unknown conditions
requires TCP to slowly probe the network to determine the available
capacity, in order to avoid congesting the network with an
inappropriately large burst of data. The slow start algorithm is
used for this purpose at the beginning of a transfer, or after
repairing loss detected by the retransmission timer.
Allman, et. al. Standards Track [Page 3]
RFC 2581 TCP Congestion Control April 1999
IW, the initial value of cwnd, MUST be less than or equal to 2*SMSS
bytes and MUST NOT be more than 2 segments.
We note that a non-standard, experimental TCP extension allows that a
TCP MAY use a larger initial window (IW), as defined in equation 1
[AFP98]:
IW = min (4*SMSS, max (2*SMSS, 4380 bytes)) (1)
With this extension, a TCP sender MAY use a 3 or 4 segment initial
window, provided the combined size of the segments does not exceed
4380 bytes. We do NOT allow this change as part of the standard
defined by this document. However, we include discussion of (1) in
the remainder of this document as a guideline for those experimenting
with the change, rather than conforming to the present standards for
TCP congestion control.
The initial value of ssthresh MAY be arbitrarily high (for example,
some implementations use the size of the advertised window), but it
may be reduced in response to congestion. The slow start algorithm
is used when cwnd < ssthresh, while the congestion avoidance
algorithm is used when cwnd > ssthresh. When cwnd and ssthresh are
equal the sender may use either slow start or congestion avoidance.
During slow start, a TCP increments cwnd by at most SMSS bytes for
each ACK received that acknowledges new data. Slow start ends when
cwnd exceeds ssthresh (or, optionally, when it reaches it, as noted
above) or when congestion is observed.
During congestion avoidance, cwnd is incremented by 1 full-sized
segment per round-trip time (RTT). Congestion avoidance continues
until congestion is detected. One formula commonly used to update
cwnd during congestion avoidance is given in equation 2:
cwnd += SMSS*SMSS/cwnd (2)
This adjustment is executed on every incoming non-duplicate ACK.
Equation (2) provides an acceptable approximation to the underlying
principle of increasing cwnd by 1 full-sized segment per RTT. (Note
that for a connection in which the receiver acknowledges every data
segment, (2) proves slightly more aggressive than 1 segment per RTT,
and for a receiver acknowledging every-other packet, (2) is less
aggressive.)
Allman, et. al. Standards Track [Page 4]
RFC 2581 TCP Congestion Control April 1999
Implementation Note: Since integer arithmetic is usually used in TCP
implementations, the formula given in equation 2 can fail to increase
cwnd when the congestion window is very large (larger than
SMSS*SMSS). If the above formula yields 0, the result SHOULD be
rounded up to 1 byte.
Implementation Note: older implementations have an additional
additive constant on the right-hand side of equation (2). This is
incorrect and can actually lead to diminished performance [PAD+98].
Another acceptable way to increase cwnd during congestion avoidance
is to count the number of bytes that have been acknowledged by ACKs
for new data. (A drawback of this implementation is that it requires
maintaining an additional state variable.) When the number of bytes
acknowledged reaches cwnd, then cwnd can be incremented by up to SMSS
bytes. Note that during congestion avoidance, cwnd MUST NOT be
increased by more than the larger of either 1 full-sized segment per
RTT, or the value computed using equation 2.
Implementation Note: some implementations maintain cwnd in units of
bytes, while others in units of full-sized segments. The latter will
find equation (2) difficult to use, and may prefer to use the
counting approach discussed in the previous paragraph.
When a TCP sender detects segment loss using the retransmission
timer, the value of ssthresh MUST be set to no more than the value
given in equation 3:
ssthresh = max (FlightSize / 2, 2*SMSS) (3)
As discussed above, FlightSize is the amount of outstanding data in
the network.
Implementation Note: an easy mistake to make is to simply use cwnd,
rather than FlightSize, which in some implementations may
incidentally increase well beyond rwnd.
Furthermore, upon a timeout cwnd MUST be set to no more than the loss
window, LW, which equals 1 full-sized segment (regardless of the
value of IW). Therefore, after retransmitting the dropped segment
the TCP sender uses the slow start algorithm to increase the window
from 1 full-sized segment to the new value of ssthresh, at which
point congestion avoidance again takes over.
Allman, et. al. Standards Track [Page 5]
RFC 2581 TCP Congestion Control April 1999
3.2 Fast Retransmit/Fast Recovery
A TCP receiver SHOULD send an immediate duplicate ACK when an out-
of-order segment arrives. The purpose of this ACK is to inform the
sender that a segment was received out-of-order and which sequence
number is expected. From the sender's perspective, duplicate ACKs
can be caused by a number of network problems. First, they can be
caused by dropped segments. In this case, all segments after the
dropped segment will trigger duplicate ACKs. Second, duplicate ACKs
can be caused by the re-ordering of data segments by the network (not
a rare event along some network paths [Pax97]). Finally, duplicate
ACKs can be caused by replication of ACK or data segments by the
network. In addition, a TCP receiver SHOULD send an immediate ACK
when the incoming segment fills in all or part of a gap in the
sequence space. This will generate more timely information for a
sender recovering from a loss through a retransmission timeout, a
fast retransmit, or an experimental loss recovery algorithm, such as
NewReno [FH98].
The TCP sender SHOULD use the "fast retransmit" algorithm to detect
and repair loss, based on incoming duplicate ACKs. The fast
retransmit algorithm uses the arrival of 3 duplicate ACKs (4
identical ACKs without the arrival of any other intervening packets)
as an indication that a segment has been lost. After receiving 3
duplicate ACKs, TCP performs a retransmission of what appears to be
the missing segment, without waiting for the retransmission timer to
expire.
After the fast retransmit algorithm sends what appears to be the
missing segment, the "fast recovery" algorithm governs the
transmission of new data until a non-duplicate ACK arrives. The
reason for not performing slow start is that the receipt of the
duplicate ACKs not only indicates that a segment has been lost, but
also that segments are most likely leaving the network (although a
massive segment duplication by the network can invalidate this
conclusion). In other words, since the receiver can only generate a
duplicate ACK when a segment has arrived, that segment has left the
network and is in the receiver's buffer, so we know it is no longer
consuming network resources. Furthermore, since the ACK "clock"
[Jac88] is preserved, the TCP sender can continue to transmit new
segments (although transmission must continue using a reduced cwnd).
The fast retransmit and fast recovery algorithms are usually
implemented together as follows.
1. When the third duplicate ACK is received, set ssthresh to no more
than the value given in equation 3.
Allman, et. al. Standards Track [Page 6]
RFC 2581 TCP Congestion Control April 1999
2. Retransmit the lost segment and set cwnd to ssthresh plus 3*SMSS.
This artificially "inflates" the congestion window by the number
of segments (three) that have left the network and which the
receiver has buffered.
3. For each additional duplicate ACK received, increment cwnd by
SMSS. This artificially inflates the congestion window in order
to reflect the additional segment that has left the network.
4. Transmit a segment, if allowed by the new value of cwnd and the
receiver's advertised window.
5. When the next ACK arrives that acknowledges new data, set cwnd to
ssthresh (the value set in step 1). This is termed "deflating"
the window.
This ACK should be the acknowledgment elicited by the
retransmission from step 2, one RTT after the retransmission
(though it may arrive sooner in the presence of significant out-
of-order delivery of data segments at the receiver).
Additionally, this ACK should acknowledge all the intermediate
segments sent between the lost segment and the receipt of the
third duplicate ACK, if none of these were lost.
Note: This algorithm is known to generally not recover very
efficiently from multiple losses in a single flight of packets
[FF96]. One proposed set of modifications to address this problem
can be found in [FH98].
4. Additional Considerations
4.1 Re-starting Idle Connections
A known problem with the TCP congestion control algorithms described
above is that they allow a potentially inappropriate burst of traffic
to be transmitted after TCP has been idle for a relatively long
period of time. After an idle period, TCP cannot use the ACK clock
to strobe new segments into the network, as all the ACKs have drained
from the network. Therefore, as specified above, TCP can potentially
send a cwnd-size line-rate burst into the network after an idle
period.
[Jac88] recommends that a TCP use slow start to restart transmission
after a relatively long idle period. Slow start serves to restart
the ACK clock, just as it does at the beginning of a transfer. This
mechanism has been widely deployed in the following manner. When TCP
has not received a segment for more than one retransmission timeout,
cwnd is reduced to the value of the restart window (RW) before
Allman, et. al. Standards Track [Page 7]
RFC 2581 TCP Congestion Control April 1999
transmission begins.
For the purposes of this standard, we define RW = IW.
We note that the non-standard experimental extension to TCP defined
in [AFP98] defines RW = min(IW, cwnd), with the definition of IW
adjusted per equation (1) above.
Using the last time a segment was received to determine whether or
not to decrease cwnd fails to deflate cwnd in the common case of
persistent HTTP connections [HTH98]. In this case, a WWW server
receives a request before transmitting data to the WWW browser. The
reception of the request makes the test for an idle connection fail,
and allows the TCP to begin transmission with a possibly
inappropriately large cwnd.
Therefore, a TCP SHOULD set cwnd to no more than RW before beginning
transmission if the TCP has not sent data in an interval exceeding
the retransmission timeout.
4.2 Generating Acknowledgments
The delayed ACK algorithm specified in [Bra89] SHOULD be used by a
TCP receiver. When used, a TCP receiver MUST NOT excessively delay
acknowledgments. Specifically, an ACK SHOULD be generated for at
least every second full-sized segment, and MUST be generated within
500 ms of the arrival of the first unacknowledged packet.
The requirement that an ACK "SHOULD" be generated for at least every
second full-sized segment is listed in [Bra89] in one place as a
SHOULD and another as a MUST. Here we unambiguously state it is a
SHOULD. We also emphasize that this is a SHOULD, meaning that an
implementor should indeed only deviate from this requirement after
careful consideration of the implications. See the discussion of
"Stretch ACK violation" in [PAD+98] and the references therein for a
discussion of the possible performance problems with generating ACKs
less frequently than every second full-sized segment.
In some cases, the sender and receiver may not agree on what
constitutes a full-sized segment. An implementation is deemed to
comply with this requirement if it sends at least one acknowledgment
every time it receives 2*RMSS bytes of new data from the sender,
where RMSS is the Maximum Segment Size specified by the receiver to
the sender (or the default value of 536 bytes, per [Bra89], if the
receiver does not specify an MSS option during connection
establishment). The sender may be forced to use a segment size less
than RMSS due to the maximum transmission unit (MTU), the path MTU
discovery algorithm or other factors. For instance, consider the
Allman, et. al. Standards Track [Page 8]
RFC 2581 TCP Congestion Control April 1999
case when the receiver announces an RMSS of X bytes but the sender
ends up using a segment size of Y bytes (Y < X) due to path MTU
discovery (or the sender's MTU size). The receiver will generate
stretch ACKs if it waits for 2*X bytes to arrive before an ACK is
sent. Clearly this will take more than 2 segments of size Y bytes.
Therefore, while a specific algorithm is not defined, it is desirable
for receivers to attempt to prevent this situation, for example by
acknowledging at least every second segment, regardless of size.
Finally, we repeat that an ACK MUST NOT be delayed for more than 500
ms waiting on a second full-sized segment to arrive.
Out-of-order data segments SHOULD be acknowledged immediately, in
order to accelerate loss recovery. To trigger the fast retransmit
algorithm, the receiver SHOULD send an immediate duplicate ACK when
it receives a data segment above a gap in the sequence space. To
provide feedback to senders recovering from losses, the receiver
SHOULD send an immediate ACK when it receives a data segment that
fills in all or part of a gap in the sequence space.
A TCP receiver MUST NOT generate more than one ACK for every incoming
segment, other than to update the offered window as the receiving
application consumes new data [page 42, Pos81][Cla82].
4.3 Loss Recovery Mechanisms
A number of loss recovery algorithms that augment fast retransmit and
fast recovery have been suggested by TCP researchers. While some of
these algorithms are based on the TCP selective acknowledgment (SACK)
option [MMFR96], such as [FF96,MM96a,MM96b], others do not require
SACKs [Hoe96,FF96,FH98]. The non-SACK algorithms use "partial
acknowledgments" (ACKs which cover new data, but not all the data
outstanding when loss was detected) to trigger retransmissions.
While this document does not standardize any of the specific
algorithms that may improve fast retransmit/fast recovery, these
enhanced algorithms are implicitly allowed, as long as they follow
the general principles of the basic four algorithms outlined above.
Therefore, when the first loss in a window of data is detected,
ssthresh MUST be set to no more than the value given by equation (3).
Second, until all lost segments in the window of data in question are
repaired, the number of segments transmitted in each RTT MUST be no
more than half the number of outstanding segments when the loss was
detected. Finally, after all loss in the given window of segments
has been successfully retransmitted, cwnd MUST be set to no more than
ssthresh and congestion avoidance MUST be used to further increase
cwnd. Loss in two successive windows of data, or the loss of a
retransmission, should be taken as two indications of congestion and,
therefore, cwnd (and ssthresh) MUST be lowered twice in this case.
Allman, et. al. Standards Track [Page 9]
RFC 2581 TCP Congestion Control April 1999
The algorithms outlined in [Hoe96,FF96,MM96a,MM96b] follow the
principles of the basic four congestion control algorithms outlined
in this document.
5. Security Considerations
This document requires a TCP to diminish its sending rate in the
presence of retransmission timeouts and the arrival of duplicate
acknowledgments. An attacker can therefore impair the performance of
a TCP connection by either causing data packets or their
acknowledgments to be lost, or by forging excessive duplicate
acknowledgments. Causing two congestion control events back-to-back
will often cut ssthresh to its minimum value of 2*SMSS, causing the
connection to immediately enter the slower-performing congestion
avoidance phase.
The Internet to a considerable degree relies on the correct
implementation of these algorithms in order to preserve network
stability and avoid congestion collapse. An attacker could cause TCP
endpoints to respond more aggressively in the face of congestion by
forging excessive duplicate acknowledgments or excessive
acknowledgments for new data. Conceivably, such an attack could
drive a portion of the network into congestion collapse.
6. Changes Relative to RFC 2001
This document has been extensively rewritten editorially and it is
not feasible to itemize the list of changes between the two
documents. The intention of this document is not to change any of the
recommendations given in RFC 2001, but to further clarify cases that
were not discussed in detail in RFC 2001. Specifically, this document
suggests what TCP connections should do after a relatively long idle
period, as well as specifying and clarifying some of the issues
pertaining to TCP ACK generation. Finally, the allowable upper bound
for the initial congestion window has also been raised from one to
two segments.
Acknowledgments
The four algorithms that are described were developed by Van
Jacobson.
Some of the text from this document is taken from "TCP/IP
Illustrated, Volume 1: The Protocols" by W. Richard Stevens
(Addison-Wesley, 1994) and "TCP/IP Illustrated, Volume 2: The
Implementation" by Gary R. Wright and W. Richard Stevens (Addison-
Wesley, 1995). This material is used with the permission of
Addison-Wesley.
Allman, et. al. Standards Track [Page 10]
RFC 2581 TCP Congestion Control April 1999
Neal Cardwell, Sally Floyd, Craig Partridge and Joe Touch contributed
a number of helpful suggestions.
References
[AFP98] Allman, M., Floyd, S. and C. Partridge, "Increasing TCP's
Initial Window Size", RFC 2414, September 1998.
[Bra89] Braden, R., "Requirements for Internet Hosts --
Communication Layers", STD 3, RFC 1122, October 1989.
[Bra97] Bradner, S., "Key words for use in RFCs to Indicate
Requirement Levels", BCP 14, RFC 2119, March 1997.
[Cla82] Clark, D., "Window and Acknowledgment Strategy in TCP", RFC
813, July 1982.
[FF96] Fall, K. and S. Floyd, "Simulation-based Comparisons of
Tahoe, Reno and SACK TCP", Computer Communication Review,
July 1996. ftp://ftp.ee.lbl.gov/papers/sacks.ps.Z.
[FH98] Floyd, S. and T. Henderson, "The NewReno Modification to
TCP's Fast Recovery Algorithm", RFC 2582, April 1999.
[Flo94] Floyd, S., "TCP and Successive Fast Retransmits. Technical
report", October 1994.
ftp://ftp.ee.lbl.gov/papers/fastretrans.ps.
[Hoe96] Hoe, J., "Improving the Start-up Behavior of a Congestion
Control Scheme for TCP", In ACM SIGCOMM, August 1996.
[HTH98] Hughes, A., Touch, J. and J. Heidemann, "Issues in TCP
Slow-Start Restart After Idle", Work in Progress.
[Jac88] Jacobson, V., "Congestion Avoidance and Control", Computer
Communication Review, vol. 18, no. 4, pp. 314-329, Aug.
1988. ftp://ftp.ee.lbl.gov/papers/congavoid.ps.Z.
[Jac90] Jacobson, V., "Modified TCP Congestion Avoidance Algorithm",
end2end-interest mailing list, April 30, 1990.
ftp://ftp.isi.edu/end2end/end2end-interest-1990.mail.
[MD90] Mogul, J. and S. Deering, "Path MTU Discovery", RFC 1191,
November 1990.
Allman, et. al. Standards Track [Page 11]
RFC 2581 TCP Congestion Control April 1999
[MM96a] Mathis, M. and J. Mahdavi, "Forward Acknowledgment: Refining
TCP Congestion Control", Proceedings of SIGCOMM'96, August,
1996, Stanford, CA. Available
from http://www.psc.edu/networking/papers/papers.html
[MM96b] Mathis, M. and J. Mahdavi, "TCP Rate-Halving with Bounding
Parameters", Technical report. Available from
http://www.psc.edu/networking/papers/FACKnotes/current.
[MMFR96] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP
Selective Acknowledgement Options", RFC 2018, October 1996.
[PAD+98] Paxson, V., Allman, M., Dawson, S., Fenner, W., Griner, J.,
Heavens, I., Lahey, K., Semke, J. and B. Volz, "Known TCP
Implementation Problems", RFC 2525, March 1999.
[Pax97] Paxson, V., "End-to-End Internet Packet Dynamics",
Proceedings of SIGCOMM '97, Cannes, France, Sep. 1997.
[Pos81] Postel, J., "Transmission Control Protocol", STD 7, RFC 793,
September 1981.
[Ste94] Stevens, W., "TCP/IP Illustrated, Volume 1: The Protocols",
Addison-Wesley, 1994.
[Ste97] Stevens, W., "TCP Slow Start, Congestion Avoidance, Fast
Retransmit, and Fast Recovery Algorithms", RFC 2001, January
1997.
[WS95] Wright, G. and W. Stevens, "TCP/IP Illustrated, Volume 2:
The Implementation", Addison-Wesley, 1995.
Allman, et. al. Standards Track [Page 12]
RFC 2581 TCP Congestion Control April 1999
Authors' Addresses
Mark Allman
NASA Glenn Research Center/Sterling Software
Lewis Field
21000 Brookpark Rd. MS 54-2
Cleveland, OH 44135
216-433-6586
EMail: mallman@grc.nasa.gov
http://roland.grc.nasa.gov/~mallman
Vern Paxson
ACIRI / ICSI
1947 Center Street
Suite 600
Berkeley, CA 94704-1198
Phone: +1 510/642-4274 x302
EMail: vern@aciri.org
W. Richard Stevens
1202 E. Paseo del Zorro
Tucson, AZ 85718
520-297-9416
EMail: rstevens@kohala.com
http://www.kohala.com/~rstevens
Allman, et. al. Standards Track [Page 13]
RFC 2581 TCP Congestion Control April 1999
Full Copyright Statement
Copyright (C) The Internet Society (1999). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Allman, et. al. Standards Track [Page 14]

View File

@ -0,0 +1,507 @@
Network Working Group D. Borman
Request for Comments: 2675 Berkeley Software Design
Obsoletes: 2147 S. Deering
Category: Standards Track Cisco
R. Hinden
Nokia
August 1999
IPv6 Jumbograms
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (1999). All Rights Reserved.
Abstract
A "jumbogram" is an IPv6 packet containing a payload longer than
65,535 octets. This document describes the IPv6 Jumbo Payload
option, which provides the means of specifying such large payload
lengths. It also describes the changes needed to TCP and UDP to make
use of jumbograms.
Jumbograms are relevant only to IPv6 nodes that may be attached to
links with a link MTU greater than 65,575 octets, and need not be
implemented or understood by IPv6 nodes that do not support
attachment to links with such large MTUs.
1. Introduction
jumbo (jum'bO),
n., pl. -bos, adj.
-n.
1. a person, animal, or thing very large of its kind.
-adj.
2. very large: the jumbo box of cereal.
[1800-10; orig. uncert.; popularized as the name of a large
elephant purchased and exhibited by P.T. Barnum in 1882]
-- www.infoplease.com
Borman, et al. Standards Track [Page 1]
RFC 2675 IPv6 Jumbograms August 1999
The IPv6 header [IPv6] has a 16-bit Payload Length field and,
therefore, supports payloads up to 65,535 octets long. This document
specifies an IPv6 hop-by-hop option, called the Jumbo Payload option,
that carries a 32-bit length field in order to allow transmission of
IPv6 packets with payloads between 65,536 and 4,294,967,295 octets in
length. Packets with such long payloads are referred to as
"jumbograms".
The Jumbo Payload option is relevant only for IPv6 nodes that may be
attached to links with a link MTU greater than 65,575 octets (that
is, 65,535 + 40, where 40 octets is the size of the IPv6 header).
The Jumbo Payload option need not be implemented or understood by
IPv6 nodes that do not support attachment to links with MTU greater
than 65,575.
On links with configurable MTUs, the MTU must not be configured to a
value greater than 65,575 octets if there are nodes attached to that
link that do not support the Jumbo Payload option and it can not be
guaranteed that the Jumbo Payload option will not be sent to those
nodes.
The UDP header [UDP] has a 16-bit Length field which prevents it from
making use of jumbograms, and though the TCP header [TCP] does not
have a Length field, both the TCP MSS option and the TCP Urgent field
are constrained to 16 bits. This document specifies some simple
enhancements to TCP and UDP to enable them to make use of jumbograms.
An implementation of TCP or UDP on an IPv6 node that supports the
Jumbo Payload option must include the enhancements specified here.
Note: The 16 bit checksum used by UDP and TCP becomes less accurate
as the length of the data being checksummed is increased.
Application designers may want to take this into consideration.
1.1 Document History
This document merges and updates material that was previously
published in two separate documents:
- The specification of the Jumbo Payload option previously appeared
as part of the IPv6 specification in RFC 1883. RFC 1883 has been
superseded by RFC 2460, which no longer includes specification of
the Jumbo Payload option.
- The specification of TCP and UDP enhancements to support
jumbograms previously appeared as RFC 2147. RFC 2147 is obsoleted
by this document.
Borman, et al. Standards Track [Page 2]
RFC 2675 IPv6 Jumbograms August 1999
2. Format of the Jumbo Payload Option
The Jumbo Payload option is carried in an IPv6 Hop-by-Hop Options
header, immediately following the IPv6 header. This option has an
alignment requirement of 4n + 2. (See [IPv6, Section 4.2] for
discussion of option alignment.) The option has the following
format:
                                +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
                                |  Option Type  |  Opt Data Len |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|                     Jumbo Payload Length                      |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Option Type 8-bit value C2 (hexadecimal).
Opt Data Len 8-bit value 4.
Jumbo Payload Length 32-bit unsigned integer. Length of the IPv6
packet in octets, excluding the IPv6 header
but including the Hop-by-Hop Options header
and any other extension headers present.
Must be greater than 65,535.
3. Usage of the Jumbo Payload Option
The Payload Length field in the IPv6 header must be set to zero in
every packet that carries the Jumbo Payload option.
If a node that understands the Jumbo Payload option receives a packet
whose IPv6 header carries a Payload Length of zero and a Next Header
value of zero (meaning that a Hop-by-Hop Options header follows), and
whose link-layer framing indicates the presence of octets beyond the
IPv6 header, the node must proceed to process the Hop-by-Hop Options
header in order to determine the actual length of the payload from
the Jumbo Payload option.
The Jumbo Payload option must not be used in a packet that carries a
Fragment header.
Higher-layer protocols that use the IPv6 Payload Length field to
compute the value of the Upper-Layer Packet Length field in the
checksum pseudo-header described in [IPv6, Section 8.1] must instead
use the Jumbo Payload Length field for that computation, for packets
that carry the Jumbo Payload option.
Borman, et al. Standards Track [Page 3]
RFC 2675 IPv6 Jumbograms August 1999
Nodes that understand the Jumbo Payload option are required to detect
a number of possible format errors, and if the erroneous packet was
not destined to a multicast address, report the error by sending an
ICMP Parameter Problem message [ICMPv6] to the packet's source. The
following list of errors specifies the values to be used in the Code
and Pointer fields of the Parameter Problem message:
error: IPv6 Payload Length = 0 and
IPv6 Next Header = Hop-by-Hop Options and
Jumbo Payload option not present
Code: 0
Pointer: high-order octet of the IPv6 Payload Length
error: IPv6 Payload Length != 0 and
Jumbo Payload option present
Code: 0
Pointer: Option Type field of the Jumbo Payload option
error: Jumbo Payload option present and
Jumbo Payload Length < 65,536
Code: 0
Pointer: high-order octet of the Jumbo Payload Length
error: Jumbo Payload option present and
Fragment header present
Code: 0
Pointer: high-order octet of the Fragment header.
A node that does not understand the Jumbo Payload option is expected
to respond to erroneously-received jumbograms as follows, according
to the IPv6 specification:
error: IPv6 Payload Length = 0 and
IPv6 Next Header = Hop-by-Hop Options
Code: 0
Pointer: high-order octet of the IPv6 Payload Length
error: IPv6 Payload Length != 0 and
Jumbo Payload option present
Code: 2
Pointer: Option Type field of the Jumbo Payload option
Borman, et al. Standards Track [Page 4]
RFC 2675 IPv6 Jumbograms August 1999
4. UDP Jumbograms
The 16-bit Length field of the UDP header limits the total length of
a UDP packet (that is, a UDP header plus data) to no greater than
65,535 octets. This document specifies the following modification of
UDP to relax that limit: UDP packets longer than 65,535 octets may be
sent by setting the UDP Length field to zero, and letting the
receiver derive the actual UDP packet length from the IPv6 payload
length. (Note that, prior to this modification, zero was not a legal
value for the UDP Length field, because the UDP packet length
includes the UDP header and therefore has a minimum value of 8.)
The specific requirements for sending a UDP jumbogram are as follows:
When sending a UDP packet, if and only if the length of the UDP
header plus UDP data is greater than 65,535, set the Length field
in the UDP header to zero.
The IPv6 packet carrying such a large UDP packet will necessarily
include a Jumbo Payload option in a Hop-by-Hop Options header; set
the Jumbo Payload Length field of that option to be the actual
length of the UDP header plus data, plus the length of all IPv6
extension headers present between the IPv6 header and the UDP
header.
For generating the UDP checksum, use the actual length of the UDP
header plus data, NOT zero, in the checksum pseudo-header [IPv6,
Section 8.1].
The specific requirements for receiving a UDP jumbogram are as
follows:
When receiving a UDP packet, if and only if the Length field in
the UDP header is zero, calculate the actual length of the UDP
header plus data from the IPv6 Jumbo Payload Length field minus
the length of all extension headers present between the IPv6
header and the UDP header.
In the unexpected case that the UDP Length field is zero but no
Jumbo Payload option is present (i.e., the IPv6 packet is not a
jumbogram), use the Payload Length field in the IPv6 header, in
place of the Jumbo Payload Length field, in the above calculation.
For verifying the received UDP checksum, use the calculated length
of the UDP header plus data, NOT zero, in the checksum pseudo-
header.
Borman, et al. Standards Track [Page 5]
RFC 2675 IPv6 Jumbograms August 1999
5. TCP Jumbograms
Because there is no length field in the TCP header, there is nothing
limiting the length of an individual TCP packet. However, the MSS
value that is negotiated at the beginning of the connection limits
the largest TCP packet that can be sent, and the Urgent Pointer
cannot reference data beyond 65,535 bytes.
5.1 TCP MSS
When determining what MSS value to send, if the MTU of the directly
attached interface minus 60 [IPv6, Section 8.3] is greater than or
equal to 65,535, then set the MSS value to 65,535.
When an MSS value of 65,535 is received, it is to be treated as
infinity. The actual MSS is determined by subtracting 60 from the
value learned by performing Path MTU Discovery [MTU-DISC] over the
path to the TCP peer.
5.2 TCP Urgent Pointer
The Urgent Pointer problem could be fixed by adding a TCP Urgent
Pointer Option. However, since it is unlikely that applications
using jumbograms will also use Urgent Pointers, a less intrusive
change similar to the MSS change will suffice.
When a TCP packet is to be sent with an Urgent Pointer (i.e., the URG
bit set), first calculate the offset from the Sequence Number to the
Urgent Pointer. If the offset is less than 65,535, fill in the
Urgent field and continue with the normal TCP processing. If the
offset is greater than 65,535, and the offset is greater than or
equal to the length of the TCP data, fill in the Urgent Pointer with
65,535 and continue with the normal TCP processing. Otherwise, the
TCP packet must be split into two pieces. The first piece contains
data up to, but not including the data pointed to by the Urgent
Pointer, and the Urgent field is set to 65,535 to indicate that the
Urgent Pointer is beyond the end of this packet. The second piece
can then be sent with the Urgent field set normally.
Note: The first piece does not have to include all of the data up to
the Urgent Pointer. It can be shorter, just as long as it ends
within 65,534 bytes of the Urgent Pointer, so that the offset to the
Urgent Pointer in the second piece will be less than 65,535 bytes.
For TCP input processing, when a TCP packet is received with the URG
bit set and an Urgent field of 65,535, the Urgent Pointer is
calculated using an offset equal to the length of the TCP data,
rather than the offset in the Urgent field.
Borman, et al. Standards Track [Page 6]
RFC 2675 IPv6 Jumbograms August 1999
It should also be noted that though the TCP window is only 16-bits,
larger windows can be used through use of the TCP Window Scale option
[TCP-EXT].
6. Security Considerations
The Jumbo Payload option and TCP/UDP jumbograms do not introduce any
known new security concerns.
7. Authors' Addresses
David A. Borman
Berkeley Software Design, Inc.
4719 Weston Hills Drive
Eagan, MN 55123
USA
Phone: +1 612 405 8194
EMail: dab@bsdi.com
Stephen E. Deering
Cisco Systems, Inc.
170 West Tasman Drive
San Jose, CA 95134-1706
USA
Phone: +1 408 527 8213
EMail: deering@cisco.com
Robert M. Hinden
Nokia
313 Fairchild Drive
Mountain View, CA 94043
USA
Phone: +1 650 625 2004
EMail: hinden@iprg.nokia.com
Borman, et al. Standards Track [Page 7]
RFC 2675 IPv6 Jumbograms August 1999
8. References
[ICMPv6] Conta, A. and S. Deering, "ICMP for the Internet Protocol
Version 6 (IPv6)", RFC 2463, December 1998.
[IPv6] Deering, S. and R. Hinden, "Internet Protocol Version 6
(IPv6) Specification", RFC 2460, December 1998.
[MTU-DISC] McCann, J., Deering, S. and J. Mogul, "Path MTU Discovery
for IP Version 6", RFC 1981, August 1996.
[TCP] Postel, J., "Transmission Control Protocol", STD 7, RFC
793, September 1981.
[TCP-EXT] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions
for High Performance", RFC 1323, May 1992.
[UDP] Postel, J., "User Datagram Protocol", STD 6, RFC 768,
August 1980.
Borman, et al. Standards Track [Page 8]
RFC 2675 IPv6 Jumbograms August 1999
9. Full Copyright Statement
Copyright (C) The Internet Society (1999). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Borman, et al. Standards Track [Page 9]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,619 @@
Network Working Group M. Handley
Request for Comments: 2861 J. Padhye
Category: Experimental S. Floyd
ACIRI
June 2000
TCP Congestion Window Validation
Status of this Memo
This memo defines an Experimental Protocol for the Internet
community. It does not specify an Internet standard of any kind.
Discussion and suggestions for improvement are requested.
Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2000). All Rights Reserved.
Abstract
TCP's congestion window controls the number of packets a TCP flow may
have in the network at any time. However, long periods when the
sender is idle or application-limited can lead to the invalidation of
the congestion window, in that the congestion window no longer
reflects current information about the state of the network. This
document describes a simple modification to TCP's congestion control
algorithms to decay the congestion window cwnd after the transition
from a sufficiently-long application-limited period, while using the
slow-start threshold ssthresh to save information about the previous
value of the congestion window.
An invalid congestion window also results when the congestion window
is increased (i.e., in TCP's slow-start or congestion avoidance
phases) during application-limited periods, when the previous value
of the congestion window might never have been fully utilized. We
propose that the TCP sender should not increase the congestion window
when the TCP sender has been application-limited (and therefore has
not fully used the current congestion window). We have explored
these algorithms both with simulations and with experiments from an
implementation in FreeBSD.
1. Conventions and Acronyms
The keywords MUST, MUST NOT, REQUIRED, SHALL, SHALL NOT, SHOULD,
SHOULD NOT, RECOMMENDED, MAY, and OPTIONAL, when they appear in this
document, are to be interpreted as described in [B97].
Handley, et al. Experimental [Page 1]
RFC 2861 TCP Congestion Window Validation June 2000
2. Introduction
TCP's congestion window controls the number of packets a TCP flow may
have in the network at any time. The congestion window is set using
an Additive-Increase, Multiplicative-Decrease (AIMD) mechanism that
probes for available bandwidth, dynamically adapting to changing
network conditions. This AIMD mechanism works well when the sender
continually has data to send, as is typically the case for TCP used
for bulk-data transfer. In contrast, for TCP used with telnet
applications, the data sender often has little or no data to send,
and the sending rate is often determined by the rate at which data is
generated by the user. With the advent of the web, including
developments such as TCP senders with dynamically-created data and
HTTP 1.1 with persistent-connection TCP, the interaction between
application-limited periods (when the sender sends less than is
allowed by the congestion or receiver windows) and network-limited
periods (when the sender is limited by the TCP window) becomes
increasingly important. More precisely, we define a network-limited
period as any period when the sender is sending a full window of
data.
Long periods when the sender is application-limited can lead to the
invalidation of the congestion window. During periods when the TCP
sender is network-limited, the value of the congestion window is
repeatedly "revalidated" by the successful transmission of a window
of data without loss. When the TCP sender is network-limited, there
is an incoming stream of acknowledgements that "clocks out" new data,
giving concrete evidence of recent available bandwidth in the
network. In contrast, during periods when the TCP sender is
application-limited, the estimate of available capacity represented
by the congestion window may become steadily less accurate over time.
In particular, capacity that had once been used by the network-
limited connection might now be used by other traffic.
Current TCP implementations have a range of behaviors for starting up
after an idle period. Some current TCP implementations slow-start
after an idle period longer than the RTO estimate, as suggested in
[RFC2581] and in the appendix of [J88], while other implementations
don't reduce their congestion window after an idle period. RFC 2581
[RFC2581] recommends the following: "a TCP SHOULD set cwnd to no more
than RW [the initial window] before beginning transmission if the TCP
has not sent data in an interval exceeding the retransmission
timeout." A proposal for TCP's slow-start after idle has also been
discussed in [HTH98]. The issue of validation of congestion
information during idle periods has also been addressed in contexts
other than TCP and IP, for example in "Use-it or Lose-it" mechanisms
for ATM networks [JKBFL96,JKGFL95].
Handley, et al. Experimental [Page 2]
RFC 2861 TCP Congestion Window Validation June 2000
To address the revalidation of the congestion window after an
application-limited period, we propose a simple modification to TCP's
congestion control algorithms to decay the congestion window cwnd
after the transition from a sufficiently-long application-limited
period (i.e., at least one roundtrip time) to a network-limited
period. In particular, we propose that after an idle period, the TCP
sender should reduce its congestion window by half for every RTT that
the flow has remained idle.
When the congestion window is reduced, the slow-start threshold
ssthresh remains as "memory" of the recent congestion window.
Specifically, ssthresh is never decreased when cwnd is reduced after
an application-limited period; before cwnd is reduced, ssthresh is
set to the maximum of its current value, and half-way between the old
and the new values of cwnd. This use of ssthresh allows a TCP sender
increasing its sending rate after an application-limited period to
quickly slow-start to recover most of the previous value of the
congestion window. To be more precise, if ssthresh is less than 3/4
cwnd when the congestion window is reduced after an application-
limited period, then ssthresh is increased to 3/4 cwnd before the
reduction of the congestion window.
An invalid congestion window also results when the congestion window
is increased (i.e., in TCP's slow-start or congestion avoidance
phases) during application-limited periods, when the previous value
of the congestion window might never have been fully utilized. As
far as we know, all current TCP implementations increase the
congestion window when an acknowledgement arrives, if allowed by the
receiver's advertised window and the slow-start or congestion
avoidance window increase algorithm, without checking to see if the
previous value of the congestion window has in fact been used. This
document proposes that the window increase algorithm not be invoked
during application-limited periods [MSML99]. In particular, the TCP
sender should not increase the congestion window when the TCP sender
has been application-limited (and therefore has not fully used the
current congestion window). This restriction prevents the congestion
window from growing arbitrarily large, in the absence of evidence
that the congestion window can be supported by the network. From
[MSML99, Section 5.2]: "This restriction assures that [cwnd] only
grows as long as TCP actually succeeds in injecting enough data into
the network to test the path."
A somewhat-orthogonal problem associated with maintaining a large
congestion window after an application-limited period is that the
sender, with a sudden large amount of data to send after a quiescent
period, might immediately send a full congestion window of back-to-
back packets. This problem of sending large bursts of packets back-
to-back can be effectively handled using rate-based pacing (RBP,
Handley, et al. Experimental [Page 3]
RFC 2861 TCP Congestion Window Validation June 2000
[VH97]), or using a maximum burst size control [FF96]. We would
contend that, even with mechanisms for limiting the sending of back-
to-back packets or pacing packets out over the period of a roundtrip
time, an old congestion window that has not been fully used for some
time can not be trusted as an indication of the bandwidth currently
available for that flow. We would contend that the mechanisms to
pace out packets allowed by the congestion window are largely
orthogonal to the algorithms used to determine the appropriate size
of the congestion window.
3. Description
When a TCP sender has sufficient data available to fill the available
network capacity for that flow, cwnd and ssthresh get set to
appropriate values for the network conditions. When a TCP sender
stops sending, the flow stops sampling the network conditions, and so
the value of the congestion window may become inaccurate. We believe
the correct conservative behavior under these circumstances is to
decay the congestion window by half for every RTT that the flow
remains inactive. The value of half is a very conservative figure
based on how quickly multiplicative decrease would have decayed the
window in the presence of loss.
Another possibility is that the sender may not stop sending, but may
become application-limited rather than network-limited, and offer
less data to the network than the congestion window allows to be
sent. In this case the TCP flow is still sampling network
conditions, but is not offering sufficient traffic to be sure that
there is still sufficient capacity in the network for that flow to
send a full congestion window. Under these circumstances we believe
the correct conservative behavior is for the sender to keep track of
the maximum amount of the congestion window used during each RTT, and
to decay the congestion window each RTT to midway between the current
cwnd value and the maximum value used.
Before the congestion window is reduced, ssthresh is set to the
maximum of its current value and 3/4 cwnd. If the sender then has
more data to send than the decayed cwnd allows, the TCP will slow-
start (perform exponential increase) at least half-way back up to the
old value of cwnd.
The justification for this value of "3/4 cwnd" is that 3/4 cwnd is a
conservative estimate of the recent average value of the congestion
window, and the TCP should safely be able to slow-start at least up
to this point. For a TCP in steady-state that has been reducing its
congestion window each time the congestion window reached some
maximum value `maxwin', the average congestion window has been 3/4
maxwin. On average, when the connection becomes application-limited,
Handley, et al. Experimental [Page 4]
RFC 2861 TCP Congestion Window Validation June 2000
cwnd will be 3/4 maxwin, and in this case cwnd itself represents the
average value of the congestion window. However, if the connection
happens to become application-limited when cwnd equals maxwin, then
the average value of the congestion window is given by 3/4 cwnd.
An alternate possibility would be to set ssthresh to the maximum of
the current value of ssthresh, and the old value of cwnd, allowing
TCP to slow-start all of the way back up to the old value of cwnd.
Further experimentation can be used to evaluate these two options for
setting ssthresh.
For the separate issue of the increase of the congestion window in
response to an acknowledgement, we believe the correct behavior is
for the sender to increase the congestion window only if the window
was full when the acknowledgment arrived.
We term this set of modifications to TCP Congestion Window Validation
(CWV) because they are related to ensuring the congestion window is
always a valid reflection of the current network state as probed by
the connection.
3.1. The basic algorithm for reducing the congestion window
A key issue in the CWV algorithm is to determine how to apply the
guideline of reducing the congestion window once for every roundtrip
time that the flow is application-limited. We use TCP's
retransmission timer (RTO) as a reasonable upper bound on the
roundtrip time, and reduce the congestion window roughly once per
RTO.
This basic algorithm could be implemented in TCP as follows: When TCP
sends a new packet it checks to see if more than RTO seconds have
elapsed since the previous packet was sent. If RTO has elapsed,
ssthresh is set to the maximum of 3/4 cwnd and the current value of
ssthresh, and then the congestion window is halved for every RTO that
elapsed since the previous packet was sent. In addition, T_prev is
set to the current time, and W_used is reset to zero. T_prev will be
used to determine the elapsed time since the sender last was network-
limited or had reduced cwnd after an idle period. When the sender is
application-limited, W_used holds the maximum congestion window
actually used since the sender was last network-limited.
The mechanism for determining the number of RTOs in the most recent
idle period could also be implemented by using a timer that expires
every RTO after the last packet was sent instead of a check per
packet - efficiency constraints on different operating systems may
dictate which is more efficient to implement.
Handley, et al. Experimental [Page 5]
RFC 2861 TCP Congestion Window Validation June 2000
After TCP sends a packet, it also checks to see if that packet filled
the congestion window. If so, the sender is network-limited, and
sets the variable T_prev to the current TCP clock time, and the
variable W_used to zero.
When TCP sends a packet that does not fill the congestion window, and
the TCP send queue is empty, then the sender is application-limited.
The sender checks to see if the amount of unacknowledged data is
greater than W_used; if so, W_used is set to the amount of
unacknowledged data. In addition TCP checks to see if the elapsed
time since T_prev is greater than RTO. If so, then the TCP has not
just reduced its congestion window following an idle period. The TCP
has been application-limited rather than network-limited for at least
an entire RTO interval, but for less than two RTO intervals. In this
case, TCP sets ssthresh to the maximum of 3/4 cwnd and the current
value of ssthresh, and reduces its congestion window to
(cwnd+W_used)/2. W_used is then set to zero, and T_prev is set to
the current time, so a further reduction will not take place until at
least another RTO period has elapsed. Thus, during an application-
limited period the CWV algorithm reduces the congestion window once
per RTO.
3.2. Pseudo-code for reducing the congestion window
Initially:
    T_last = tcpnow, T_prev = tcpnow, W_used = 0
After sending a data segment:
    If tcpnow - T_last >= RTO
        (The sender has been idle.)
        ssthresh = max(ssthresh, 3*cwnd/4)
        For i=1 To (tcpnow - T_last)/RTO
            win = min(cwnd, receiver's declared max window)
            cwnd = max(win/2, MSS)
        T_prev = tcpnow
        W_used = 0
    T_last = tcpnow
    If window is full
        T_prev = tcpnow
        W_used = 0
    Else
        If no more data is available to send
            W_used = max(W_used, amount of unacknowledged data)
            If tcpnow - T_prev >= RTO
                (The sender has been application-limited.)
                ssthresh = max(ssthresh, 3*cwnd/4)
Handley, et al. Experimental [Page 6]
RFC 2861 TCP Congestion Window Validation June 2000
                win = min(cwnd, receiver's declared max window)
                cwnd = (win + W_used)/2
                T_prev = tcpnow
                W_used = 0
4. Simulations
The CWV proposal has been implemented as an option in the network
simulator NS [NS]. The simulations in the validation test suite for
CWV can be run with the command "./test-all-tcp" in the directory
"tcl/test". The simulations show the use of CWV to reduce the
congestion window after a period when the TCP connection was
application-limited, and to limit the increase in the congestion
window when a transfer is application-limited. As the simulations
illustrate, the use of ssthresh to maintain connection history is a
critical part of the Congestion Window Validation algorithm. [HPF99]
discusses these simulations in more detail.
5. Experiments
We have implemented the CWV mechanism in the TCP implementation in
FreeBSD 3.2. [HPF99] discusses these experiments in more detail.
The first experiment examines the effects of the Congestion Window
Validation mechanisms for limiting cwnd increases during
application-limited periods. The experiment used a real ssh
connection through a modem link emulated using Dummynet [Dummynet].
The link speed is 30Kb/s and the link has five packet buffers
available. Today most modem banks have more buffering available than
this, but the more buffer-limited situation sometimes occurs with
older modems. In the first half of the transfer, the user is typing
away over the connection. About half way through the time, the user
lists a moderately large file, which causes a large burst of traffic
to be transmitted.
For the unmodified TCP, every returning ACK during the first part of
the transfer results in an increase in cwnd. As a result, the large
burst of data arriving from the application to the transport layer is
sent as many back-to-back packets, most of which get lost and
subsequently retransmitted.
For the modified TCP with Congestion Window Validation, the
congestion window is not increased when the window is not full, and
has been decreased during application-limited periods closer to what
the user actually used. The burst of traffic is now constrained by
the congestion window, resulting in a better-behaved flow with
Handley, et al. Experimental [Page 7]
RFC 2861 TCP Congestion Window Validation June 2000
minimal loss. The end result is that the transfer happens
approximately 30% faster than the transfer without CWV, due to
avoiding retransmission timeouts.
The second experiment uses a real ssh connection over a real dialup
ppp connection, where the modem bank has much more buffering. For
the unmodified TCP, the initial burst from the large file does not
cause loss, but does cause the RTT to increase to approximately 5
seconds, where the connection becomes bounded by the receiver's
window.
For the modified TCP with Congestion Window Validation, the flow is
much better behaved, and produces no large burst of traffic. In this
case the linear increase for cwnd results in a slow increase in the
RTT as the buffer slowly fills.
For the second experiment, both the modified and the unmodified TCP
finish delivering the data at precisely the same time. This is
because the link has been fully utilized in both cases due to the
modem buffer being larger than the receiver window. Clearly a modem
buffer of this size is undesirable due to its effect on the RTT of
competing flows, but it is necessary with current TCP implementations
that produce bursts similar to those shown in the top graph.
6. Conclusions
This document has presented several TCP algorithms for Congestion
Window Validation, to be employed after an idle period or a period in
which the sender was application-limited, and before an increase of
the congestion window. The goal of these algorithms is for TCP's
congestion window to reflect recent knowledge of the TCP connection
about the state of the network path, while at the same time keeping
some memory (i.e., in ssthresh) about the earlier state of the path.
We believe that these modifications will be of benefit to both the
network and to the TCP flows themselves, by preventing unnecessary
packet drops due to the TCP sender's failure to update its
information (or lack of information) about current network
conditions. Future work will document and investigate the benefit
provided by these algorithms, using both simulations and experiments.
Additional future work will describe a more complex version of the
CWV algorithm for TCP implementations where the sender does not have
an accurate estimate of the TCP roundtrip time.
Handley, et al. Experimental [Page 8]
RFC 2861 TCP Congestion Window Validation June 2000
7. References
[FF96] Fall, K., and Floyd, S., Simulation-based Comparisons of
Tahoe, Reno, and SACK TCP, Computer Communication Review,
V. 26 N. 3, July 1996, pp. 5-21. URL
"http://www.aciri.org/floyd/papers.html".
[HPF99] Mark Handley, Jitendra Padhye, Sally Floyd, TCP Congestion
Window Validation, UMass CMPSCI Technical Report 99-77,
September 1999. URL "ftp://www-
net.cs.umass.edu/pub/Handley99-tcpq-tr-99-77.ps.gz".
[HTH98] Amy Hughes, Joe Touch, John Heidemann, "Issues in TCP
Slow-Start Restart After Idle", Work in Progress.
[J88] Jacobson, V., Congestion Avoidance and Control, Originally
from Proceedings of SIGCOMM '88 (Palo Alto, CA, Aug.
1988), and revised in 1992. URL "http://www-
nrg.ee.lbl.gov/nrg-papers.html".
[JKBFL96] Raj Jain, Shiv Kalyanaraman, Rohit Goyal, Sonia Fahmy, and
Fang Lu, Comments on "Use-it or Lose-it", ATM Forum
Document Number: ATM Forum/96-0178, URL
"http://www.netlab.ohio-
state.edu/~jain/atmf/af_rl5b2.htm".
[JKGFL95] R. Jain, S. Kalyanaraman, R. Goyal, S. Fahmy, and F. Lu, A
Fix for Source End System Rule 5, AF-TM 95-1660, December
1995, URL "http://www.netlab.ohio-
state.edu/~jain/atmf/af_rl52.htm".
[MSML99] Matt Mathis, Jeff Semke, Jamshid Mahdavi, and Kevin Lahey,
The Rate-Halving Algorithm for TCP Congestion Control,
June 1999. URL
"http://www.psc.edu/networking/ftp/papers/draft-
ratehalving.txt".
[NS] NS, the UCB/LBNL/VINT Network Simulator. URL
"http://www-mash.cs.berkeley.edu/ns/".
[RFC2581] Allman, M., Paxson, V. and W. Stevens, TCP Congestion
Control, RFC 2581, April 1999.
[VH97] Vikram Visweswaraiah and John Heidemann. Improving Restart
of Idle TCP Connections, Technical Report 97-661,
University of Southern California, November, 1997.
Handley, et al. Experimental [Page 9]
RFC 2861 TCP Congestion Window Validation June 2000
[Dummynet] Luigi Rizzo, "Dummynet and Forward Error Correction",
Freenix 98, June 1998, New Orleans. URL
"http://info.iet.unipi.it/~luigi/ip_dummynet/".
8. Security Considerations
General security considerations concerning TCP congestion control are
discussed in RFC 2581. This document describes an algorithm for one
aspect of those congestion control procedures, and so the
considerations described in RFC 2581 apply to this algorithm also.
There are no known additional security concerns for this specific
algorithm.
9. Authors' Addresses
Mark Handley
AT&T Center for Internet Research at ICSI (ACIRI)
Phone: +1 510 666 2946
EMail: mjh@aciri.org
URL: http://www.aciri.org/mjh/
Jitendra Padhye
AT&T Center for Internet Research at ICSI (ACIRI)
Phone: +1 510 666 2887
EMail: padhye@aciri.org
URL: http://www-net.cs.umass.edu/~jitu/
Sally Floyd
AT&T Center for Internet Research at ICSI (ACIRI)
Phone: +1 510 666 2989
EMail: floyd@aciri.org
URL: http://www.aciri.org/floyd/
Handley, et al. Experimental [Page 10]
RFC 2861 TCP Congestion Window Validation June 2000
10. Full Copyright Statement
Copyright (C) The Internet Society (2000). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Handley, et al. Experimental [Page 11]

View File

@ -0,0 +1,451 @@
Network Working Group X. Xiao
Request for Comments: 2873 Global Crossing
Category: Standards Track A. Hannan
iVMG
V. Paxson
ACIRI/ICSI
E. Crabbe
Exodus Communications
June 2000
TCP Processing of the IPv4 Precedence Field
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2000). All Rights Reserved.
Abstract
This memo describes a conflict between TCP [RFC793] and DiffServ
[RFC2475] on the use of the three leftmost bits in the TOS octet of
an IPv4 header [RFC791]. In a network that contains DiffServ-capable
nodes, such a conflict can cause failures in establishing TCP
connections or can cause some established TCP connections to be reset
undesirably. This memo proposes a modification to TCP for resolving
the conflict.
Because the IPv6 [RFC2460] traffic class octet does not have any
defined meaning except what is defined in RFC 2474, and in particular
does not define precedence or security parameter bits, there is no
conflict between TCP and DiffServ on the use of any bits in the IPv6
traffic class octet.
1. Introduction
In TCP, each connection has a set of states associated with it. Such
states are reflected by a set of variables stored in the TCP Control
Block (TCB) of both ends. Such variables may include the local and
remote socket number, precedence of the connection, security level
Xiao, et al. Standards Track [Page 1]
RFC 2873 TCP and the IPv4 Precedence Field June 2000
and compartment, etc. Both ends must agree on the setting of the
precedence and security parameters in order to establish a connection
and keep it open.
There is no field in the TCP header that indicates the precedence of
a segment. Instead, the precedence field in the header of the IP
packet is used as the indication. The security level and compartment
are likewise carried in the IP header, but as IP options rather than
a fixed header field. Because of this difference, the problem with
precedence discussed in this memo does not apply to them.
TCP requires that the precedence (and security parameters) of a
connection must remain unchanged during the lifetime of the
connection. Therefore, for an established TCP connection with
precedence, the receipt of a segment with different precedence
indicates an error. The connection must be reset [RFC793, pp. 36, 37,
40, 66, 67, 71].
With the advent of DiffServ, intermediate nodes may modify the
Differentiated Services Codepoint (DSCP) [RFC2474] of the IP header
to indicate the desired Per-hop Behavior (PHB) [RFC2475, RFC2597,
RFC2598]. The DSCP includes the three bits formerly known as the
precedence field. Because any modification to those three bits will
be considered illegal by endpoints that are precedence-aware, they
may cause failures in establishing connections, or may cause
established connections to be reset.
2. Terminology
Segment: the unit of data that TCP sends to IP
Precedence Field: the three leftmost bits in the TOS octet of an IPv4
header. Note that in DiffServ, these three bits may or may not be
used to denote the precedence of the IP packet. There is no
precedence field in the traffic class octet in IPv6.
TOS Field: bits 3-6 in the TOS octet of IPv4 header [RFC 1349].
MBZ field: Must Be Zero
The structure of the TOS octet is depicted below:
   0     1     2     3     4     5     6     7
+-----+-----+-----+-----+-----+-----+-----+-----+
|   PRECEDENCE    |       TOS             | MBZ |
+-----+-----+-----+-----+-----+-----+-----+-----+
Xiao, et al. Standards Track [Page 2]
RFC 2873 TCP and the IPv4 Precedence Field June 2000
DS Field: the TOS octet of an IPv4 header is renamed the
Differentiated Services (DS) Field by DiffServ.
The structure of the DS field is depicted below:
  0   1   2   3   4   5   6   7
+---+---+---+---+---+---+---+---+
|         DSCP          |  CU   |
+---+---+---+---+---+---+---+---+
DSCP: Differentiated Service Code Point, the leftmost 6 bits in the
DS field.
CU: currently unused.
Per-hop Behavior (PHB): a description of the externally observable
forwarding treatment applied at a differentiated services-compliant
node to a behavior aggregate.
3. Problem Description
The manipulation of the DSCP to achieve the desired PHB by DiffServ-
capable nodes may conflict with TCP's use of the precedence field.
This conflict can potentially cause problems for TCP implementations
that conform to RFC 793. First, page 36 of RFC 793 states:
If the connection is in any non-synchronized state (LISTEN, SYN-
SENT, SYN-RECEIVED), and the incoming segment acknowledges
something not yet sent (the segment carries an unacceptable ACK),
or if an incoming segment has a security level or compartment
which does not exactly match the level and compartment requested
for the connection, a reset is sent. If our SYN has not been
acknowledged and the precedence level of the incoming segment is
higher than the precedence level requested then either raise the
local precedence level (if allowed by the user and the system) or
send a reset; or if the precedence level of the incoming segment
is lower than the precedence level requested then continue as if
the precedence matched exactly (if the remote TCP cannot raise
the precedence level to match ours this will be detected in the
next segment it sends, and the connection will be terminated
then). If our SYN has been acknowledged (perhaps in this incoming
segment) the precedence level of the incoming segment must match
the local precedence level exactly, if it does not a reset must
be sent.
This leads to Problem #1: For a precedence-aware TCP module, if
during TCP's synchronization process, the precedence fields of the
SYN and/or ACK packets are modified by the intermediate nodes,
Xiao, et al. Standards Track [Page 3]
RFC 2873 TCP and the IPv4 Precedence Field June 2000
resulting in the received ACK packet having a different precedence
from the precedence picked by this TCP module, the TCP connection
cannot be established, even if both modules actually agree on an
identical precedence for the connection.
Then, on page 37, RFC 793 states:
If the connection is in a synchronized state (ESTABLISHED, FIN-
WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT),
[...] If an incoming segment has a security level, or compartment,
or precedence which does not exactly match the level, and
compartment, and precedence requested for the connection, a reset
is sent and connection goes to the CLOSED state.
This leads to Problem #2: For a precedence-aware TCP module, if the
precedence field of a received segment from an established TCP
connection has been changed en route by the intermediate nodes so as
to be different from the precedence specified during the connection
setup, the TCP connection will be reset.
Each of problems #1 and #2 has a mirroring problem. They cause TCP
connections that must be reset according to RFC 793 not to be reset.
Problem #3: A TCP connection may be established between two TCP
modules that pick different precedence, because the precedence fields
of the SYN and ACK packets are modified by intermediate nodes,
resulting in both modules thinking that they are in agreement for the
precedence of the connection.
Problem #4: A TCP connection has been established normally by two
TCP modules that pick the same precedence. But in the middle of the
data transmission, one of the TCP modules changes the precedence of
its segments. According to RFC 793, the TCP connection must be reset.
In a DiffServ-capable environment, if the precedence of the segments
is altered by intermediate nodes such that it retains the expected
value when arriving at the other TCP module, the connection will not
be reset.
4. Proposed Modification to TCP
The proposed modification to TCP is that TCP must ignore the
precedence of all received segments. More specifically:
(1) In TCP's synchronization process, the TCP modules at both ends
must ignore the precedence fields of the SYN and SYN ACK packets. The
TCP connection will be established if all the conditions specified by
RFC 793 are satisfied except the precedence of the connection.
Xiao, et al. Standards Track [Page 4]
RFC 2873 TCP and the IPv4 Precedence Field June 2000
(2) After a connection is established, each end sends segments with
its desired precedence. The precedence picked by one end of the TCP
connection may be the same or may be different from the precedence
picked by the other end (because precedence is ignored during
connection setup time). The precedence fields may be changed by the
intermediate nodes too. In either case, the precedence of the
received packets will be ignored by the other end. The TCP connection
will not be reset in either case.
Problems #1 and #2 are solved by this proposed modification. Problems
#3 and #4 become non-issues because TCP must ignore the precedence.
In a DiffServ-capable environment, the two cases described in
problems #3 and #4 should be allowed.
5. Security Considerations
A TCP implementation that terminates a connection upon receipt of any
segment with an incorrect precedence field, regardless of the
correctness of the sequence numbers in the segment's header, poses a
serious denial-of-service threat, as all an attacker must do to
terminate a connection is guess the port numbers and then send two
segments with different precedence values; one of them is certain to
terminate the connection. Accordingly, the change to TCP processing
proposed in this memo would yield a significant gain in terms of that
TCP implementation's resilience.
On the other hand, the stricter processing rules of RFC 793 in
principle make TCP spoofing attacks more difficult, as the attacker
must not only guess the victim TCP's initial sequence number, but
also its precedence setting.
Finally, the security issues of each PHB group are addressed in the
PHB group's specification [RFC2597, RFC2598].
6. Acknowledgments
Our thanks to Al Smith for his careful review and comments.
Xiao, et al. Standards Track [Page 5]
RFC 2873 TCP and the IPv4 Precedence Field June 2000
7. References
[RFC791] Postel, J., "Internet Protocol", STD 5, RFC 791, September
1981.
[RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC
793, September 1981.
[RFC1349] Almquist, P., "Type of Service in the Internet Protocol
Suite", RFC 1349, July 1992.
[RFC2460] Deering, S. and R. Hinden, "Internet Protocol, Version 6
(IPv6) Specification", RFC 2460, December 1998.
[RFC2474] Nichols, K., Blake, S., Baker, F. and D. Black, "Definition
of the Differentiated Services Field (DS Field) in the IPv4
and IPv6 Headers", RFC 2474, December 1998.
[RFC2475] Blake, S., Black, D., Carlson, M., Davies, E., Wang, Z. and
W. Weiss, "An Architecture for Differentiated Services",
RFC 2475, December 1998.
[RFC2597] Heinanen, J., Baker, F., Weiss, W. and J. Wroclawski,
"Assured Forwarding PHB Group", RFC 2597, June 1999.
[RFC2598] Jacobson, V., Nichols, K. and K. Poduri, "An Expedited
Forwarding PHB", RFC 2598, June 1999.
Xiao, et al. Standards Track [Page 6]
RFC 2873 TCP and the IPv4 Precedence Field June 2000
8. Authors' Addresses
Xipeng Xiao
Global Crossing
141 Caspian Court
Sunnyvale, CA 94089
USA
Phone: +1 408-543-4801
EMail: xipeng@gblx.net
Alan Hannan
iVMG, Inc.
112 Falkirk Court
Sunnyvale, CA 94087
USA
Phone: +1 408-749-7084
EMail: alan@ivmg.net
Edward Crabbe
Exodus Communications
2650 San Tomas Expressway
Santa Clara, CA 95051
USA
Phone: +1 408-346-1544
EMail: edc@explosive.net
Vern Paxson
ACIRI/ICSI
1947 Center Street
Suite 600
Berkeley, CA 94704-1198
USA
Phone: +1 510-666-2882
EMail: vern@aciri.org
Xiao, et al. Standards Track [Page 7]
RFC 2873 TCP and the IPv4 Precedence Field June 2000
9. Full Copyright Statement
Copyright (C) The Internet Society (2000). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Xiao, et al. Standards Track [Page 8]

View File

@ -0,0 +1,955 @@
Network Working Group S. Floyd
Request for Comments: 2883 ACIRI
Category: Standards Track J. Mahdavi
Novell
M. Mathis
Pittsburgh Supercomputing Center
M. Podolsky
UC Berkeley
July 2000
An Extension to the Selective Acknowledgement (SACK) Option for TCP
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2000). All Rights Reserved.
Abstract
This note defines an extension of the Selective Acknowledgement
(SACK) Option [RFC2018] for TCP. RFC 2018 specified the use of the
SACK option for acknowledging out-of-sequence data not covered by
TCP's cumulative acknowledgement field. This note extends RFC 2018
by specifying the use of the SACK option for acknowledging duplicate
packets. This note suggests that when duplicate packets are
received, the first block of the SACK option field can be used to
report the sequence numbers of the packet that triggered the
acknowledgement. This extension to the SACK option allows the TCP
sender to infer the order of packets received at the receiver,
allowing the sender to infer when it has unnecessarily retransmitted
a packet. A TCP sender could then use this information for more
robust operation in an environment of reordered packets [BPS99], ACK
loss, packet replication, and/or early retransmit timeouts.
1. Conventions and Acronyms
The keywords MUST, MUST NOT, REQUIRED, SHALL, SHALL NOT, SHOULD,
SHOULD NOT, RECOMMENDED, MAY, and OPTIONAL, when they appear in this
document, are to be interpreted as described in [B97].
Floyd, et al. Standards Track [Page 1]
RFC 2883 SACK Extension July 2000
2. Introduction
The Selective Acknowledgement (SACK) option defined in RFC 2018 is
used by the TCP data receiver to acknowledge non-contiguous blocks of
data not covered by the Cumulative Acknowledgement field. However,
RFC 2018 does not specify the use of the SACK option when duplicate
segments are received. This note specifies the use of the SACK
option when acknowledging the receipt of a duplicate packet [F99].
We use the term D-SACK (for duplicate-SACK) to refer to a SACK block
that reports a duplicate segment.
This document does not make any changes to TCP's use of the
cumulative acknowledgement field, or to the TCP receiver's decision
of *when* to send an acknowledgement packet. This document only
concerns the contents of the SACK option when an acknowledgement is
sent.
This extension is compatible with current implementations of the SACK
option in TCP. That is, if one of the TCP end-nodes does not
implement this D-SACK extension and the other TCP end-node does, we
believe that this use of the D-SACK extension by one of the end nodes
will not introduce problems.
The use of D-SACK does not require separate negotiation between a TCP
sender and receiver that have already negotiated SACK capability.
The absence of separate negotiation for D-SACK means that the TCP
receiver could send D-SACK blocks when the TCP sender does not
understand this extension to SACK. In this case, the TCP sender will
simply discard any D-SACK blocks, and process the other SACK blocks
in the SACK option field as it normally would.
Floyd, et al. Standards Track [Page 2]
RFC 2883 SACK Extension July 2000
3. The Sack Option Format as defined in RFC 2018
The SACK option as defined in RFC 2018 is as follows:
                  +--------+--------+
                  | Kind=5 | Length |
+--------+--------+--------+--------+
|      Left Edge of 1st Block       |
+--------+--------+--------+--------+
|      Right Edge of 1st Block      |
+--------+--------+--------+--------+
|                                   |
/            . . .                  /
|                                   |
+--------+--------+--------+--------+
|      Left Edge of nth Block       |
+--------+--------+--------+--------+
|      Right Edge of nth Block      |
+--------+--------+--------+--------+
The Selective Acknowledgement (SACK) option in the TCP header
contains a number of SACK blocks, where each block specifies the left
and right edge of a block of data received at the TCP receiver. In
particular, a block represents a contiguous sequence space of data
received and queued at the receiver, where the "left edge" of the
block is the first sequence number of the block, and the "right edge"
is the sequence number immediately following the last sequence number
of the block.
RFC 2018 implies that the first SACK block specifies the segment that
triggered the acknowledgement. From RFC 2018, when the data receiver
chooses to send a SACK option, "the first SACK block ... MUST specify
the contiguous block of data containing the segment which triggered
this ACK, unless that segment advanced the Acknowledgment Number
field in the header."
However, RFC 2018 does not address the use of the SACK option when
acknowledging a duplicate segment. For example, RFC 2018 specifies
that "each block represents received bytes of data that are
contiguous and isolated". RFC 2018 further specifies that "if sent
at all, SACK options SHOULD be included in all ACKs which do not ACK
the highest sequence number in the data receiver's queue." RFC 2018
does not specify the use of the SACK option when a duplicate segment
is received, and the cumulative acknowledgement field in the ACK
acknowledges all of the data in the data receiver's queue.
Floyd, et al. Standards Track [Page 3]
RFC 2883 SACK Extension July 2000
4. Use of the SACK option for reporting a duplicate segment
This section specifies the use of SACK blocks when the SACK option is
used in reporting a duplicate segment. When D-SACK is used, the
first block of the SACK option should be a D-SACK block specifying
the sequence numbers for the duplicate segment that triggers the
acknowledgement. If the duplicate segment is part of a larger block
of non-contiguous data in the receiver's data queue, then the
following SACK block should be used to specify this larger block.
Additional SACK blocks can be used to specify additional non-
contiguous blocks of data, as specified in RFC 2018.
The guidelines for reporting duplicate segments are summarized below:
(1) A D-SACK block is only used to report a duplicate contiguous
sequence of data received by the receiver in the most recent packet.
(2) Each duplicate contiguous sequence of data received is reported
in at most one D-SACK block. (I.e., the receiver sends two identical
D-SACK blocks in subsequent packets only if the receiver receives two
duplicate segments.)
(3) The left edge of the D-SACK block specifies the first sequence
number of the duplicate contiguous sequence, and the right edge of
the D-SACK block specifies the sequence number immediately following
the last sequence in the duplicate contiguous sequence.
(4) If the D-SACK block reports a duplicate contiguous sequence from
a (possibly larger) block of data in the receiver's data queue above
the cumulative acknowledgement, then the second SACK block in that
SACK option should specify that (possibly larger) block of data.
(5) Following the SACK blocks described above for reporting duplicate
segments, additional SACK blocks can be used for reporting additional
blocks of data, as specified in RFC 2018.
Note that because each duplicate segment is reported in only one ACK
packet, information about that duplicate segment will be lost if that
ACK packet is dropped in the network.
4.1 Reporting Full Duplicate Segments
We illustrate these guidelines with three examples. In each example,
we assume that the data receiver has first received eight segments of
500 bytes each, and has sent an acknowledgement with the cumulative
acknowledgement field set to 4000 (assuming the first sequence number
is zero). The D-SACK block is underlined in each example.
Floyd, et al. Standards Track [Page 4]
RFC 2883 SACK Extension July 2000
4.1.1. Example 1: Reporting a duplicate segment.
Because several ACK packets are lost, the data sender retransmits
packet 3000-3499, and the data receiver subsequently receives a
duplicate segment with sequence numbers 3000-3499. The receiver
sends an acknowledgement with the cumulative acknowledgement field
set to 4000, and the first, D-SACK block specifying sequence numbers
3000-3500.
Transmitted    Received    ACK Sent
Segment        Segment     (Including SACK Blocks)
3000-3499      3000-3499   3500 (ACK dropped)
3500-3999      3500-3999   4000 (ACK dropped)
3000-3499      3000-3499   4000, SACK=3000-3500
                                      ---------
4.1.2. Example 2: Reporting an out-of-order segment and a duplicate
segment.
Following a lost data packet, the receiver receives an out-of-order
data segment, which triggers the SACK option as specified in RFC
2018. Because of several lost ACK packets, the sender then
retransmits a data packet. The receiver receives the duplicate
packet, and reports it in the first, D-SACK block:
Transmitted    Received    ACK Sent
Segment        Segment     (Including SACK Blocks)
3000-3499      3000-3499   3500 (ACK dropped)
3500-3999      3500-3999   4000 (ACK dropped)
4000-4499      (data packet dropped)
4500-4999      4500-4999   4000, SACK=4500-5000 (ACK dropped)
3000-3499      3000-3499   4000, SACK=3000-3500, 4500-5000
                                      ---------
Floyd, et al. Standards Track [Page 5]
RFC 2883 SACK Extension July 2000
4.1.3. Example 3: Reporting a duplicate of an out-of-order segment.
Because of a lost data packet, the receiver receives two out-of-order
segments. The receiver next receives a duplicate segment for one of
these out-of-order segments:
Transmitted    Received    ACK Sent
Segment        Segment     (Including SACK Blocks)
3500-3999      3500-3999   4000
4000-4499      (data packet dropped)
4500-4999      4500-4999   4000, SACK=4500-5000
5000-5499      5000-5499   4000, SACK=4500-5500
               (duplicated packet)
               5000-5499   4000, SACK=5000-5500, 4500-5500
                                      ---------
4.2. Reporting Partial Duplicate Segments
It may be possible that a sender transmits a packet that includes one
or more duplicate sub-segments--that is, only part but not all of the
transmitted packet has already arrived at the receiver. This can
occur when the size of the sender's transmitted segments increases,
which can occur when the PMTU increases in the middle of a TCP
session, for example. The guidelines in Section 4 above apply to
reporting partial as well as full duplicate segments. This section
gives examples of these guidelines when reporting partial duplicate
segments.
When the SACK option is used for reporting partial duplicate
segments, the first D-SACK block reports the first duplicate sub-
segment. If the data packet being acknowledged contains multiple
partial duplicate sub-segments, then only the first such duplicate
sub-segment is reported in the SACK option. We illustrate this with
the examples below.
4.2.1. Example 4: Reporting a single duplicate subsegment.
The sender increases the packet size from 500 bytes to 1000 bytes.
The receiver subsequently receives a 1000-byte packet containing one
500-byte subsegment that has already been received and one which has
not. The receiver reports only the already received subsegment using
a single D-SACK block.
Floyd, et al. Standards Track [Page 6]
RFC 2883 SACK Extension July 2000
Transmitted    Received    ACK Sent
Segment        Segment     (Including SACK Blocks)
500-999        500-999     1000
1000-1499      (delayed)
1500-1999      (data packet dropped)
2000-2499      2000-2499   1000, SACK=2000-2500
1000-2000      1000-1499   1500, SACK=2000-2500
               1000-2000   2500, SACK=1000-1500
                                      ---------
4.2.2. Example 5: Two non-contiguous duplicate subsegments covered by
the cumulative acknowledgement.
After the sender increases its packet size from 500 bytes to 1500
bytes, the receiver receives a packet containing two non-contiguous
duplicate 500-byte subsegments which are less than the cumulative
acknowledgement field. The receiver reports the first such duplicate
segment in a single D-SACK block.
Transmitted    Received    ACK Sent
Segment        Segment     (Including SACK Blocks)
500-999        500-999     1000
1000-1499      (delayed)
1500-1999      (data packet dropped)
2000-2499      (delayed)
2500-2999      (data packet dropped)
3000-3499      3000-3499   1000, SACK=3000-3500
1000-2499      1000-1499   1500, SACK=3000-3500
               2000-2499   1500, SACK=2000-2500, 3000-3500
               1000-2499   2500, SACK=1000-1500, 3000-3500
                                      ---------
4.2.3. Example 6: Two non-contiguous duplicate subsegments not covered
by the cumulative acknowledgement.
This example is similar to Example 5, except that after the sender
increases the packet size, the receiver receives a packet containing
two non-contiguous duplicate subsegments which are above the
cumulative acknowledgement field, rather than below. The first, D-
SACK block reports the first duplicate subsegment, and the second,
SACK block reports the larger block of non-contiguous data that it
belongs to.
Floyd, et al. Standards Track [Page 7]
RFC 2883 SACK Extension July 2000
Transmitted    Received    ACK Sent
Segment        Segment     (Including SACK Blocks)
500-999        500-999     1000
1000-1499      (data packet dropped)
1500-1999      (delayed)
2000-2499      (data packet dropped)
2500-2999      (delayed)
3000-3499      (data packet dropped)
3500-3999      3500-3999   1000, SACK=3500-4000
1000-1499      (data packet dropped)
1500-2999      1500-1999   1000, SACK=1500-2000, 3500-4000
               2000-2499   1000, SACK=2000-2500, 1500-2000,
                                      3500-4000
               1500-2999   1000, SACK=1500-2000, 1500-3000,
                                      ---------
                                      3500-4000
4.3. Interaction Between D-SACK and PAWS
RFC 1323 [RFC1323] specifies an algorithm for Protection Against
Wrapped Sequence Numbers (PAWS). PAWS gives a method for
distinguishing between sequence numbers for new data, and sequence
numbers from a previous cycle through the sequence number space.
Duplicate segments might be detected by PAWS as belonging to a
previous cycle through the sequence number space.
RFC 1323 specifies that for such packets, the receiver should do the
following:
Send an acknowledgement in reply as specified in RFC 793 page 69,
and drop the segment.
Since PAWS still requires sending an ACK, there is no harmful
interaction between PAWS and the use of D-SACK. The D-SACK block can
be included in the SACK option of the ACK, as outlined in Section 4,
independently of the use of PAWS by the TCP receiver, and
independently of the determination by PAWS of the validity or
invalidity of the data segment.
TCP senders receiving D-SACK blocks should be aware that a segment
reported as a duplicate segment could possibly have been from a prior
cycle through the sequence number space. This is independent of the
use of PAWS by the TCP data receiver. We do not anticipate that this
will present significant problems for senders using D-SACK
information.
Floyd, et al. Standards Track [Page 8]
RFC 2883 SACK Extension July 2000
5. Detection of Duplicate Packets
This extension to the SACK option enables the receiver to accurately
report the reception of duplicate data. Because each receipt of a
duplicate packet is reported in only one ACK packet, the loss of a
single ACK can prevent this information from reaching the sender. In
addition, we note that the sender can not necessarily trust the
receiver to send it accurate information [SCWA99].
In order for the sender to check that the first (D)SACK block of an
acknowledgement in fact acknowledges duplicate data, the sender
should compare the sequence space in the first SACK block to the
cumulative ACK which is carried IN THE SAME PACKET. If the SACK
sequence space is less than this cumulative ACK, it is an indication
that the segment identified by the SACK block has been received more
than once by the receiver. An implementation MUST NOT compare the
sequence space in the SACK block to the TCP state variable snd.una
(which carries the total cumulative ACK), as this may result in the
wrong conclusion if ACK packets are reordered.
If the sequence space in the first SACK block is greater than the
cumulative ACK, then the sender next compares the sequence space in
the first SACK block with the sequence space in the second SACK
block, if there is one. This comparison can determine if the first
SACK block is reporting duplicate data that lies above the cumulative
ACK.
TCP implementations which follow RFC 2581 [RFC2581] could see
duplicate packets in each of the following four situations. This
document does not specify what action a TCP implementation should
take in these cases. The extension to the SACK option simply enables
the sender to detect each of these cases. Note that these four
conditions are not an exhaustive list of possible cases for duplicate
packets, but are representative of the most common/likely cases.
Subsequent documents will describe experimental proposals for sender
responses to the detection of unnecessary retransmits due to
reordering, lost ACKS, or early retransmit timeouts.
Floyd, et al. Standards Track [Page 9]
RFC 2883 SACK Extension July 2000
5.1. Replication by the network
If a packet is replicated in the network, this extension to the SACK
option can identify this. For example:
Transmitted Received ACK Sent
Segment Segment (Including SACK Blocks)
500-999 500-999 1000
1000-1499 1000-1499 1500
(replicated)
1000-1499 1500, SACK=1000-1500
---------
In this case, the second packet was replicated in the network. An
ACK containing a D-SACK block which is lower than its ACK field and
is not identical to a previously retransmitted segment is indicative
of a replication by the network.
WITHOUT D-SACK:
If D-SACK was not used and the last ACK was piggybacked on a data
packet, the sender would not know that a packet had been replicated
in the network. If D-SACK was not used and neither of the last two
ACKs was piggybacked on a data packet, then the sender could
reasonably infer that either some data packet *or* the final ACK
packet had been replicated in the network. The receipt of the D-SACK
packet gives the sender positive knowledge that this data packet was
replicated in the network (assuming that the receiver is not lying).
RESEARCH ISSUES:
The current SACK option already allows the sender to identify
duplicate ACKs that do not acknowledge new data, but the D-SACK
option gives the sender a stronger basis for inferring that a
duplicate ACK does not acknowledge new data. The knowledge that a
duplicate ACK does not acknowledge new data allows the sender to
refrain from using that duplicate ACK to infer packet loss (e.g.,
Fast Retransmit) or to send more data (e.g., Fast Recovery).
5.2. False retransmit due to reordering
If packets are reordered in the network such that a segment arrives
more than 3 packets out of order, TCP's Fast Retransmit algorithm
will retransmit the out-of-order packet. An example of this is shown
below:
Floyd, et al. Standards Track [Page 10]
RFC 2883 SACK Extension July 2000
Transmitted Received ACK Sent
Segment Segment (Including SACK Blocks)
500-999 500-999 1000
1000-1499 (delayed)
1500-1999 1500-1999 1000, SACK=1500-2000
2000-2499 2000-2499 1000, SACK=1500-2500
2500-2999 2500-2999 1000, SACK=1500-3000
1000-1499 1000-1499 3000
1000-1499 3000, SACK=1000-1500
---------
In this case, an ACK containing a SACK block which is lower than its
ACK field and identical to a previously retransmitted segment is
indicative of a significant reordering followed by a false
(unnecessary) retransmission.
WITHOUT D-SACK:
With the use of D-SACK illustrated above, the sender knows that
either the first transmission of segment 1000-1499 was delayed in the
network, or the first transmission of segment 1000-1499 was dropped
and the second transmission of segment 1000-1499 was duplicated.
Given that no other segments have been duplicated in the network,
this second option can be considered unlikely.
Without the use of D-SACK, the sender would only know that either the
first transmission of segment 1000-1499 was delayed in the network,
or that either one of the data segments or the final ACK was
duplicated in the network. Thus, the use of D-SACK allows the sender
to more reliably infer that the first transmission of segment
1000-1499 was not dropped.
[AP99], [L99], and [LK00] note that the sender could unambiguously
detect an unnecessary retransmit with the use of the timestamp
option. [LK00] proposes a timestamp-based algorithm that minimizes
the penalty for an unnecessary retransmit. [AP99] proposes a
heuristic for detecting an unnecessary retransmit in an environment
with neither timestamps nor SACK. [L99] also proposes a two-bit
field as an alternate to the timestamp option for unambiguously
marking the first three retransmissions of a packet. A similar idea
was proposed in [ISO8073].
RESEARCH ISSUES:
The use of D-SACK allows the sender to detect some cases (e.g., when
no ACK packets have been lost) when a Fast Retransmit was due to
packet reordering instead of packet loss. This allows the TCP sender
Floyd, et al. Standards Track [Page 11]
RFC 2883 SACK Extension July 2000
to adjust the duplicate acknowledgment threshold, to prevent such
unnecessary Fast Retransmits in the future. Coupled with this, when
the sender determines, after the fact, that it has made an
unnecessary window reduction, the sender has the option of "undoing"
that reduction in the congestion window by resetting ssthresh to the
value of the old congestion window, and slow-starting until the
congestion window has reached that point.
Any proposal for "undoing" a reduction in the congestion window would
have to address the possibility that the TCP receiver could be lying
in its reports of received packets [SCWA99].
5.3. Retransmit Timeout Due to ACK Loss
If an entire window of ACKs is lost, a timeout will result. An
example of this is given below:
Transmitted Received ACK Sent
Segment Segment (Including SACK Blocks)
500-999 500-999 1000 (ACK dropped)
1000-1499 1000-1499 1500 (ACK dropped)
1500-1999 1500-1999 2000 (ACK dropped)
2000-2499 2000-2499 2500 (ACK dropped)
(timeout)
500-999 500-999 2500, SACK=500-1000
--------
In this case, all of the ACKs are dropped, resulting in a timeout.
This condition can be identified because the first ACK received
following the timeout carries a D-SACK block indicating duplicate
data was received.
WITHOUT D-SACK:
Without the use of D-SACK, the sender in this case would be unable to
decide that no data packets have been dropped.
RESEARCH ISSUES:
For a TCP that implements some form of ACK congestion control
[BPK97], this ability to distinguish between dropped data packets and
dropped ACK packets would be particularly useful. In this case, the
connection could implement congestion control for the return (ACK)
path independently from the congestion control on the forward (data)
path.
Floyd, et al. Standards Track [Page 12]
RFC 2883 SACK Extension July 2000
5.4. Early Retransmit Timeout
If the sender's RTO is too short, an early retransmission timeout can
occur when no packets have in fact been dropped in the network. An
example of this is given below:
Transmitted Received ACK Sent
Segment Segment (Including SACK Blocks)
500-999 (delayed)
1000-1499 (delayed)
1500-1999 (delayed)
2000-2499 (delayed)
(timeout)
500-999 (delayed)
500-999 1000
1000-1499 (delayed)
1000-1499 1500
...
1500-1999 2000
2000-2499 2500
500-999 2500, SACK=500-1000
--------
1000-1499 2500, SACK=1000-1500
---------
...
In this case, the first packet is retransmitted following the
timeout. Subsequently, the original window of packets arrives at the
receiver, resulting in ACKs for these segments. Following this, the
retransmissions of these segments arrive, resulting in ACKs carrying
SACK blocks which identify the duplicate segments.
This can be identified as an early retransmission timeout because the
ACK for byte 1000 is received after the timeout with no SACK
information, followed by an ACK which carries SACK information (500-
999) indicating that the retransmitted segment had already been
received.
WITHOUT D-SACK:
If D-SACK was not used and one of the duplicate ACKs was piggybacked
on a data packet, the sender would not know how many duplicate
packets had been received. If D-SACK was not used and none of the
duplicate ACKs were piggybacked on a data packet, then the sender
would have sent N duplicate packets, for some N, and received N
duplicate ACKs. In this case, the sender could reasonably infer that
Floyd, et al. Standards Track [Page 13]
RFC 2883 SACK Extension July 2000
some data or ACK packet had been replicated in the network, or that
an early retransmission timeout had occurred (or that the receiver is
lying).
RESEARCH ISSUES:
After the sender determines that an unnecessary (i.e., early)
retransmit timeout has occurred, the sender could adjust parameters
for setting the RTO, to prevent more unnecessary retransmit timeouts.
Coupled with this, when the sender determines, after the fact, that
it has made an unnecessary window reduction, the sender has the
option of "undoing" that reduction in the congestion window.
6. Security Considerations
This document neither strengthens nor weakens TCP's current security
properties.
7. Acknowledgements
We would like to thank Mark Handley, Reiner Ludwig, and Venkat
Padmanabhan for conversations on these issues, and to thank Mark
Allman for helpful feedback on this document.
8. References
[AP99] Mark Allman and Vern Paxson, On Estimating End-to-End
Network Path Properties, SIGCOMM 99, August 1999. URL
"http://www.acm.org/sigcomm/sigcomm99/papers/session7-
3.html".
[BPS99] J.C.R. Bennett, C. Partridge, and N. Shectman, Packet
Reordering is Not Pathological Network Behavior, IEEE/ACM
Transactions on Networking, Vol. 7, No. 6, December 1999,
pp. 789-798.
[BPK97] Hari Balakrishnan, Venkata Padmanabhan, and Randy H. Katz,
The Effects of Asymmetry on TCP Performance, Third ACM/IEEE
Mobicom Conference, Budapest, Hungary, Sep 1997. URL
"http://www.cs.berkeley.edu/~padmanab/
index.html#Publications".
[F99] Floyd, S., Re: TCP and out-of-order delivery, Message ID
<199902030027.QAA06775@owl.ee.lbl.gov> to the end-to-end-
interest mailing list, February 1999. URL
"http://www.aciri.org/floyd/notes/TCP_Feb99.email".
Floyd, et al. Standards Track [Page 14]
RFC 2883 SACK Extension July 2000
[ISO8073] ISO/IEC, Information-processing systems - Open Systems
Interconnection - Connection Oriented Transport Protocol
Specification, International Standard ISO/IEC 8073, December
1988.
[L99] Reiner Ludwig, A Case for Flow Adaptive Wireless links,
Technical Report UCB//CSD-99-1053, May 1999. URL
"http://iceberg.cs.berkeley.edu/papers/Ludwig-
FlowAdaptive/".
[LK00] Reiner Ludwig and Randy H. Katz, The Eifel Algorithm:
Making TCP Robust Against Spurious Retransmissions, SIGCOMM
Computer Communication Review, V. 30, N. 1, January 2000.
URL "http://www.acm.org/sigcomm/ccr/archive/ccr-toc/ccr-
toc-2000.html".
[RFC1323] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions for
High Performance", RFC 1323, May 1992.
[RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP
Selective Acknowledgement Options", RFC 2018, April 1996.
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[SCWA99] Stefan Savage, Neal Cardwell, David Wetherall, Tom
Anderson, TCP Congestion Control with a Misbehaving
Receiver, ACM Computer Communications Review, pp. 71-78, V.
29, N. 5, October, 1999. URL
"http://www.acm.org/sigcomm/ccr/archive/ccr-toc/ccr-toc-
99.html".
Floyd, et al. Standards Track [Page 15]
RFC 2883 SACK Extension July 2000
Authors' Addresses
Sally Floyd
AT&T Center for Internet Research at ICSI (ACIRI)
Phone: +1 510-666-6989
EMail: floyd@aciri.org
URL: http://www.aciri.org/floyd/
Jamshid Mahdavi
Novell
Phone: 1-408-967-3806
EMail: mahdavi@novell.com
Matt Mathis
Pittsburgh Supercomputing Center
Phone: 412 268-3319
EMail: mathis@psc.edu
URL: http://www.psc.edu/~mathis/
Matthew Podolsky
UC Berkeley Electrical Engineering & Computer Science Dept.
Phone: 510-649-8914
EMail: podolsky@eecs.berkeley.edu
URL: http://www.eecs.berkeley.edu/~podolsky
Floyd, et al. Standards Track [Page 16]
RFC 2883 SACK Extension July 2000
Full Copyright Statement
Copyright (C) The Internet Society (2000). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Floyd, et al. Standards Track [Page 17]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,955 @@
Network Working Group S. Floyd
Request for Comments: 2914 ACIRI
BCP: 41 September 2000
Category: Best Current Practice
Congestion Control Principles
Status of this Memo
This document specifies an Internet Best Current Practices for the
Internet Community, and requests discussion and suggestions for
improvements. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2000). All Rights Reserved.
Abstract
The goal of this document is to explain the need for congestion
control in the Internet, and to discuss what constitutes correct
congestion control. One specific goal is to illustrate the dangers
of neglecting to apply proper congestion control. A second goal is
to discuss the role of the IETF in standardizing new congestion
control protocols.
1. Introduction
This document draws heavily from earlier RFCs, in some cases
reproducing entire sections of the text of earlier documents
[RFC2309, RFC2357]. We have also borrowed heavily from earlier
publications addressing the need for end-to-end congestion control
[FF99].
2. Current standards on congestion control
IETF standards concerning end-to-end congestion control focus either
on specific protocols (e.g., TCP [RFC2581], reliable multicast
protocols [RFC2357]) or on the syntax and semantics of communications
between the end nodes and routers about congestion information (e.g.,
Explicit Congestion Notification [RFC2481]) or desired quality-of-
service (diff-serv). The role of end-to-end congestion control is
also discussed in an Informational RFC on "Recommendations on Queue
Management and Congestion Avoidance in the Internet" [RFC2309]. RFC
2309 recommends the deployment of active queue management mechanisms
in routers, and the continuation of design efforts towards mechanisms
Floyd, ed. Best Current Practice [Page 1]
RFC 2914 Congestion Control Principles September 2000
in routers to deal with flows that are unresponsive to congestion
notification. We freely borrow from RFC 2309 some of their general
discussion of end-to-end congestion control.
In contrast to the RFCs discussed above, this document is a more
general discussion of the principles of congestion control. One of
the keys to the success of the Internet has been the congestion
avoidance mechanisms of TCP. While TCP is still the dominant
transport protocol in the Internet, it is not ubiquitous, and there
are an increasing number of applications that, for one reason or
another, choose not to use TCP. Such traffic includes not only
multicast traffic, but unicast traffic such as streaming multimedia
that does not require reliability; and traffic such as DNS or routing
messages that consist of short transfers deemed critical to the
operation of the network. Much of this traffic does not use any form
of either bandwidth reservations or end-to-end congestion control.
The continued use of end-to-end congestion control by best-effort
traffic is critical for maintaining the stability of the Internet.
This document also discusses the general role of the IETF in the
standardization of new congestion control protocols.
The discussion of congestion control principles for differentiated
services or integrated services is not addressed in this document.
Some categories of integrated or differentiated services include a
guarantee by the network of end-to-end bandwidth, and as such do not
require end-to-end congestion control mechanisms.
3. The development of end-to-end congestion control.
3.1. Preventing congestion collapse.
The Internet protocol architecture is based on a connectionless end-
to-end packet service using the IP protocol. The advantages of its
connectionless design, flexibility and robustness, have been amply
demonstrated. However, these advantages are not without cost:
careful design is required to provide good service under heavy load.
In fact, lack of attention to the dynamics of packet forwarding can
result in severe service degradation or "Internet meltdown". This
phenomenon was first observed during the early growth phase of the
Internet of the mid 1980s [RFC896], and is technically called
"congestion collapse".
The original specification of TCP [RFC793] included window-based flow
control as a means for the receiver to govern the amount of data sent
by the sender. This flow control was used to prevent overflow of the
receiver's data buffer space available for that connection. [RFC793]
Floyd, ed. Best Current Practice [Page 2]
RFC 2914 Congestion Control Principles September 2000
reported that segments could be lost due either to errors or to
network congestion, but did not include dynamic adjustment of the
flow-control window in response to congestion.
The original fix for Internet meltdown was provided by Van Jacobson.
Beginning in 1986, Jacobson developed the congestion avoidance
mechanisms that are now required in TCP implementations [Jacobson88,
RFC 2581]. These mechanisms operate in the hosts to cause TCP
connections to "back off" during congestion. We say that TCP flows
are "responsive" to congestion signals (i.e., dropped packets) from
the network. It is these TCP congestion avoidance algorithms that
prevent the congestion collapse of today's Internet.
However, that is not the end of the story. Considerable research has
been done on Internet dynamics since 1988, and the Internet has
grown. It has become clear that the TCP congestion avoidance
mechanisms [RFC2581], while necessary and powerful, are not
sufficient to provide good service in all circumstances. In addition
to the development of new congestion control mechanisms [RFC2357],
router-based mechanisms are in development that complement the
endpoint congestion avoidance mechanisms.
A major issue that still needs to be addressed is the potential for
future congestion collapse of the Internet due to flows that do not
use responsible end-to-end congestion control. RFC 896 [RFC896]
suggested in 1984 that gateways should detect and `squelch'
misbehaving hosts: "Failure to respond to an ICMP Source Quench
message, though, should be regarded as grounds for action by a
gateway to disconnect a host. Detecting such failure is non-trivial
but is a worthwhile area for further research." Current papers
still propose that routers detect and penalize flows that are not
employing acceptable end-to-end congestion control [FF99].
3.2. Fairness
In addition to a concern about congestion collapse, there is a
concern about `fairness' for best-effort traffic. Because TCP "backs
off" during congestion, a large number of TCP connections can share a
single, congested link in such a way that bandwidth is shared
reasonably equitably among similarly situated flows. The equitable
sharing of bandwidth among flows depends on the fact that all flows
are running compatible congestion control algorithms. For TCP, this
means congestion control algorithms conformant with the current TCP
specification [RFC793, RFC1122, RFC2581].
The issue of fairness among competing flows has become increasingly
important for several reasons. First, using window scaling
[RFC1323], individual TCPs can use high bandwidth even over high-
Floyd, ed. Best Current Practice [Page 3]
RFC 2914 Congestion Control Principles September 2000
propagation-delay paths. Second, with the growth of the web,
Internet users increasingly want high-bandwidth and low-delay
communications, rather than the leisurely transfer of a long file in
the background. The growth of best-effort traffic that does not use
TCP underscores this concern about fairness between competing best-
effort traffic in times of congestion.
The popularity of the Internet has caused a proliferation in the
number of TCP implementations. Some of these may fail to implement
the TCP congestion avoidance mechanisms correctly because of poor
implementation [RFC2525]. Others may deliberately be implemented
with congestion avoidance algorithms that are more aggressive in
their use of bandwidth than other TCP implementations; this would
allow a vendor to claim to have a "faster TCP". The logical
consequence of such implementations would be a spiral of increasingly
aggressive TCP implementations, or increasingly aggressive transport
protocols, leading back to the point where there is effectively no
congestion avoidance and the Internet is chronically congested.
There is a well-known way to achieve more aggressive performance
without even changing the transport protocol, by changing the level
of granularity: open multiple connections to the same place, as has
been done in the past by some Web browsers. Thus, instead of a
spiral of increasingly aggressive transport protocols, we would
instead have a spiral of increasingly aggressive web browsers, or
increasingly aggressive applications.
This raises the issue of the appropriate granularity of a "flow",
where we define a `flow' as the level of granularity appropriate for
the application of both fairness and congestion control. From RFC
2309: "There are a few `natural' answers: 1) a TCP or UDP connection
(source address/port, destination address/port); 2) a
source/destination host pair; 3) a given source host or a given
destination host. We would guess that the source/destination host
pair gives the most appropriate granularity in many circumstances.
The granularity of flows for congestion management is, at least in
part, a policy question that needs to be addressed in the wider IETF
community."
Again borrowing from RFC 2309, we use the term "TCP-compatible" for a
flow that behaves under congestion like a flow produced by a
conformant TCP. A TCP-compatible flow is responsive to congestion
notification, and in steady-state uses no more bandwidth than a
conformant TCP running under comparable conditions (drop rate, RTT,
MTU, etc.)
Floyd, ed. Best Current Practice [Page 4]
RFC 2914 Congestion Control Principles September 2000
It is convenient to divide flows into three classes: (1) TCP-
compatible flows, (2) unresponsive flows, i.e., flows that do not
slow down when congestion occurs, and (3) flows that are responsive
but are not TCP-compatible. The last two classes contain more
aggressive flows that pose significant threats to Internet
performance, as we discuss below.
In addition to steady-state fairness, the fairness of the initial
slow-start is also a concern. One concern is the transient effect on
other flows of a flow with an overly-aggressive slow-start procedure.
Slow-start performance is particularly important for the many flows
that are short-lived, and only have a small amount of data to
transfer.
3.3. Optimizing performance regarding throughput, delay, and loss.
In addition to the prevention of congestion collapse and concerns
about fairness, a third reason for a flow to use end-to-end
congestion control can be to optimize its own performance regarding
throughput, delay, and loss. In some circumstances, for example in
environments of high statistical multiplexing, the delay and loss
rate experienced by a flow are largely independent of its own sending
rate. However, in environments with lower levels of statistical
multiplexing or with per-flow scheduling, the delay and loss rate
experienced by a flow is in part a function of the flow's own sending
rate. Thus, a flow can use end-to-end congestion control to limit
the delay or loss experienced by its own packets. We would note,
however, that in an environment like the current best-effort
Internet, concerns regarding congestion collapse and fairness with
competing flows limit the range of congestion control behaviors
available to a flow.
4. The role of the standards process
The standardization of a transport protocol includes not only
standardization of aspects of the protocol that could affect
interoperability (e.g., information exchanged by the end-nodes), but
also standardization of mechanisms deemed critical to performance
(e.g., in TCP, reduction of the congestion window in response to a
packet drop). At the same time, implementation-specific details and
other aspects of the transport protocol that do not affect
interoperability and do not significantly interfere with performance
do not require standardization. Areas of TCP that do not require
standardization include the details of TCP's Fast Recovery procedure
after a Fast Retransmit [RFC2582]. The appendix uses examples from
TCP to discuss in more detail the role of the standards process in
the development of congestion control.
Floyd, ed. Best Current Practice [Page 5]
RFC 2914 Congestion Control Principles September 2000
4.1. The development of new transport protocols.
In addition to addressing the danger of congestion collapse, the
standardization process for new transport protocols takes care to
avoid a congestion control `arms race' among competing protocols. As
an example, in RFC 2357 [RFC2357] the TSV Area Directors and their
Directorate outline criteria for the publication as RFCs of
Internet-Drafts on reliable multicast transport protocols. From
[RFC2357]: "A particular concern for the IETF is the impact of
reliable multicast traffic on other traffic in the Internet in times
of congestion, in particular the effect of reliable multicast traffic
on competing TCP traffic.... The challenge to the IETF is to
encourage research and implementations of reliable multicast, and to
enable the needs of applications for reliable multicast to be met as
expeditiously as possible, while at the same time protecting the
Internet from the congestion disaster or collapse that could result
from the widespread use of applications with inappropriate reliable
multicast mechanisms."
The list of technical criteria that must be addressed by RFCs on new
reliable multicast transport protocols include the following: "Is
there a congestion control mechanism? How well does it perform? When
does it fail? Note that congestion control mechanisms that operate
on the network more aggressively than TCP will face a great burden of
proof that they don't threaten network stability."
It is reasonable to expect that these concerns about the effect of
new transport protocols on competing traffic will apply not only to
reliable multicast protocols, but to unreliable unicast, reliable
unicast, and unreliable multicast traffic as well.
4.2. Application-level issues that affect congestion control
The specific issue of a browser opening multiple connections to the
same destination has been addressed by RFC 2616 [RFC2616], which
states in Section 8.1.4 that "Clients that use persistent connections
SHOULD limit the number of simultaneous connections that they
maintain to a given server. A single-user client SHOULD NOT maintain
more than 2 connections with any server or proxy."
4.3. New developments in the standards process
The most obvious developments in the IETF that could affect the
evolution of congestion control are the development of integrated and
differentiated services [RFC2212, RFC2475] and of Explicit Congestion
Notification (ECN) [RFC2481]. However, other less dramatic
developments are likely to affect congestion control as well.
Floyd, ed. Best Current Practice [Page 6]
RFC 2914 Congestion Control Principles September 2000
One such effort is that to construct Endpoint Congestion Management
[BS00], to enable multiple concurrent flows from a sender to the same
receiver to share congestion control state. By allowing multiple
connections to the same destination to act as one flow in terms of
end-to-end congestion control, a Congestion Manager could allow
individual connections slow-starting to take advantage of previous
information about the congestion state of the end-to-end path.
Further, the use of a Congestion Manager could remove the congestion
control dangers of multiple flows being opened between the same
source/destination pair, and could perhaps be used to allow a browser
to open many simultaneous connections to the same destination.
5. A description of congestion collapse
This section discusses congestion collapse from undelivered packets
in some detail, and shows how unresponsive flows could contribute to
congestion collapse in the Internet. This section draws heavily on
material from [FF99].
Informally, congestion collapse occurs when an increase in the
network load results in a decrease in the useful work done by the
network. As discussed in Section 3, congestion collapse was first
reported in the mid 1980s [RFC896], and was largely due to TCP
connections unnecessarily retransmitting packets that were either in
transit or had already been received at the receiver. We call the
congestion collapse that results from the unnecessary retransmission
of packets classical congestion collapse. Classical congestion
collapse is a stable condition that can result in throughput that is
a small fraction of normal [RFC896]. Problems with classical
congestion collapse have generally been corrected by the timer
improvements and congestion control mechanisms in modern
implementations of TCP [Jacobson88].
A second form of potential congestion collapse occurs due to
undelivered packets. Congestion collapse from undelivered packets
arises when bandwidth is wasted by delivering packets through the
network that are dropped before reaching their ultimate destination.
This is probably the largest unresolved danger with respect to
congestion collapse in the Internet today. Different scenarios can
result in different degrees of congestion collapse, in terms of the
fraction of the congested links' bandwidth used for productive work.
The danger of congestion collapse from undelivered packets is due
primarily to the increasing deployment of open-loop applications not
using end-to-end congestion control. Even more destructive would be
best-effort applications that *increase* their sending rate in
response to an increased packet drop rate (e.g., automatically using
an increased level of FEC).
Floyd, ed. Best Current Practice [Page 7]
RFC 2914 Congestion Control Principles September 2000
Table 1 gives the results from a scenario with congestion collapse
from undelivered packets, where scarce bandwidth is wasted by packets
that never reach their destination. The simulation uses a scenario
with three TCP flows and one UDP flow competing over a congested 1.5
Mbps link. The access links for all nodes are 10 Mbps, except that
the access link to the receiver of the UDP flow is 128 Kbps, only 9%
of the bandwidth of the shared link. When the UDP source rate exceeds
128 Kbps, most of the UDP packets will be dropped at the output port
to that final link.
UDP
Arrival UDP TCP Total
Rate Goodput Goodput Goodput
--------------------------------------
0.7 0.7 98.5 99.2
1.8 1.7 97.3 99.1
2.6 2.6 96.0 98.6
5.3 5.2 92.7 97.9
8.8 8.4 87.1 95.5
10.5 8.4 84.8 93.2
13.1 8.4 81.4 89.8
17.5 8.4 77.3 85.7
26.3 8.4 64.5 72.8
52.6 8.4 38.1 46.4
58.4 8.4 32.8 41.2
65.7 8.4 28.5 36.8
75.1 8.4 19.7 28.1
87.6 8.4 11.3 19.7
105.2 8.4 3.4 11.8
131.5 8.4 2.4 10.7
Table 1. A simulation with three TCP flows and one UDP flow.
Table 1 shows the UDP arrival rate from the sender, the UDP goodput
(defined as the bandwidth delivered to the receiver), the TCP goodput
(as delivered to the TCP receivers), and the aggregate goodput on the
congested 1.5 Mbps link. Each rate is given as a fraction of the
bandwidth of the congested link. As the UDP source rate increases,
the TCP goodput decreases roughly linearly, and the UDP goodput is
nearly constant. Thus, as the UDP flow increases its offered load,
its only effect is to hurt the TCP and aggregate goodput. On the
congested link, the UDP flow ultimately `wastes' the bandwidth that
could have been used by the TCP flow, and reduces the goodput in the
network as a whole down to a small fraction of the bandwidth of the
congested link.
Floyd, ed. Best Current Practice [Page 8]
RFC 2914 Congestion Control Principles September 2000
The simulations in Table 1 illustrate both unfairness and congestion
collapse. As [FF99] discusses, compatible congestion control is not
the only way to provide fairness; per-flow scheduling at the
congested routers is an alternative mechanism at the routers that
guarantees fairness. However, as discussed in [FF99], per-flow
scheduling can not be relied upon to prevent congestion collapse.
There are only two alternatives for eliminating the danger of
congestion collapse from undelivered packets. The first alternative
for preventing congestion collapse from undelivered packets is the
use of effective end-to-end congestion control by the end nodes.
More specifically, the requirement would be that a flow avoid a
pattern of significant losses at links downstream from the first
congested link on the path. (Here, we would consider any link a
`congested link' if any flow is using bandwidth that would otherwise
be used by other traffic on the link.) Given that an end-node is
generally unable to distinguish between a path with one congested
link and a path with multiple congested links, the most reliable way
for a flow to avoid a pattern of significant losses at a downstream
congested link is for the flow to use end-to-end congestion control,
and reduce its sending rate in the presence of loss.
A second alternative for preventing congestion collapse from
undelivered packets would be a guarantee by the network that packets
accepted at a congested link in the network will be delivered all the
way to the receiver [RFC2212, RFC2475]. We note that the choice
between the first alternative of end-to-end congestion control and
the second alternative of end-to-end bandwidth guarantees does not
have to be an either/or decision; congestion collapse can be
prevented by the use of effective end-to-end congestion control by some of
the traffic, and the use of end-to-end bandwidth guarantees from the
network for the rest of the traffic.
6. Forms of end-to-end congestion control
This document has discussed concerns about congestion collapse and
about fairness with TCP for new forms of congestion control. This
does not mean, however, that concerns about congestion collapse and
fairness with TCP necessitate that all best-effort traffic deploy
congestion control based on TCP's Additive-Increase Multiplicative-
Decrease (AIMD) algorithm of reducing the sending rate in half in
response to each packet drop. This section separately discusses the
implications of these two concerns of congestion collapse and
fairness with TCP.
Floyd, ed. Best Current Practice [Page 9]
RFC 2914 Congestion Control Principles September 2000
6.1. End-to-end congestion control for avoiding congestion collapse.
The avoidance of congestion collapse from undelivered packets
requires that flows avoid a scenario of a high sending rate, multiple
congested links, and a persistent high packet drop rate at the
downstream link. Because congestion collapse from undelivered
packets consists of packets that waste valuable bandwidth only to be
dropped downstream, this form of congestion collapse is not possible
in an environment where each flow traverses only one congested link,
or where only a small number of packets are dropped at links
downstream of the first congested link. Thus, any form of congestion
control that successfully avoids a high sending rate in the presence
of a high packet drop rate should be sufficient to avoid congestion
collapse from undelivered packets.
We would note that the addition of Explicit Congestion Notification
(ECN) to the IP architecture would not, in and of itself, remove the
danger of congestion collapse for best-effort traffic. ECN allows
routers to set a bit in packet headers as an indication of congestion
to the end-nodes, rather than being forced to rely on packet drops to
indicate congestion. However, with ECN, packet-marking would replace
packet-dropping only in times of moderate congestion. In particular,
when congestion is heavy, and a router's buffers overflow, the router
has no choice but to drop arriving packets.
6.2. End-to-end congestion control for fairness with TCP.
The concern expressed in [RFC2357] about fairness with TCP places a
significant though not crippling constraint on the range of viable
end-to-end congestion control mechanisms for best-effort traffic. An
environment with per-flow scheduling at all congested links would
isolate flows from each other, and eliminate the need for congestion
control mechanisms to be TCP-compatible. An environment with
differentiated services, where flows marked as belonging to a certain
diff-serv class would be scheduled in isolation from best-effort
traffic, could allow the emergence of an entire diff-serv class of
traffic where congestion control was not required to be TCP-
compatible. Similarly, a pricing-controlled environment, or a diff-
serv class with its own pricing paradigm, could supersede the concern
about fairness with TCP. However, for the current Internet
environment, where other best-effort traffic could compete in a FIFO
queue with TCP traffic, the absence of fairness with TCP could lead
to one flow `starving out' another flow in a time of high congestion,
as was illustrated in Table 1 above.
However, the list of TCP-compatible congestion control procedures is
not limited to AIMD with the same increase/decrease parameters as
TCP. Other TCP-compatible congestion control procedures include
Floyd, ed. Best Current Practice [Page 10]
RFC 2914 Congestion Control Principles September 2000
rate-based variants of AIMD; AIMD with different sets of
increase/decrease parameters that give the same steady-state
behavior; equation-based congestion control where the sender adjusts
its sending rate in response to information about the long-term
packet drop rate; layered multicast where receivers subscribe and
unsubscribe from layered multicast groups; and possibly other forms
that we have not yet begun to consider.
7. Acknowledgements
Much of this document draws directly on previous RFCs addressing
end-to-end congestion control. This attempts to be a summary of
ideas that have been discussed for many years, and by many people.
In particular, acknowledgement is due to the members of the End-to-
End Research Group, the Reliable Multicast Research Group, and the
Transport Area Directorate. This document has also benefited from
discussion and feedback from the Transport Area Working Group.
Particular thanks are due to Mark Allman for feedback on an earlier
version of this document.
8. References
[BS00] Balakrishnan H. and S. Seshan, "The Congestion Manager",
Work in Progress.
[DMKM00] Dawkins, S., Montenegro, G., Kojo, M. and V. Magret,
"End-to-end Performance Implications of Slow Links",
Work in Progress.
[FF99] Floyd, S. and K. Fall, "Promoting the Use of End-to-End
Congestion Control in the Internet", IEEE/ACM
Transactions on Networking, August 1999. URL
http://www.aciri.org/floyd/end2end-paper.html
[HPF00] Handley, M., Padhye, J. and S. Floyd, "TCP Congestion
Window Validation", RFC 2861, June 2000.
[Jacobson88] V. Jacobson, Congestion Avoidance and Control, ACM
SIGCOMM '88, August 1988.
[RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC
793, September 1981.
[RFC896] Nagle, J., "Congestion Control in IP/TCP", RFC 896,
January 1984.
[RFC1122] Braden, R., Ed., "Requirements for Internet Hosts --
Communication Layers", STD 3, RFC 1122, October 1989.
Floyd, ed. Best Current Practice [Page 11]
RFC 2914 Congestion Control Principles September 2000
[RFC1323] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions
for High Performance", RFC 1323, May 1992.
[RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
Requirement Levels", BCP 14, RFC 2119, March 1997.
[RFC2212] Shenker, S., Partridge, C. and R. Guerin, "Specification
of Guaranteed Quality of Service", RFC 2212, September
1997.
[RFC2309] Braden, R., Clark, D., Crowcroft, J., Davie, B.,
Deering, S., Estrin, D., Floyd, S., Jacobson, V.,
Minshall, G., Partridge, C., Peterson, L., Ramakrishnan,
K.K., Shenker, S., Wroclawski, J., and L. Zhang,
"Recommendations on Queue Management and Congestion
Avoidance in the Internet", RFC 2309, April 1998.
[RFC2357] Mankin, A., Romanow, A., Bradner, S. and V. Paxson,
"IETF Criteria for Evaluating Reliable Multicast
Transport and Application Protocols", RFC 2357, June
1998.
[RFC2414] Allman, M., Floyd, S. and C. Partridge, "Increasing
TCP's Initial Window", RFC 2414, September 1998.
[RFC2475] Blake, S., Black, D., Carlson, M., Davies, E., Wang, Z.
and W. Weiss, "An Architecture for Differentiated
Services", RFC 2475, December 1998.
[RFC2481] Ramakrishnan K. and S. Floyd, "A Proposal to add
Explicit Congestion Notification (ECN) to IP", RFC 2481,
January 1999.
[RFC2525] Paxson, V., Allman, M., Dawson, S., Fenner, W., Griner,
J., Heavens, I., Lahey, K., Semke, J. and B. Volz,
"Known TCP Implementation Problems", RFC 2525, March
1999.
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[RFC2582] Floyd, S. and T. Henderson, "The NewReno Modification to
TCP's Fast Recovery Algorithm", RFC 2582, April 1999.
[RFC2616] Fielding, R., Gettys, J., Mogul, J., Frystyk, H.,
Masinter, L., Leach, P. and T. Berners-Lee, "Hypertext
Transfer Protocol -- HTTP/1.1", RFC 2616, June 1999.
Floyd, ed. Best Current Practice [Page 12]
RFC 2914 Congestion Control Principles September 2000
[SCWA99] S. Savage, N. Cardwell, D. Wetherall, and T. Anderson,
TCP Congestion Control with a Misbehaving Receiver, ACM
Computer Communications Review, October 1999.
[TCPB98] Hari Balakrishnan, Venkata N. Padmanabhan, Srinivasan
Seshan, Mark Stemm, and Randy H. Katz, TCP Behavior of a
Busy Internet Server: Analysis and Improvements, IEEE
Infocom, March 1998. Available from:
"http://www.cs.berkeley.edu/~hari/papers/infocom98.ps.gz".
[TCPF98] Dong Lin and H.T. Kung, TCP Fast Recovery Strategies:
Analysis and Improvements, IEEE Infocom, March 1998.
Available from:
"http://www.eecs.harvard.edu/networking/papers/infocom-
tcp-final-198.pdf".
9. TCP-Specific issues
In this section we discuss some of the particulars of TCP congestion
control, to illustrate a realization of the congestion control
principles, including some of the details that arise when
incorporating them into a production transport protocol.
9.1. Slow-start.
The TCP sender can not open a new connection by sending a large burst
of data (e.g., a receiver's advertised window) all at once. The TCP
sender is limited by a small initial value for the congestion window.
During slow-start, the TCP sender can increase its sending rate by at
most a factor of two in one roundtrip time. Slow-start ends when
congestion is detected, or when the sender's congestion window is
greater than the slow-start threshold ssthresh.
An issue that potentially affects global congestion control, and
therefore has been explicitly addressed in the standards process,
includes an increase in the value of the initial window
[RFC2414,RFC2581].
Issues that have not been addressed in the standards process, and are
generally considered not to require standardization, include such
issues as the use (or non-use) of rate-based pacing, and mechanisms
for ending slow-start early, before the congestion window reaches
ssthresh. Such mechanisms result in slow-start behavior that is as
conservative as, or more conservative than, standard TCP.
Floyd, ed. Best Current Practice [Page 13]
RFC 2914 Congestion Control Principles September 2000
9.2. Additive Increase, Multiplicative Decrease.
In the absence of congestion, the TCP sender increases its congestion
window by at most one packet per roundtrip time. In response to a
congestion indication, the TCP sender decreases its congestion window
by half. (More precisely, the new congestion window is half of the
minimum of the congestion window and the receiver's advertised
window.)
An issue that potentially affects global congestion control, and
therefore would be likely to be explicitly addressed in the standards
process, would include a proposed addition of congestion control for
the return stream of `pure acks'.
An issue that has not been addressed in the standards process, and is
generally not considered to require standardization, would be a
change to the congestion window to apply as an upper bound on the
number of bytes presumed to be in the pipe, instead of applying as a
sliding window starting from the cumulative acknowledgement.
(Clearly, the receiver's advertised window applies as a sliding
window starting from the cumulative acknowledgement field, because
packets received above the cumulative acknowledgement field are held
in TCP's receive buffer, and have not been delivered to the
application. However, the congestion window applies to the number of
packets outstanding in the pipe, and does not necessarily have to
include packets that have been received out-of-order by the TCP
receiver.)
9.3. Retransmit timers.
The TCP sender sets a retransmit timer to infer that a packet has
been dropped in the network. When the retransmit timer expires, the
sender infers that a packet has been lost, sets ssthresh to half of
the current window, and goes into slow-start, retransmitting the lost
packet. If the retransmit timer expires because no acknowledgement
has been received for a retransmitted packet, the retransmit timer is
also "backed-off", doubling the value of the next retransmit timeout
interval.
An issue that potentially affects global congestion control, and
therefore would be likely to be explicitly addressed in the standards
process, might include a modified mechanism for setting the
retransmit timer that could significantly increase the number of
retransmit timers that expire prematurely, when the acknowledgement
has not yet arrived at the sender, but in fact no packets have been
dropped. This could be of concern to the Internet standards process
Floyd, ed. Best Current Practice [Page 14]
RFC 2914 Congestion Control Principles September 2000
because retransmit timers that expire prematurely could lead to an
increase in the number of packets unnecessarily transmitted on a
congested link.
9.4. Fast Retransmit and Fast Recovery.
After seeing three duplicate acknowledgements, the TCP sender infers
a packet loss. The TCP sender sets ssthresh to half of the current
window, reduces the congestion window to at most half of the previous
window, and retransmits the lost packet.
An issue that potentially affects global congestion control, and
therefore would be likely to be explicitly addressed in the standards
process, might include a proposal (if there was one) for inferring a
lost packet after only one or two duplicate acknowledgements. If
poorly designed, such a proposal could lead to an increase in the
number of packets unnecessarily transmitted on a congested path.
An issue that has not been addressed in the standards process, and
would not be expected to require standardization, would be a proposal
to send a "new" or presumed-lost packet in response to a duplicate or
partial acknowledgement, if allowed by the congestion window. An
example of this would be sending a new packet in response to a single
duplicate acknowledgement, to keep the `ack clock' going in case no
further acknowledgements would have arrived. Such a proposal is an
example of a beneficial change that does not involve interoperability
and does not affect global congestion control, and that therefore
could be implemented by vendors without requiring the intervention of
the IETF standards process. (This issue has in fact been addressed
in [DMKM00], which suggests that "researchers may wish to experiment
with injecting new traffic into the network when duplicate
acknowledgements are being received, as described in [TCPB98] and
[TCPF98].")
9.5. Other aspects of TCP congestion control.
Other aspects of TCP congestion control that have not been discussed
in any of the sections above include TCP's recovery from an idle or
application-limited period [HPF00].
10. Security Considerations
This document has been about the risks associated with congestion
control, or with the absence of congestion control. Section 3.2
discusses the potentials for unfairness if competing flows don't use
compatible congestion control mechanisms, and Section 5 considers the
dangers of congestion collapse if flows don't use end-to-end
congestion control.
Floyd, ed. Best Current Practice [Page 15]
RFC 2914 Congestion Control Principles September 2000
Because this document does not propose any specific congestion
control mechanisms, it is also not necessary to present specific
security measures associated with congestion control. However, we
would note that there are a range of security considerations
associated with congestion control that should be considered in IETF
documents.
For example, individual congestion control mechanisms should be as
robust as possible to the attempts of individual end-nodes to subvert
end-to-end congestion control [SCWA99]. This is a particular concern
in multicast congestion control, because of the far-reaching
distribution of the traffic and the greater opportunities for
individual receivers to fail to report congestion.
RFC 2309 also discussed the potential dangers to the Internet of
unresponsive flows, that is, flows that don't reduce their sending
rate in the presence of congestion, and describes the need for
mechanisms in the network to deal with flows that are unresponsive to
congestion notification. We would note that there is still a need
for research, engineering, measurement, and deployment in these
areas.
Because the Internet aggregates very large numbers of flows, the risk
to the whole infrastructure of subverting the congestion control of a
few individual flows is limited. Rather, the risk to the
infrastructure would come from the widespread deployment of many
end-nodes subverting end-to-end congestion control.
AUTHOR'S ADDRESS
Sally Floyd
AT&T Center for Internet Research at ICSI (ACIRI)
Phone: +1 (510) 642-4274 x189
EMail: floyd@aciri.org
URL: http://www.aciri.org/floyd/
Floyd, ed. Best Current Practice [Page 16]
RFC 2914 Congestion Control Principles September 2000
Full Copyright Statement
Copyright (C) The Internet Society (2000). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Floyd, ed. Best Current Practice [Page 17]

View File

@ -0,0 +1,843 @@
Network Working Group K. Lahey
Request for Comments: 2923 dotRocket, Inc.
Category: Informational September 2000
TCP Problems with Path MTU Discovery
Status of this Memo
This memo provides information for the Internet community. It does
not specify an Internet standard of any kind. Distribution of this
memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2000). All Rights Reserved.
Abstract
This memo catalogs several known Transmission Control Protocol (TCP)
implementation problems dealing with Path Maximum Transmission Unit
Discovery (PMTUD), including the long-standing black hole problem,
stretch acknowledgements (ACKs) due to confusion between Maximum
Segment Size (MSS) and segment size, and MSS advertisement based on
PMTU.
1. Introduction
This memo catalogs several known TCP implementation problems dealing
with Path MTU Discovery [RFC1191], including the long-standing black
hole problem, stretch ACKs due to confusion between MSS and segment
size, and MSS advertisement based on PMTU. The goal in doing so is
to improve conditions in the existing Internet by enhancing the
quality of current TCP/IP implementations.
While Path MTU Discovery (PMTUD) can be used with any upper-layer
protocol, it is most commonly used by TCP; this document does not
attempt to treat problems encountered by other upper-layer protocols.
Path MTU Discovery for IPv6 [RFC1981] treats only IPv6-dependent
issues, but not the TCP issues brought up in this document.
Each problem is defined as follows:
Name of Problem
The name associated with the problem. In this memo, the name is
given as a subsection heading.
Lahey Informational [Page 1]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
Classification
One or more problem categories for which the problem is
classified: "congestion control", "performance", "reliability",
"non-interoperation -- connectivity failure".
Description
A definition of the problem, succinct but including necessary
background material.
Significance
A brief summary of the sorts of environments for which the problem
is significant.
Implications
Why the problem is viewed as a problem.
Relevant RFCs
The RFCs defining the TCP specification with which the problem
conflicts. These RFCs often qualify behavior using terms such as
MUST, SHOULD, MAY, and others written capitalized. See RFC 2119
for the exact interpretation of these terms.
Trace file demonstrating the problem
One or more ASCII trace files demonstrating the problem, if
applicable.
Trace file demonstrating correct behavior
One or more examples of how correct behavior appears in a trace,
if applicable.
References
References that further discuss the problem.
How to detect
How to test an implementation to see if it exhibits the problem.
This discussion may include difficulties and subtleties associated
with causing the problem to manifest itself, and with interpreting
traces to detect the presence of the problem (if applicable).
How to fix
For known causes of the problem, how to correct the
implementation.
Lahey Informational [Page 2]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
2. Known implementation problems
2.1.
Name of Problem
Black Hole Detection
Classification
Non-interoperation -- connectivity failure
Description
A host performs Path MTU Discovery by sending out as large a
packet as possible, with the Don't Fragment (DF) bit set in the IP
header. If the packet is too large for a router to forward on to
a particular link, the router must send an ICMP Destination
Unreachable -- Fragmentation Needed message to the source address.
The host then adjusts the packet size based on the ICMP message.
As was pointed out in [RFC1435], routers don't always do this
correctly -- many routers fail to send the ICMP messages, for a
variety of reasons ranging from kernel bugs to configuration
problems. Firewalls are often misconfigured to suppress all ICMP
messages. IPsec [RFC2401] and IP-in-IP [RFC2003] tunnels
shouldn't cause these sorts of problems, if the implementations
follow the advice in the appropriate documents.
PMTUD, as documented in [RFC1191], fails when the appropriate ICMP
messages are not received by the originating host. The upper-
layer protocol continues to try to send large packets and, without
the ICMP messages, never discovers that it needs to reduce the
size of those packets. Its packets are disappearing into a PMTUD
black hole.
Significance
When PMTUD fails due to the lack of ICMP messages, TCP will also
completely fail under some conditions.
Implications
This failure is especially difficult to debug, as pings and some
interactive TCP connections to the destination host work. Bulk
transfers fail with the first large packet and the connection
eventually times out.
These situations can almost always be blamed on a misconfiguration
within the network, which should be corrected. However it seems
inappropriate for some TCP implementations to suffer
Lahey Informational [Page 3]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
interoperability failures over paths which do not affect other TCP
implementations (i.e. those without PMTUD). This creates a market
disincentive for deploying TCP implementations with PMTUD enabled.
Relevant RFCs
RFC 1191 describes Path MTU Discovery. RFC 1435 provides an early
description of these sorts of problems.
Trace file demonstrating the problem
Made using tcpdump [Jacobson89] recording at an intermediate host.
20:12:11.951321 A > B: S 1748427200:1748427200(0)
win 49152 <mss 1460>
20:12:11.951829 B > A: S 1001927984:1001927984(0)
ack 1748427201 win 16384 <mss 65240>
20:12:11.955230 A > B: . ack 1 win 49152 (DF)
20:12:11.959099 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:12:13.139074 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:12:16.188685 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:12:22.290483 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:12:34.491856 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:12:58.896405 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:13:47.703184 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:14:52.780640 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:15:57.856037 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:17:02.932431 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:18:08.009337 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:19:13.090521 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:20:18.168066 A > B: . 1:1461(1460) ack 1 win 49152 (DF)
20:21:23.242761 A > B: R 1461:1461(0) ack 1 win 49152 (DF)
The short SYN packet has no trouble traversing the network, due to
its small size. Similarly, ICMP echo packets used to diagnose
connectivity problems will succeed.
Large data packets fail to traverse the network. Eventually the
connection times out. This can be especially confusing when the
application starts out with a very small write, which succeeds,
following up with many large writes, which then fail.
Trace file demonstrating correct behavior
Made using tcpdump recording at an intermediate host.
16:48:42.659115 A > B: S 271394446:271394446(0)
win 8192 <mss 1460> (DF)
16:48:42.672279 B > A: S 2837734676:2837734676(0)
ack 271394447 win 16384 <mss 65240>
Lahey Informational [Page 4]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
16:48:42.676890 A > B: . ack 1 win 8760 (DF)
16:48:42.870574 A > B: . 1:1461(1460) ack 1 win 8760 (DF)
16:48:42.871799 A > B: . 1461:2921(1460) ack 1 win 8760 (DF)
16:48:45.786814 A > B: . 1:1461(1460) ack 1 win 8760 (DF)
16:48:51.794676 A > B: . 1:1461(1460) ack 1 win 8760 (DF)
16:49:03.808912 A > B: . 1:537(536) ack 1 win 8760
16:49:04.016476 B > A: . ack 537 win 16384
16:49:04.021245 A > B: . 537:1073(536) ack 1 win 8760
16:49:04.021697 A > B: . 1073:1609(536) ack 1 win 8760
16:49:04.120694 B > A: . ack 1609 win 16384
16:49:04.126142 A > B: . 1609:2145(536) ack 1 win 8760
In this case, the sender sees four packets fail to traverse the
network (using a two-packet initial send window) and turns off
PMTUD. All subsequent packets have the DF flag turned off, and
the size set to the default value of 536 [RFC1122].
References
This problem has been discussed extensively on the tcp-impl
mailing list; the name "black hole" has been in use for many
years.
How to detect
This shows up as a TCP connection which hangs (fails to make
progress) until closed by timeout (this often manifests itself as
a connection that connects and starts to transfer, then eventually
terminates after 15 minutes with zero bytes transferred). This is
particularly annoying with an application like ftp, which will
work perfectly while it uses small packets for control
information, and then fail on bulk transfers.
A series of ICMP echo packets will show that the two end hosts are
still capable of passing packets, a series of MTU-sized ICMP echo
packets will show some fragmentation, and a series of MTU-sized
ICMP echo packets with DF set will fail. This can be confusing
for network engineers trying to diagnose the problem.
There are several traceroute implementations that do PMTUD, and
can demonstrate the problem.
How to fix
TCP should notice that the connection is timing out. After
several timeouts, TCP should attempt to send smaller packets,
perhaps turning off the DF flag for each packet. If this
succeeds, it should continue to turn off PMTUD for the connection
for some reasonable period of time, after which it should probe
again to try to determine if the path has changed.
Lahey Informational [Page 5]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
Note that, under IPv6, there is no DF bit -- it is implicitly on
at all times. Fragmentation is not allowed in routers, only at
the originating host. Fortunately, the minimum supported MTU for
IPv6 is 1280 octets, which is significantly larger than the 68
octet minimum in IPv4. This should make it more reasonable for
IPv6 TCP implementations to fall back to 1280 octet packets, when
IPv4 implementations will probably have to turn off DF to respond
to black hole detection.
Ideally, the ICMP black holes should be fixed when they are found.
If hosts start to implement black hole detection, it may be that
these problems will go unnoticed and unfixed. This is especially
unfortunate, since detection can take several seconds each time,
and these delays could result in a significant, hidden degradation
of performance. Hosts that implement black hole detection should
probably log detected black holes, so that they can be fixed.
2.2.
Name of Problem
Stretch ACK due to PMTUD
Classification
Congestion Control / Performance
Description
When a naively implemented TCP stack communicates with a PMTUD
equipped stack, it will try to generate an ACK for every second
full-sized segment. If it determines the full-sized segment based
on the advertised MSS, this can degrade badly in the face of
PMTUD.
The PMTU can wind up being a small fraction of the advertised MSS;
in this case, an ACK would be generated only very infrequently.
Significance
Stretch ACKs have a variety of unfortunate effects, more fully
outlined in [RFC2525]. Most of these have to do with encouraging
a more bursty connection, due to the infrequent arrival of ACKs.
They can also impede congestion window growth.
Implications
The complete implications of stretch ACKs are outlined in
[RFC2525].
Lahey Informational [Page 6]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
Relevant RFCs
RFC 1122 outlines the requirements for frequency of ACK
generation. [RFC2581] expands on this and clarifies that delayed
ACK is a SHOULD, not a MUST.
Trace file demonstrating it
Made using tcpdump recording at an intermediate host. The
timestamp options from all but the first two packets have been
removed for clarity.
18:16:52.976657 A > B: S 3183102292:3183102292(0) win 16384
<mss 4312,nop,wscale 0,nop,nop,timestamp 12128 0> (DF)
18:16:52.979580 B > A: S 2022212745:2022212745(0) ack 3183102293 win
49152 <mss 4312,nop,wscale 1,nop,nop,timestamp 1592957 12128> (DF)
18:16:52.979738 A > B: . ack 1 win 17248 (DF)
18:16:52.982473 A > B: . 1:4301(4300) ack 1 win 17248 (DF)
18:16:52.982557 C > A: icmp: B unreachable -
need to frag (mtu 1500)! (DF)
18:16:52.985839 B > A: . ack 1 win 32768 (DF)
18:16:54.129928 A > B: . 1:1449(1448) ack 1 win 17248 (DF)
.
.
.
18:16:58.507078 A > B: . 1463941:1465389(1448) ack 1 win 17248 (DF)
18:16:58.507200 A > B: . 1465389:1466837(1448) ack 1 win 17248 (DF)
18:16:58.507326 A > B: . 1466837:1468285(1448) ack 1 win 17248 (DF)
18:16:58.507439 A > B: . 1468285:1469733(1448) ack 1 win 17248 (DF)
18:16:58.524763 B > A: . ack 1452357 win 32768 (DF)
18:16:58.524986 B > A: . ack 1461045 win 32768 (DF)
18:16:58.525138 A > B: . 1469733:1471181(1448) ack 1 win 17248 (DF)
18:16:58.525268 A > B: . 1471181:1472629(1448) ack 1 win 17248 (DF)
18:16:58.525393 A > B: . 1472629:1474077(1448) ack 1 win 17248 (DF)
18:16:58.525516 A > B: . 1474077:1475525(1448) ack 1 win 17248 (DF)
18:16:58.525642 A > B: . 1475525:1476973(1448) ack 1 win 17248 (DF)
18:16:58.525766 A > B: . 1476973:1478421(1448) ack 1 win 17248 (DF)
18:16:58.526063 A > B: . 1478421:1479869(1448) ack 1 win 17248 (DF)
18:16:58.526187 A > B: . 1479869:1481317(1448) ack 1 win 17248 (DF)
18:16:58.526310 A > B: . 1481317:1482765(1448) ack 1 win 17248 (DF)
18:16:58.526432 A > B: . 1482765:1484213(1448) ack 1 win 17248 (DF)
18:16:58.526561 A > B: . 1484213:1485661(1448) ack 1 win 17248 (DF)
18:16:58.526671 A > B: . 1485661:1487109(1448) ack 1 win 17248 (DF)
18:16:58.537944 B > A: . ack 1478421 win 32768 (DF)
18:16:58.538328 A > B: . 1487109:1488557(1448) ack 1 win 17248 (DF)
Lahey Informational [Page 7]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
Note that the interval between ACKs is significantly larger than two
times the segment size; it works out to be almost exactly two times
the advertised MSS. This transfer was long enough that it could be
verified that the stretch ACK was not the result of lost ACK packets.
Trace file demonstrating correct behavior
Made using tcpdump recording at an intermediate host. The timestamp
options from all but the first two packets have been removed for
clarity.
18:13:32.287965 A > B: S 2972697496:2972697496(0)
win 16384 <mss 4312,nop,wscale 0,nop,nop,timestamp 11326 0> (DF)
18:13:32.290785 B > A: S 245639054:245639054(0)
ack 2972697497 win 34496 <mss 4312> (DF)
18:13:32.290941 A > B: . ack 1 win 17248 (DF)
18:13:32.293774 A > B: . 1:4313(4312) ack 1 win 17248 (DF)
18:13:32.293856 C > A: icmp: B unreachable -
need to frag (mtu 1500)! (DF)
18:13:33.637338 A > B: . 1:1461(1460) ack 1 win 17248 (DF)
.
.
.
18:13:35.561691 A > B: . 1514021:1515481(1460) ack 1 win 17248 (DF)
18:13:35.561814 A > B: . 1515481:1516941(1460) ack 1 win 17248 (DF)
18:13:35.561938 A > B: . 1516941:1518401(1460) ack 1 win 17248 (DF)
18:13:35.562059 A > B: . 1518401:1519861(1460) ack 1 win 17248 (DF)
18:13:35.562174 A > B: . 1519861:1521321(1460) ack 1 win 17248 (DF)
18:13:35.564008 B > A: . ack 1481901 win 64680 (DF)
18:13:35.564383 A > B: . 1521321:1522781(1460) ack 1 win 17248 (DF)
18:13:35.564499 A > B: . 1522781:1524241(1460) ack 1 win 17248 (DF)
18:13:35.615576 B > A: . ack 1484821 win 64680 (DF)
18:13:35.615646 B > A: . ack 1487741 win 64680 (DF)
18:13:35.615716 B > A: . ack 1490661 win 64680 (DF)
18:13:35.615784 B > A: . ack 1493581 win 64680 (DF)
18:13:35.615856 B > A: . ack 1496501 win 64680 (DF)
18:13:35.615952 A > B: . 1524241:1525701(1460) ack 1 win 17248 (DF)
18:13:35.615966 B > A: . ack 1499421 win 64680 (DF)
18:13:35.616088 A > B: . 1525701:1527161(1460) ack 1 win 17248 (DF)
18:13:35.616105 B > A: . ack 1502341 win 64680 (DF)
18:13:35.616211 A > B: . 1527161:1528621(1460) ack 1 win 17248 (DF)
18:13:35.616228 B > A: . ack 1505261 win 64680 (DF)
18:13:35.616327 A > B: . 1528621:1530081(1460) ack 1 win 17248 (DF)
18:13:35.616349 B > A: . ack 1508181 win 64680 (DF)
18:13:35.616448 A > B: . 1530081:1531541(1460) ack 1 win 17248 (DF)
18:13:35.616565 A > B: . 1531541:1533001(1460) ack 1 win 17248 (DF)
18:13:35.616891 A > B: . 1533001:1534461(1460) ack 1 win 17248 (DF)
Lahey Informational [Page 8]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
In this trace, an ACK is generated for every two segments that
arrive. (The segment size is slightly larger in this trace, even
though the source hosts are the same, because of the lack of
timestamp options in this trace.)
How to detect
This condition can be observed in a packet trace when the advertised
MSS is significantly larger than the actual PMTU of a connection.
How to fix
Several solutions for this problem have been proposed:
A simple solution is to ACK every other packet, regardless of size.
This has the drawback of generating large numbers of ACKs in the face
of lots of very small packets; this shows up with applications like
the X Window System.
A slightly more complex solution would monitor the size of incoming
segments and try to determine what segment size the sender is using.
This requires slightly more state in the receiver, but has the
advantage of making receiver silly window syndrome avoidance
computations more accurate [RFC813].
2.3.
Name of Problem
Determining MSS from PMTU
Classification
Performance
Description
The MSS advertised at the start of a connection should be based on
the MTU of the interfaces on the system. (For efficiency and other
reasons this may not be the largest MSS possible.) Some systems use
PMTUD determined values to determine the MSS to advertise.
This results in an advertised MSS that is smaller than the largest
MTU the system can receive.
Significance
The advertised MSS is an indication to the remote system about the
largest TCP segment that can be received [RFC879]. If this value is
too small, the remote system will be forced to use a smaller segment
size when sending, purely because the local system found a particular
PMTU earlier.
Lahey Informational [Page 9]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
Given the asymmetric nature of many routes on the Internet
[Paxson97], it seems entirely possible that the return PMTU is
different from the sending PMTU. Limiting the segment size in this
way can reduce performance and frustrate the PMTUD algorithm.
Even if the route was symmetric, setting this artificially lowered
limit on segment size will make it impossible to probe later to
determine if the PMTU has changed.
Implications
The whole point of PMTUD is to send as large a segment as possible.
If long-running connections cannot successfully probe for larger
PMTU, then potential performance gains will be impossible to realize.
This destroys the whole point of PMTUD.
Relevant RFCs
RFC 1191. [RFC879] provides a complete discussion of
MSS calculations and appropriate values. Note that this practice
does not violate any of the specifications in these RFCs.
Trace file demonstrating it
This trace was made using tcpdump running on an intermediate host.
Host A initiates two separate consecutive connections, A1 and A2, to
host B. Router C is the location of the MTU bottleneck. As usual,
TCP options are removed from all non-SYN packets.
22:33:32.305912 A1 > B: S 1523306220:1523306220(0)
win 8760 <mss 1460> (DF)
22:33:32.306518 B > A1: S 729966260:729966260(0)
ack 1523306221 win 16384 <mss 65240>
22:33:32.310307 A1 > B: . ack 1 win 8760 (DF)
22:33:32.323496 A1 > B: P 1:1461(1460) ack 1 win 8760 (DF)
22:33:32.323569 C > A1: icmp: 129.99.238.5 unreachable -
need to frag (mtu 1024) (DF) (ttl 255, id 20666)
22:33:32.783694 A1 > B: . 1:985(984) ack 1 win 8856 (DF)
22:33:32.840817 B > A1: . ack 985 win 16384
22:33:32.845651 A1 > B: . 1461:2445(984) ack 1 win 8856 (DF)
22:33:32.846094 B > A1: . ack 985 win 16384
22:33:33.724392 A1 > B: . 985:1969(984) ack 1 win 8856 (DF)
22:33:33.724893 B > A1: . ack 2445 win 14924
22:33:33.728591 A1 > B: . 2445:2921(476) ack 1 win 8856 (DF)
22:33:33.729161 A1 > B: . ack 1 win 8856 (DF)
22:33:33.840758 B > A1: . ack 2921 win 16384
[...]
22:33:34.238659 A1 > B: F 7301:8193(892) ack 1 win 8856 (DF)
22:33:34.239036 B > A1: . ack 8194 win 15492
22:33:34.239303 B > A1: F 1:1(0) ack 8194 win 16384
Lahey Informational [Page 10]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
22:33:34.242971 A1 > B: . ack 2 win 8856 (DF)
22:33:34.454218 A2 > B: S 1523591299:1523591299(0)
win 8856 <mss 984> (DF)
22:33:34.454617 B > A2: S 732408874:732408874(0)
ack 1523591300 win 16384 <mss 65240>
22:33:34.457516 A2 > B: . ack 1 win 8856 (DF)
22:33:34.470683 A2 > B: P 1:985(984) ack 1 win 8856 (DF)
22:33:34.471144 B > A2: . ack 985 win 16384
22:33:34.476554 A2 > B: . 985:1969(984) ack 1 win 8856 (DF)
22:33:34.477580 A2 > B: P 1969:2953(984) ack 1 win 8856 (DF)
[...]
Notice that the SYN packet for session A2 specifies an MSS of 984.
Trace file demonstrating correct behavior
As before, this trace was made using tcpdump running on an
intermediate host. Host A initiates two separate consecutive
connections, A1 and A2, to host B. Router C is the location of the
MTU bottleneck. As usual, TCP options are removed from all non-SYN
packets.
22:36:58.828602 A1 > B: S 3402991286:3402991286(0) win 32768
<mss 4312,wscale 0,nop,timestamp 1123370309 0,
echo 1123370309> (DF)
22:36:58.844040 B > A1: S 946999880:946999880(0)
ack 3402991287 win 16384
<mss 65240,nop,wscale 0,nop,nop,timestamp 429552 1123370309>
22:36:58.848058 A1 > B: . ack 1 win 32768 (DF)
22:36:58.851514 A1 > B: P 1:1025(1024) ack 1 win 32768 (DF)
22:36:58.851584 C > A1: icmp: 129.99.238.5 unreachable -
need to frag (mtu 1024) (DF)
22:36:58.855885 A1 > B: . 1:969(968) ack 1 win 32768 (DF)
22:36:58.856378 A1 > B: . 969:985(16) ack 1 win 32768 (DF)
22:36:59.036309 B > A1: . ack 985 win 16384
22:36:59.039255 A1 > B: FP 985:1025(40) ack 1 win 32768 (DF)
22:36:59.039623 B > A1: . ack 1026 win 16344
22:36:59.039828 B > A1: F 1:1(0) ack 1026 win 16384
22:36:59.043037 A1 > B: . ack 2 win 32768 (DF)
22:37:01.436032 A2 > B: S 3404812097:3404812097(0) win 32768
<mss 4312,wscale 0,nop,timestamp 1123372916 0,
echo 1123372916> (DF)
22:37:01.436424 B > A2: S 949814769:949814769(0)
ack 3404812098 win 16384
<mss 65240,nop,wscale 0,nop,nop,timestamp 429562 1123372916>
22:37:01.440147 A2 > B: . ack 1 win 32768 (DF)
22:37:01.442736 A2 > B: . 1:969(968) ack 1 win 32768 (DF)
Lahey Informational [Page 11]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
22:37:01.442894 A2 > B: P 969:985(16) ack 1 win 32768 (DF)
22:37:01.443283 B > A2: . ack 985 win 16384
22:37:01.446068 A2 > B: P 985:1025(40) ack 1 win 32768 (DF)
22:37:01.446519 B > A2: . ack 1025 win 16384
22:37:01.448465 A2 > B: F 1025:1025(0) ack 1 win 32768 (DF)
22:37:01.448837 B > A2: . ack 1026 win 16384
22:37:01.449007 B > A2: F 1:1(0) ack 1026 win 16384
22:37:01.452201 A2 > B: . ack 2 win 32768 (DF)
Note that the same MSS was used for both session A1 and session A2.
How to detect
This can be detected using a packet trace of two separate
connections; the first should invoke PMTUD; the second should start
soon enough after the first that the PMTU value does not time out.
How to fix
The MSS should be determined based on the MTUs of the interfaces on
the system, as outlined in [RFC1122] and [RFC1191].
3. Security Considerations
The one security concern raised by this memo is that ICMP black holes
are often caused by over-zealous security administrators who block
all ICMP messages. It is vitally important that those who design and
deploy security systems understand the impact of strict filtering on
upper-layer protocols. The safest web site in the world is worthless
if most TCP implementations cannot transfer data from it. It would
be far nicer to have all of the black holes fixed rather than fixing
all of the TCP implementations.
4. Acknowledgements
Thanks to Mark Allman, Vern Paxson, and Jamshid Mahdavi for generous
help reviewing the document, and to Matt Mathis for early suggestions
of various mechanisms that can cause PMTUD black holes, as well as
review. The structure for describing TCP problems, and the early
description of that structure is from [RFC2525]. Special thanks to
Amy Bock, who helped perform the PMTUD tests which discovered these
bugs.
Lahey Informational [Page 12]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
5. References
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[RFC1122] Braden, R., "Requirements for Internet Hosts --
Communication Layers", STD 3, RFC 1122, October 1989.
[RFC813] Clark, D., "Window and Acknowledgement Strategy in TCP",
RFC 813, July 1982.
[Jacobson89] V. Jacobson, C. Leres, and S. McCanne, tcpdump, June
1989, ftp.ee.lbl.gov
[RFC1435] Knowles, S., "IESG Advice from Experience with Path MTU
Discovery", RFC 1435, March 1993.
[RFC1191] Mogul, J. and S. Deering, "Path MTU discovery", RFC
1191, November 1990.
[RFC1981] McCann, J., Deering, S. and J. Mogul, "Path MTU
Discovery for IP version 6", RFC 1981, August 1996.
[Paxson97] V. Paxson, "End-to-End Routing Behavior in the
Internet", IEEE/ACM Transactions on Networking (5),
pp. 601-615, Oct. 1997.
[RFC2525] Paxson, V., Allman, M., Dawson, S., Fenner, W., Griner,
J., Heavens, I., Lahey, K., Semke, I. and B. Volz,
"Known TCP Implementation Problems", RFC 2525, March
1999.
[RFC879] Postel, J., "The TCP Maximum Segment Size and Related
Topics", RFC 879, November 1983.
[RFC2001] Stevens, W., "TCP Slow Start, Congestion Avoidance, Fast
Retransmit, and Fast Recovery Algorithms", RFC 2001,
January 1997.
Lahey Informational [Page 13]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
6. Author's Address
Kevin Lahey
dotRocket, Inc.
1901 S. Bascom Ave., Suite 300
Campbell, CA 95008
USA
Phone: +1 408-371-8977 x115
email: kml@dotrocket.com
Lahey Informational [Page 14]
RFC 2923 TCP Problems with Path MTU Discovery September 2000
7. Full Copyright Statement
Copyright (C) The Internet Society (2000). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Lahey Informational [Page 15]

View File

@ -0,0 +1,451 @@
Network Working Group V. Paxson
Request for Comments: 2988 ACIRI
Category: Standards Track M. Allman
NASA GRC/BBN
November 2000
Computing TCP's Retransmission Timer
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2000). All Rights Reserved.
Abstract
This document defines the standard algorithm that Transmission
Control Protocol (TCP) senders are required to use to compute and
manage their retransmission timer. It expands on the discussion in
section 4.2.3.1 of RFC 1122 and upgrades the requirement of
supporting the algorithm from a SHOULD to a MUST.
1 Introduction
The Transmission Control Protocol (TCP) [Pos81] uses a retransmission
timer to ensure data delivery in the absence of any feedback from the
remote data receiver. The duration of this timer is referred to as
RTO (retransmission timeout). RFC 1122 [Bra89] specifies that the
RTO should be calculated as outlined in [Jac88].
This document codifies the algorithm for setting the RTO. In
addition, this document expands on the discussion in section 4.2.3.1
of RFC 1122 and upgrades the requirement of supporting the algorithm
from a SHOULD to a MUST. RFC 2581 [APS99] outlines the algorithm TCP
uses to begin sending after the RTO expires and a retransmission is
sent. This document does not alter the behavior outlined in RFC 2581
[APS99].
Paxson & Allman Standards Track [Page 1]
RFC 2988 Computing TCP's Retransmission Timer November 2000
In some situations it may be beneficial for a TCP sender to be more
conservative than the algorithms detailed in this document allow.
However, a TCP MUST NOT be more aggressive than the following
algorithms allow.
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
document are to be interpreted as described in [Bra97].
2 The Basic Algorithm
To compute the current RTO, a TCP sender maintains two state
variables, SRTT (smoothed round-trip time) and RTTVAR (round-trip
time variation). In addition, we assume a clock granularity of G
seconds.
The rules governing the computation of SRTT, RTTVAR, and RTO are as
follows:
(2.1) Until a round-trip time (RTT) measurement has been made for a
segment sent between the sender and receiver, the sender SHOULD
set RTO <- 3 seconds (per RFC 1122 [Bra89]), though the
"backing off" on repeated retransmission discussed in (5.5)
still applies.
Note that some implementations may use a "heartbeat" timer
that in fact yields a value between 2.5 seconds and 3
seconds. Accordingly, a lower bound of 2.5 seconds is also
acceptable, providing that the timer will never expire
faster than 2.5 seconds. Implementations using a heartbeat
timer with a granularity of G SHOULD NOT set the timer below
2.5 + G seconds.
(2.2) When the first RTT measurement R is made, the host MUST set
SRTT <- R
RTTVAR <- R/2
RTO <- SRTT + max (G, K*RTTVAR)
where K = 4.
(2.3) When a subsequent RTT measurement R' is made, a host MUST set
RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
SRTT <- (1 - alpha) * SRTT + alpha * R'
Paxson & Allman Standards Track [Page 2]
RFC 2988 Computing TCP's Retransmission Timer November 2000
The value of SRTT used in the update to RTTVAR is its value
before updating SRTT itself using the second assignment. That
is, updating RTTVAR and SRTT MUST be computed in the above
order.
The above SHOULD be computed using alpha=1/8 and beta=1/4 (as
suggested in [JK88]).
After the computation, a host MUST update
RTO <- SRTT + max (G, K*RTTVAR)
(2.4) Whenever RTO is computed, if it is less than 1 second then the
RTO SHOULD be rounded up to 1 second.
Traditionally, TCP implementations use coarse grain clocks to
measure the RTT and trigger the RTO, which imposes a large
minimum value on the RTO. Research suggests that a large
minimum RTO is needed to keep TCP conservative and avoid
spurious retransmissions [AP99]. Therefore, this
specification requires a large minimum RTO as a conservative
approach, while at the same time acknowledging that at some
future point, research may show that a smaller minimum RTO is
acceptable or superior.
(2.5) A maximum value MAY be placed on RTO provided it is at least 60
seconds.
3 Taking RTT Samples
TCP MUST use Karn's algorithm [KP87] for taking RTT samples. That
is, RTT samples MUST NOT be made using segments that were
retransmitted (and thus for which it is ambiguous whether the reply
was for the first instance of the packet or a later instance). The
only case when TCP can safely take RTT samples from retransmitted
segments is when the TCP timestamp option [JBB92] is employed, since
the timestamp option removes the ambiguity regarding which instance
of the data segment triggered the acknowledgment.
Traditionally, TCP implementations have taken one RTT measurement at
a time (typically once per RTT). However, when using the timestamp
option, each ACK can be used as an RTT sample. RFC 1323 [JBB92]
suggests that TCP connections utilizing large congestion windows
should take many RTT samples per window of data to avoid aliasing
effects in the estimated RTT. A TCP implementation MUST take at
least one RTT measurement per RTT (unless that is not possible per
Karn's algorithm).
Paxson & Allman Standards Track [Page 3]
RFC 2988 Computing TCP's Retransmission Timer November 2000
For fairly modest congestion window sizes research suggests that
timing each segment does not lead to a better RTT estimator [AP99].
Additionally, when multiple samples are taken per RTT the alpha and
beta defined in section 2 may keep an inadequate RTT history. A
method for changing these constants is currently an open research
question.
4 Clock Granularity
There is no requirement for the clock granularity G used for
computing RTT measurements and the different state variables.
However, if the K*RTTVAR term in the RTO calculation equals zero,
the variance term MUST be rounded to G seconds (i.e., use the
equation given in step 2.3).
RTO <- SRTT + max (G, K*RTTVAR)
Experience has shown that finer clock granularities (<= 100 msec)
perform somewhat better than more coarse granularities.
Note that [Jac88] outlines several clever tricks that can be used to
obtain better precision from coarse granularity timers. These
changes are widely implemented in current TCP implementations.
5 Managing the RTO Timer
An implementation MUST manage the retransmission timer(s) in such a
way that a segment is never retransmitted too early, i.e. less than
one RTO after the previous transmission of that segment.
The following is the RECOMMENDED algorithm for managing the
retransmission timer:
(5.1) Every time a packet containing data is sent (including a
retransmission), if the timer is not running, start it running
so that it will expire after RTO seconds (for the current value
of RTO).
(5.2) When all outstanding data has been acknowledged, turn off the
retransmission timer.
(5.3) When an ACK is received that acknowledges new data, restart the
retransmission timer so that it will expire after RTO seconds
(for the current value of RTO).
Paxson & Allman Standards Track [Page 4]
RFC 2988 Computing TCP's Retransmission Timer November 2000
When the retransmission timer expires, do the following:
(5.4) Retransmit the earliest segment that has not been acknowledged
by the TCP receiver.
(5.5) The host MUST set RTO <- RTO * 2 ("back off the timer"). The
maximum value discussed in (2.5) above may be used to provide an
upper bound to this doubling operation.
(5.6) Start the retransmission timer, such that it expires after RTO
seconds (for the value of RTO after the doubling operation
outlined in 5.5).
Note that after retransmitting, once a new RTT measurement is
obtained (which can only happen when new data has been sent and
acknowledged), the computations outlined in section 2 are performed,
including the computation of RTO, which may result in "collapsing"
RTO back down after it has been subject to exponential backoff
(rule 5.5).
Note that a TCP implementation MAY clear SRTT and RTTVAR after
backing off the timer multiple times as it is likely that the
current SRTT and RTTVAR are bogus in this situation. Once SRTT and
RTTVAR are cleared they should be initialized with the next RTT
sample taken per (2.2) rather than using (2.3).
6 Security Considerations
This document requires a TCP to wait for a given interval before
retransmitting an unacknowledged segment. An attacker could cause a
TCP sender to compute a large value of RTO by adding delay to a
timed packet's latency, or that of its acknowledgment. However,
the ability to add delay to a packet's latency often coincides with
the ability to cause the packet to be lost, so it is difficult to
see what an attacker might gain from such an attack that could cause
more damage than simply discarding some of the TCP connection's
packets.
The Internet to a considerable degree relies on the correct
implementation of the RTO algorithm (as well as those described in
RFC 2581) in order to preserve network stability and avoid
congestion collapse. An attacker could cause TCP endpoints to
respond more aggressively in the face of congestion by forging
acknowledgments for segments before the receiver has actually
received the data, thus lowering RTO to an unsafe value. But to do
so requires spoofing the acknowledgments correctly, which is
difficult unless the attacker can monitor traffic along the path
between the sender and the receiver. In addition, even if the
Paxson & Allman Standards Track [Page 5]
RFC 2988 Computing TCP's Retransmission Timer November 2000
attacker can cause the sender's RTO to reach too small a value, it
appears the attacker cannot leverage this into much of an attack
(compared to the other damage they can do if they can spoof packets
belonging to the connection), since the sending TCP will still back
off its timer in the face of an incorrectly transmitted packet's
loss due to actual congestion.
Acknowledgments
The RTO algorithm described in this memo was originated by Van
Jacobson in [Jac88].
References
[AP99] Allman, M. and V. Paxson, "On Estimating End-to-End Network
Path Properties", SIGCOMM 99.
[APS99] Allman, M., Paxson V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[Bra89] Braden, R., "Requirements for Internet Hosts --
Communication Layers", STD 3, RFC 1122, October 1989.
[Bra97] Bradner, S., "Key words for use in RFCs to Indicate
Requirement Levels", BCP 14, RFC 2119, March 1997.
[Jac88] Jacobson, V., "Congestion Avoidance and Control", Computer
Communication Review, vol. 18, no. 4, pp. 314-329, Aug. 1988.
[JK88] Jacobson, V. and M. Karels, "Congestion Avoidance and
Control", ftp://ftp.ee.lbl.gov/papers/congavoid.ps.Z.
[KP87] Karn, P. and C. Partridge, "Improving Round-Trip Time
Estimates in Reliable Transport Protocols", SIGCOMM 87.
[Pos81] Postel, J., "Transmission Control Protocol", STD 7, RFC 793,
September 1981.
Paxson & Allman Standards Track [Page 6]
RFC 2988 Computing TCP's Retransmission Timer November 2000
Author's Addresses
Vern Paxson
ACIRI / ICSI
1947 Center Street
Suite 600
Berkeley, CA 94704-1198
Phone: 510-666-2882
Fax: 510-643-7684
EMail: vern@aciri.org
http://www.aciri.org/vern/
Mark Allman
NASA Glenn Research Center/BBN Technologies
Lewis Field
21000 Brookpark Rd. MS 54-2
Cleveland, OH 44135
Phone: 216-433-6586
Fax: 216-433-8705
EMail: mallman@grc.nasa.gov
http://roland.grc.nasa.gov/~mallman
Paxson & Allman Standards Track [Page 7]
RFC 2988 Computing TCP's Retransmission Timer November 2000
Full Copyright Statement
Copyright (C) The Internet Society (2000). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Paxson & Allman Standards Track [Page 8]

View File

@ -0,0 +1,507 @@
Network Working Group M. Allman
Request for Comments: 3042 NASA GRC/BBN
Category: Standards Track H. Balakrishnan
MIT
S. Floyd
ACIRI
January 2001
Enhancing TCP's Loss Recovery Using Limited Transmit
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2001). All Rights Reserved.
Abstract
This document proposes a new Transmission Control Protocol (TCP)
mechanism that can be used to more effectively recover lost segments
when a connection's congestion window is small, or when a large
number of segments are lost in a single transmission window. The
"Limited Transmit" algorithm calls for sending a new data segment in
response to each of the first two duplicate acknowledgments that
arrive at the sender. Transmitting these segments increases the
probability that TCP can recover from a single lost segment using the
fast retransmit algorithm, rather than using a costly retransmission
timeout. Limited Transmit can be used both in conjunction with, and
in the absence of, the TCP selective acknowledgment (SACK) mechanism.
1 Introduction
A number of researchers have observed that TCP's loss recovery
strategies do not work well when the congestion window at a TCP
sender is small. This can happen, for instance, because there is
only a limited amount of data to send, or because of the limit
imposed by the receiver-advertised window, or because of the
constraints imposed by end-to-end congestion control over a
connection with a small bandwidth-delay product
[Riz96,Mor97,BPS+98,Bal98,LK98]. When a TCP detects a missing
segment, it enters a loss recovery phase using one of two methods.
Allman, et al. Standards Track [Page 1]
RFC 3042 Enhancing TCP Loss Recovery January 2001
First, if an acknowledgment (ACK) for a given segment is not received
in a certain amount of time a retransmission timeout occurs and the
segment is resent [RFC793,PA00]. Second, the "Fast Retransmit"
algorithm resends a segment when three duplicate ACKs arrive at the
sender [Jac88,RFC2581]. However, because duplicate ACKs from the
receiver are also triggered by packet reordering in the Internet, the
TCP sender waits for three duplicate ACKs in an attempt to
disambiguate segment loss from packet reordering. Once in a loss
recovery phase, a number of techniques can be used to retransmit lost
segments, including slow start-based recovery or Fast Recovery
[RFC2581], NewReno [RFC2582], and loss recovery based on selective
acknowledgments (SACKs) [RFC2018,FF96].
TCP's retransmission timeout (RTO) is based on measured round-trip
times (RTT) between the sender and receiver, as specified in [PA00].
To prevent spurious retransmissions of segments that are only delayed
and not lost, the minimum RTO is conservatively chosen to be 1
second. Therefore, it behooves TCP senders to detect and recover
from as many losses as possible without incurring a lengthy timeout
when the connection remains idle. However, if not enough duplicate
ACKs arrive from the receiver, the Fast Retransmit algorithm is never
triggered---this situation occurs when the congestion window is small
or if a large number of segments in a window are lost. For instance,
consider a congestion window (cwnd) of three segments. If one
segment is dropped by the network, then at most two duplicate ACKs
will arrive at the sender. Since three duplicate ACKs are required
to trigger Fast Retransmit, a timeout will be required to resend the
dropped packet.
[BPS+97] found that roughly 56% of retransmissions sent by a busy web
server were sent after the RTO expires, while only 44% were handled
by Fast Retransmit. In addition, only 4% of the RTO-based
retransmissions could have been avoided with SACK, which of course
has to continue to disambiguate reordering from genuine loss. In
contrast, using the technique outlined in this document and in
[Bal98], 25% of the RTO-based retransmissions in that dataset would
have likely been avoided.
The next section of this document outlines small changes to TCP
senders that will decrease the reliance on the retransmission timer,
and thereby improve TCP performance when Fast Retransmit is not
triggered. These changes do not adversely affect the performance of
TCP nor interact adversely with other connections, in other
circumstances.
Allman, et al. Standards Track [Page 2]
RFC 3042 Enhancing TCP Loss Recovery January 2001
1.1 Terminology
In this document, the key words "MUST", "MUST NOT", "REQUIRED",
"SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY",
and "OPTIONAL" are to be interpreted as described in [RFC2119] and
indicate requirement levels for protocols.
2 The Limited Transmit Algorithm
When a TCP sender has previously unsent data queued for transmission
it SHOULD use the Limited Transmit algorithm, which calls for a TCP
sender to transmit new data upon the arrival of the first two
consecutive duplicate ACKs when the following conditions are
satisfied:
* The receiver's advertised window allows the transmission of the
segment.
* The amount of outstanding data would remain less than or equal
to the congestion window plus 2 segments. In other words, the
sender can only send two segments beyond the congestion window
(cwnd).
The congestion window (cwnd) MUST NOT be changed when these new
segments are transmitted. Assuming that these new segments and the
corresponding ACKs are not dropped, this procedure allows the sender
to infer loss using the standard Fast Retransmit threshold of three
duplicate ACKs [RFC2581]. This is more robust to reordered packets
than if an old packet were retransmitted on the first or second
duplicate ACK.
Note: If the connection is using selective acknowledgments [RFC2018],
the data sender MUST NOT send new segments in response to duplicate
ACKs that contain no new SACK information, as a misbehaving receiver
can generate such ACKs to trigger inappropriate transmission of data
segments. See [SCWA99] for a discussion of attacks by misbehaving
receivers.
Limited Transmit follows the "conservation of packets" congestion
control principle [Jac88]. Each of the first two duplicate ACKs
indicates that a segment has left the network. Furthermore, the
sender has not yet decided that a segment has been dropped and
therefore has no reason to assume that the current congestion control
state is inaccurate. Therefore, transmitting segments does not
deviate from the spirit of TCP's congestion control principles.
Allman, et al. Standards Track [Page 3]
RFC 3042 Enhancing TCP Loss Recovery January 2001
[BPS99] shows that packet reordering is not a rare network event.
[RFC2581] does not provide for sending of data on the first two
duplicate ACKs that arrive at the sender. This causes a burst of
segments to be sent when an ACK for new data does arrive following
packet reordering. Using Limited Transmit, data packets will be
clocked out by incoming ACKs and therefore transmission will not be
as bursty.
Note: Limited Transmit is implemented in the ns simulator [NS].
Researchers wishing to investigate this mechanism further can do so
by enabling "singledup_" for the given TCP connection.
3 Related Work
Deployment of Explicit Congestion Notification (ECN) [Flo94,RFC2481]
may benefit connections with small congestion window sizes [SA00].
ECN provides a method for indicating congestion to the end-host
without dropping segments. While some segment drops may still occur,
ECN may allow TCP to perform better with small congestion window
sizes because the sender can avoid many of the Fast Retransmits and
Retransmit Timeouts that would otherwise have been needed to detect
dropped segments [SA00].
When ECN-enabled TCP traffic competes with non-ECN-enabled TCP
traffic, ECN-enabled traffic can receive up to 30% higher goodput.
For bulk transfers, the relative performance benefit of ECN is
greatest when on average each flow has 3-4 outstanding packets during
each round-trip time [ZQ00]. This should be a good estimate for the
performance impact of a flow using Limited Transmit, since both ECN
and Limited Transmit reduce the reliance on the retransmission timer
for signaling congestion.
The Rate-Halving congestion control algorithm [MSML99] uses a form of
limited transmit, as it calls for transmitting a data segment on
every second duplicate ACK that arrives at the sender. The algorithm
decouples the decision of what to send from the decision of when to
send. However, similar to Limited Transmit the algorithm will always
send a new data segment on the second duplicate ACK that arrives at
the sender.
4 Security Considerations
The additional security implications of the changes proposed in this
document, compared to TCP's current vulnerabilities, are minimal.
The potential security issues come from the subversion of end-to-end
congestion control from "false" duplicate ACKs, where a "false"
duplicate ACK is a duplicate ACK that does not actually acknowledge
new data received at the TCP receiver. False duplicate ACKs could
Allman, et al. Standards Track [Page 4]
RFC 3042 Enhancing TCP Loss Recovery January 2001
result from duplicate ACKs that are themselves duplicated in the
network, or from misbehaving TCP receivers that send false duplicate
ACKs to subvert end-to-end congestion control [SCWA99,RFC2581].
When the TCP data receiver has agreed to use the SACK option, the TCP
data sender has fairly strong protection against false duplicate
ACKs. In particular, with SACK, a duplicate ACK that acknowledges
new data arriving at the receiver reports the sequence numbers of
that new data. Thus, with SACK, the TCP sender can verify that an
arriving duplicate ACK acknowledges data that the TCP sender has
actually sent, and for which no previous acknowledgment has been
received, before sending new data as a result of that acknowledgment.
For further protection, the TCP sender could keep a record of packet
boundaries for transmitted data packets, and recognize at most one
valid acknowledgment for each packet (e.g., the first acknowledgment
acknowledging the receipt of all of the sequence numbers in that
packet).
One could imagine some limited protection against false duplicate
ACKs for a non-SACK TCP connection, where the TCP sender keeps a
record of the number of packets transmitted, and recognizes at most
one acknowledgment per packet to be used for triggering the sending
of new data. However, this accounting of packets transmitted and
acknowledged would require additional state and extra complexity at
the TCP sender, and does not seem necessary.
The most important protection against false duplicate ACKs comes from
the limited potential of duplicate ACKs in subverting end-to-end
congestion control. There are two separate cases to consider: when
the TCP sender receives less than a threshold number of duplicate
ACKs, and when the TCP sender receives at least a threshold number of
duplicate ACKs. In the latter case a TCP with Limited Transmit will
behave essentially the same as a TCP without Limited Transmit in that
the congestion window will be halved and a loss recovery period will
be initiated.
When a TCP sender receives less than a threshold number of duplicate
ACKs a misbehaving receiver could send two duplicate ACKs after each
regular ACK. One might imagine that the TCP sender would send at
three times its allowed sending rate. However, using Limited
Transmit as outlined in section 2 the sender is only allowed to
exceed the congestion window by less than the duplicate ACK threshold
(of three segments), and thus would not send a new packet for each
duplicate ACK received.
Allman, et al. Standards Track [Page 5]
RFC 3042 Enhancing TCP Loss Recovery January 2001
Acknowledgments
Bill Fenner, Jamshid Mahdavi and the Transport Area Working Group
provided valuable feedback on an early version of this document.
References
[Bal98] Hari Balakrishnan. Challenges to Reliable Data Transport
over Heterogeneous Wireless Networks. Ph.D. Thesis,
University of California at Berkeley, August 1998.
[BPS+97] Hari Balakrishnan, Venkata Padmanabhan, Srinivasan Seshan,
Mark Stemm, and Randy Katz. TCP Behavior of a Busy Web
Server: Analysis and Improvements. Technical Report
UCB/CSD-97-966, August 1997. Available from
http://nms.lcs.mit.edu/~hari/papers/csd-97-966.ps. (Also
in Proc. IEEE INFOCOM Conf., San Francisco, CA, March
1998.)
[BPS99] Jon Bennett, Craig Partridge, Nicholas Shectman. Packet
Reordering is Not Pathological Network Behavior. IEEE/ACM
Transactions on Networking, December 1999.
[FF96] Kevin Fall, Sally Floyd. Simulation-based Comparisons of
Tahoe, Reno, and SACK TCP. ACM Computer Communication
Review, July 1996.
[Flo94] Sally Floyd. TCP and Explicit Congestion Notification.
ACM Computer Communication Review, October 1994.
[Jac88] Van Jacobson. Congestion Avoidance and Control. ACM
SIGCOMM 1988.
[LK98] Dong Lin, H.T. Kung. TCP Fast Recovery Strategies:
Analysis and Improvements. Proceedings of InfoCom, March
1998.
[MSML99] Matt Mathis, Jeff Semke, Jamshid Mahdavi, Kevin Lahey. The
Rate Halving Algorithm, 1999. URL:
http://www.psc.edu/networking/rate_halving.html.
[Mor97] Robert Morris. TCP Behavior with Many Flows. Proceedings
of the Fifth IEEE International Conference on Network
Protocols. October 1997.
[NS] Ns network simulator. URL: http://www.isi.edu/nsnam/.
Allman, et al. Standards Track [Page 6]
RFC 3042 Enhancing TCP Loss Recovery January 2001
[PA00] Paxson, V. and M. Allman, "Computing TCP's Retransmission
Timer", RFC 2988, November 2000.
[Riz96] Luigi Rizzo. Issues in the Implementation of Selective
Acknowledgments for TCP. January, 1996. URL:
http://www.iet.unipi.it/~luigi/selack.ps
[SA00] Hadi Salim, J. and U. Ahmed, "Performance Evaluation of
Explicit Congestion Notification (ECN) in IP Networks", RFC
2884, July 2000.
[SCWA99] Stefan Savage, Neal Cardwell, David Wetherall, Tom
Anderson. TCP Congestion Control with a Misbehaving
Receiver. ACM Computer Communications Review, October
1999.
[RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC
793, September 1981.
[RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP
Selective Acknowledgement Options", RFC 2018, October 1996.
[RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
Requirement Levels", BCP 14, RFC 2119, March 1997.
[RFC2481] Ramakrishnan, K. and S. Floyd, "A Proposal to Add Explicit
Congestion Notification (ECN) to IP", RFC 2481, January
1999.
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[RFC2582] Floyd, S. and T. Henderson, "The NewReno Modification to
TCP's Fast Recovery Algorithm", RFC 2582, April 1999.
[ZQ00] Yin Zhang and Lili Qiu, Understanding the End-to-End
Performance Impact of RED in a Heterogeneous Environment,
Cornell CS Technical Report 2000-1802, July 2000. URL
http://www.cs.cornell.edu/yzhang/papers.htm.
Allman, et al. Standards Track [Page 7]
RFC 3042 Enhancing TCP Loss Recovery January 2001
Authors' Addresses
Mark Allman
NASA Glenn Research Center/BBN Technologies
Lewis Field
21000 Brookpark Rd. MS 54-5
Cleveland, OH 44135
Phone: +1-216-433-6586
Fax: +1-216-433-8705
EMail: mallman@grc.nasa.gov
http://roland.grc.nasa.gov/~mallman
Hari Balakrishnan
Laboratory for Computer Science
545 Technology Square
Massachusetts Institute of Technology
Cambridge, MA 02139
EMail: hari@lcs.mit.edu
http://nms.lcs.mit.edu/~hari/
Sally Floyd
AT&T Center for Internet Research at ICSI (ACIRI)
1947 Center St, Suite 600
Berkeley, CA 94704
Phone: +1-510-666-2989
EMail: floyd@aciri.org
http://www.aciri.org/floyd/
Allman, et al. Standards Track [Page 8]
RFC 3042 Enhancing TCP Loss Recovery January 2001
Full Copyright Statement
Copyright (C) The Internet Society (2001). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Allman, et al. Standards Track [Page 9]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,955 @@
Network Working Group S. Dawkins
Request for Comments: 3150 G. Montenegro
BCP: 48 M. Kojo
Category: Best Current Practice V. Magret
July 2001
End-to-end Performance Implications of Slow Links
Status of this Memo
This document specifies an Internet Best Current Practices for the
Internet Community, and requests discussion and suggestions for
improvements. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2001). All Rights Reserved.
Abstract
This document makes performance-related recommendations for users of
network paths that traverse "very low bit-rate" links.
"Very low bit-rate" implies "slower than we would like". This
recommendation may be useful in any network where hosts can saturate
available bandwidth, but the design space for this recommendation
explicitly includes connections that traverse 56 Kb/second modem
links or 4.8 Kb/second wireless access links - both of which are
widely deployed.
This document discusses general-purpose mechanisms. Where
application-specific mechanisms can outperform the relevant general-
purpose mechanism, we point this out and explain why.
This document has some recommendations in common with RFC 2689,
"Providing integrated services over low-bitrate links", especially in
areas like header compression. This document focuses more on
traditional data applications for which "best-effort delivery" is
appropriate.
Dawkins, et al. Best Current Practice [Page 1]
RFC 3150 PILC - Slow Links July 2001
Table of Contents
1.0 Introduction ................................................. 2
2.0 Description of Optimizations ................................. 3
2.1 Header Compression Alternatives ...................... 3
2.2 Payload Compression Alternatives ..................... 5
2.3 Choosing MTU sizes ................................... 5
2.4 Interactions with TCP Congestion Control [RFC2581] ... 6
2.5 TCP Buffer Auto-tuning ............................... 9
2.6 Small Window Effects ................................. 10
3.0 Summary of Recommended Optimizations ......................... 10
4.0 Topics For Further Work ...................................... 12
5.0 Security Considerations ...................................... 12
6.0 IANA Considerations .......................................... 13
7.0 Acknowledgements ............................................. 13
8.0 References ................................................... 13
Authors' Addresses ............................................... 16
Full Copyright Statement ......................................... 17
1.0 Introduction
The Internet protocol stack was designed to operate in a wide range
of link speeds, and has met this design goal with only a limited
number of enhancements (for example, the use of TCP window scaling as
described in "TCP Extensions for High Performance" [RFC1323] for
very-high-bandwidth connections).
Pre-World Wide Web application protocols tended to be either
interactive applications sending very little data (e.g., Telnet) or
bulk transfer applications that did not require interactive response
(e.g., File Transfer Protocol, Network News). The World Wide Web has
given us traffic that is both interactive and often "bulky",
including images, sound, and video.
The World Wide Web has also popularized the Internet, so that there
is significant interest in accessing the Internet over link speeds
that are much "slower" than typical office network speeds. In fact,
a significant proportion of the current Internet users is connected
to the Internet over a relatively slow last-hop link. In future, the
number of such users is likely to increase rapidly as various mobile
devices are foreseen to be attached to the Internet over slow
wireless links.
In order to provide the best interactive response for these "bulky"
transfers, implementors may wish to minimize the number of bits
actually transmitted over these "slow" connections. There are two
Dawkins, et al. Best Current Practice [Page 2]
RFC 3150 PILC - Slow Links July 2001
areas that can be considered - compressing the bits that make up the
overhead associated with the connection, and compressing the bits
that make up the payload being transported over the connection.
In addition, implementors may wish to consider TCP receive window
settings and queuing mechanisms as techniques to improve performance
over low-speed links. While these techniques do not involve protocol
changes, they are included in this document for completeness.
2.0 Description of Optimizations
This section describes optimizations which have been suggested for
use in situations where hosts can saturate their links. The next
section summarizes recommendations about the use of these
optimizations.
2.1 Header Compression Alternatives
Mechanisms for TCP and IP header compression defined in [RFC1144,
RFC2507, RFC2508, RFC2509, RFC3095] provide the following benefits:
- Improve interactive response time
- Decrease header overhead (for a typical dialup MTU of 296
bytes, the overhead of TCP/IP headers can decrease from about
13 percent with typical 40-byte headers to 1-1.5 percent with
3-5 byte compressed headers, for most packets). This
enables use of small packets for delay-sensitive low data-rate
traffic and good line efficiency for bulk data even with small
segment sizes (for reasons to use a small MTU on slow links,
see section 2.3)
- Many slow links today are wireless and tend to be significantly
lossy. Header compression reduces packet loss rate over lossy
links (simply because shorter transmission times expose packets
to fewer events that cause loss).
[RFC1144] header compression is a Proposed Standard for TCP Header
compression that is widely deployed. Unfortunately it is vulnerable
on lossy links, because even a single bit error results in loss of
synchronization between the compressor and decompressor. It uses TCP
timeouts to detect a loss of such synchronization, but these errors
result in loss of data (up to a full TCP window), delay of a full
RTO, and unnecessary slow-start.
Dawkins, et al. Best Current Practice [Page 3]
RFC 3150 PILC - Slow Links July 2001
A more recent header compression proposal [RFC2507] includes an
explicit request for retransmission of an uncompressed packet to
allow resynchronization without waiting for a TCP timeout (and
executing congestion avoidance procedures). This works much better
on links with lossy characteristics.
The above scheme ceases to perform well under conditions as extreme
as those of many cellular links (error conditions of 1e-3 or 1e-2 and
round trip times over 100 ms.). For these cases, the 'Robust Header
Compression' working group has developed ROHC [RFC3095]. Extensions
of ROHC to support compression of TCP headers are also under
development.
[RFC1323] defines a "TCP Timestamp" option, used to prevent
"wrapping" of the TCP sequence number space on high-speed links, and
to improve TCP RTT estimates by providing unambiguous TCP roundtrip
timings. Use of TCP timestamps prevents header compression, because
the timestamps are sent as TCP options. This means that each
timestamped header has TCP options that differ from the previous
header, and headers with changed TCP options are always sent
uncompressed. In addition, timestamps do not seem to have much of an
impact on RTO estimation [AlPa99].
Nevertheless, the ROHC working group is developing schemes to
compress TCP headers, including options such as timestamps and
selective acknowledgements.
Recommendation: Implement [RFC2507], in particular as it relates to
IPv4 tunnels and Minimal Encapsulation for Mobile IP, as well as TCP
header compression for lossy links and links that reorder packets.
PPP capable devices should implement "IP Header Compression over PPP"
[RFC2509]. Robust Header Compression [RFC3095] is recommended for
extremely slow links with very high error rates (see above), but
implementors should judge if its complexity is justified (perhaps by
the cost of the radio frequency resources).
[RFC1144] header compression should only be enabled when operating
over reliable "slow" links.
Use of TCP Timestamps [RFC1323] is not recommended with these
connections, because it complicates header compression. Even though
the Robust Header Compression (ROHC) working group is developing
specifications to remedy this, those mechanisms are not yet fully
developed nor deployed, and may not be generally justifiable.
Furthermore, connections traversing "slow" links do not require
protection against TCP sequence-number wrapping.
Dawkins, et al. Best Current Practice [Page 4]
RFC 3150 PILC - Slow Links July 2001
2.2 Payload Compression Alternatives
Compression of IP payloads is also desirable on "slow" network links.
"IP Payload Compression Protocol (IPComp)" [RFC2393] defines a
framework where common compression algorithms can be applied to
arbitrary IP segment payloads.
IP payload compression is something of a niche optimization. It is
necessary because IP-level security converts IP payloads to random
bitstreams, defeating commonly-deployed link-layer compression
mechanisms which are faced with payloads that have no redundant
"information" that can be more compactly represented.
However, many IP payloads are already compressed (images, audio,
video, "zipped" files being transferred), or are already encrypted
above the IP layer (e.g., SSL [SSL]/TLS [RFC2246]). These payloads
will not "compress" further, limiting the benefit of this
optimization.
For uncompressed HTTP payload types, HTTP/1.1 [RFC2616] also includes
Content-Encoding and Accept-Encoding headers, supporting a variety of
compression algorithms for common compressible MIME types like
text/plain. This leaves only the HTTP headers themselves
uncompressed.
In general, application-level compression can often outperform
IPComp, because of the opportunity to use compression dictionaries
based on knowledge of the specific data being compressed.
Extensive use of application-level compression techniques will reduce
the need for IPComp, especially for WWW users.
Recommendation: IPComp may optionally be implemented.
2.3 Choosing MTU Sizes
There are several points to keep in mind when choosing an MTU for
low-speed links.
First, if a full-length MTU occupies a link for longer than the
delayed ACK timeout (typically 200 milliseconds, but may be up to 500
milliseconds), this timeout will cause an ACK to be generated for
every segment, rather than every second segment, as occurs with most
implementations of the TCP delayed ACK algorithm.
Dawkins, et al. Best Current Practice [Page 5]
RFC 3150 PILC - Slow Links July 2001
Second, "relatively large" MTUs, which take human-perceptible amounts
of time to be transmitted into the network, create human-perceptible
delays in other flows using the same link. [RFC1144] considers
100-200 millisecond delays as human-perceptible. The convention of
choosing 296-byte MTUs (with header compression enabled) for dialup
access is a compromise that limits the maximum link occupancy delay
with full-length MTUs close to 200 milliseconds on 9.6 Kb/second
links.
Third, on last-hop links, using a larger link MTU size, and therefore a
larger MSS, would allow a TCP sender to increase its congestion
window faster in bytes than when using a smaller MTU size (and a
smaller MSS). However, with a smaller MTU size, and a smaller MSS
size, the congestion window, when measured in segments, increases
more quickly than it would with a larger MSS size. Connections using
smaller MSS sizes are more likely to be able to send enough segments
to generate three duplicate acknowledgements, triggering fast
retransmit/fast recovery when packet losses are encountered. Hence,
a smaller MTU size is useful for slow links with lossy
characteristics.
Fourth, using a smaller MTU size also decreases the queuing delay of
a TCP flow (and thereby RTT) compared to use of larger MTU size with
the same number of packets in a queue. This means that a TCP flow
using a smaller segment size and traversing a slow link is able to
inflate the congestion window (in number of segments) to a larger
value while experiencing the same queuing delay.
Finally, some networks charge for traffic on a per-packet basis, not
on a per-kilobyte basis. In these cases, connections using a larger
MTU may be charged less than connections transferring the same number
of bytes using a smaller MTU.
Recommendation: If it is possible to do so, MTUs should be chosen
that do not monopolize network interfaces for human-perceptible
amounts of time, and implementors should not choose MTUs that will
occupy a network interface for significantly more than 100-200
milliseconds.
2.4 Interactions with TCP Congestion Control [RFC2581]
In many cases, TCP connections that traverse slow links have the slow
link as an "access" link, with higher-speed links in use for most of
the connection path. One common configuration might be a laptop
computer using dialup access to a terminal server (a last-hop
router), with an HTTP server on a high-speed LAN "behind" the
terminal server.
Dawkins, et al. Best Current Practice [Page 6]
RFC 3150 PILC - Slow Links July 2001
In this case, the HTTP server may be able to place packets on its
directly-attached high-speed LAN at a higher rate than the last-hop
router can forward them on the low-speed link. When the last-hop
router falls behind, it will be unable to buffer the traffic intended
for the low-speed link, and will become a point of congestion and
begin to drop the excess packets. In particular, several packets may
be dropped in a single transmission window when initial slow start
overshoots the last-hop router buffer.
Although packet loss is occurring, it isn't detected at the TCP
sender until one RTT time after the router buffer space is exhausted
and the first packet is dropped. This late congestion signal allows
the congestion window to increase up to double the size it was at the
time the first packet was dropped at the router.
If the link MTU is large enough to take more than the delayed ACK
timeout interval to transmit a packet, an ACK is sent for every
segment and the congestion window is doubled in a single RTT. If a
smaller link MTU is in use and delayed ACKs can be utilized, the
congestion window increases by a factor of 1.5 in one RTT. In both
cases the sender continues transmitting packets well beyond the
congestion point of the last-hop router, resulting in multiple packet
losses in a single window.
The self-clocking nature of TCP's slow start and congestion avoidance
algorithms prevents this buffer overrun from continuing. In addition,
these algorithms allow senders to "probe" for available bandwidth -
cycling through an increasing rate of transmission until loss occurs,
followed by a dramatic (50-percent) drop in transmission rate. This
happens when a host directly connected to a low-speed link offers an
advertised window that is unrealistically large for the low-speed
link. During the congestion avoidance phase the peer host continues
to probe for available bandwidth, trying to fill the advertised
window, until packet loss occurs.
The same problems may also exist when a sending host is directly
connected to a slow link as most slow links have some local buffer in
the link interface. This link interface buffer is subject to
overflow exactly in the same way as the last-hop router buffer.
When a last-hop router with a small number of buffers per outbound
link is used, the first buffer overflow occurs earlier than it would
if the router had a larger number of buffers. Subsequently with a
smaller number of buffers the periodic packet losses occur more
frequently during congestion avoidance, when the sender probes for
available bandwidth.
Dawkins, et al. Best Current Practice [Page 7]
RFC 3150 PILC - Slow Links July 2001
The most important responsibility of router buffers is to absorb
bursts. Too few buffers (for example, only three buffers per
outbound link as described in [RFC2416]) means that routers will
overflow their buffer pools very easily and are unlikely to absorb
even a very small burst. When a larger number of router buffers are
allocated per outbound link, the buffer space does not overflow as
quickly but the buffers are still likely to become full due to TCP's
default behavior. A larger number of router buffers leads to longer
queuing delays and a longer RTT.
If router queues become full before congestion is signaled or remain
full for long periods of time, this is likely to result in "lock-
out", where a single connection or a few connections occupy the
router queue space, preventing other connections from using the link
[RFC2309], especially when a tail drop queue management discipline is
being used.
Therefore, it is essential to have a large enough number of buffers
in routers to be able to absorb data bursts, but keep the queues
normally small. In order to achieve this it has been recommended in
[RFC2309] that an active queue management mechanism, like Random
Early Detection (RED) [RED93], should be implemented in all Internet
routers, including the last-hop routers in front of a slow link. It
should also be noted that RED requires a sufficiently large number of
router buffers to work properly. In addition, the appropriate
parameters of RED on a last-hop router connected to a slow link will
likely deviate from the defaults recommended.
Active queue management mechanisms do not eliminate packet drops but,
instead, drop packets at an earlier stage to solve the full-queue
problem for flows that are responsive to packet drops as a congestion
signal. Hosts that are directly connected to low-speed links may
limit the receive windows they advertise in order to lower or
eliminate the number of packet drops in a last-hop router. When
doing so one should, however, take care that the advertised window is
large enough to allow full utilization of the last-hop link capacity
and to allow triggering fast retransmit, when a packet loss is
encountered. This recommendation takes two forms:
- Modern operating systems use relatively large default TCP receive
buffers compared to what is required to fully utilize the link
capacity of low-speed links. Users should be able to choose the
default receive window size in use - typically a system-wide
parameter. (This "choice" may be as simple as "dial-up access/LAN
access" on a dialog box - this would accommodate many environments
without requiring hand-tuning by experienced network engineers.)
Dawkins, et al. Best Current Practice [Page 8]
RFC 3150 PILC - Slow Links July 2001
- Application developers should not attempt to manually manage
network bandwidth using socket buffer sizes. Only in very rare
circumstances will an application actually know both the bandwidth
and delay of a path and be able to choose a suitably low (or high)
value for the socket buffer size to obtain good network
performance.
This recommendation is not a general solution for any network path
that might involve a slow link. Instead, this recommendation is
applicable in environments where the host "knows" it is always
connected to other hosts via "slow links". For hosts that may
connect to other hosts over a variety of links (e.g., dial-up laptop
computers with LAN-connected docking stations), buffer auto-tuning
for the receive buffer is a more reasonable recommendation, and is
discussed below.
2.5 TCP Buffer Auto-tuning
[SMM98] recognizes a tension between the desire to allocate "large"
TCP buffers, so that network paths are fully utilized, and a desire
to limit the amount of memory dedicated to TCP buffers, in order to
efficiently support large numbers of connections to hosts over
network paths that may vary by six orders of magnitude.
The technique proposed is to dynamically allocate TCP buffers, based
on the current congestion window, rather than attempting to
preallocate TCP buffers without any knowledge of the network path.
This proposal results in receive buffers that are appropriate for the
window sizes in use, and send buffers large enough to contain two
windows of segments, so that SACK and fast recovery can recover
losses without forcing the connection to use lengthy retransmission
timeouts.
While most of the motivation for this proposal is given from a
server's perspective, hosts that connect using multiple interfaces
with markedly-different link speeds may also find this kind of
technique useful. This is true in particular with slow links, which
are likely to dominate the end-to-end RTT. If the host is connected
only via a single slow link interface at a time, it is fairly easy to
(dynamically) adjust the receive window (and thus the advertised
window) to a value appropriate for the slow last-hop link with known
bandwidth and delay characteristics.
Recommendation: If a host is sometimes connected via a slow link but
the host is also connected using other interfaces with markedly-
different link speeds, it may use receive buffer auto-tuning to
adjust the advertised window to an appropriate value.
Dawkins, et al. Best Current Practice [Page 9]
RFC 3150 PILC - Slow Links July 2001
2.6 Small Window Effects
If a TCP connection stabilizes with a congestion window of only a few
segments (as could be expected on a "slow" link), the sender isn't
sending enough segments to generate three duplicate acknowledgements,
triggering fast retransmit and fast recovery. This means that a
retransmission timeout is required to repair the loss - dropping the
TCP connection to a congestion window with only one segment.
[TCPB98] and [TCPF98] observe that (in studies of network trace
datasets) it is relatively common for TCP retransmission timeouts to
occur even when some duplicate acknowledgements are being sent. The
challenge is to use these duplicate acknowledgements to trigger fast
retransmit/fast recovery without injecting traffic into the network
unnecessarily - and especially not injecting traffic in ways that
will result in instability.
The "Limited Transmit" algorithm [RFC3042] suggests sending a new
segment when the first and second duplicate acknowledgements are
received, so that the receiver is more likely to be able to continue
to generate duplicate acknowledgements until the TCP retransmit
threshold is reached, triggering fast retransmit and fast recovery.
When the congestion window is small, this is very useful in assisting
fast retransmit and fast recovery to recover from a packet loss
without using a retransmission timeout. We note that a maximum of
two additional new segments will be sent before the receiver sends
either a new acknowledgement advancing the window or two additional
duplicate acknowledgements, triggering fast retransmit/fast recovery,
and that these new segments will be acknowledgement-clocked, not
back-to-back.
Recommendation: Limited Transmit should be implemented in all hosts.
3.0 Summary of Recommended Optimizations
This section summarizes our recommendations regarding the previous
standards-track mechanisms, for end nodes that are connected via a
slow link.
Header compression should be implemented. [RFC1144] header
compression can be enabled over robust network links. [RFC2507]
should be used over network connections that are expected to
experience loss due to corruption as well as loss due to congestion.
For extremely lossy and slow links, implementors should evaluate ROHC
[RFC3095] as a potential solution. [RFC1323] TCP timestamps must be
turned off because (1) their protection against TCP sequence number
wrapping is unjustified for slow links, and (2) they complicate TCP
header compression.
Dawkins, et al. Best Current Practice [Page 10]
RFC 3150 PILC - Slow Links July 2001
IP Payload Compression [RFC2393] should be implemented, although
compression at higher layers of the protocol stack (for example [RFC
2616]) may make this mechanism less useful.
For HTTP/1.1 environments, [RFC2616] payload compression should be
implemented and should be used for payloads that are not already
compressed.
Implementors should choose MTUs that don't monopolize network
interfaces for more than 100-200 milliseconds, in order to limit the
impact of a single connection on all other connections sharing the
network interface.
Use of active queue management is recommended on last-hop routers
that provide Internet access to hosts behind a slow link. In
addition, the number of router buffers per slow link should be large
enough to absorb concurrent data bursts from more than a single flow.
To absorb concurrent data bursts from two or three TCP senders with a
typical data burst of three back-to-back segments per sender, at
least six (6) or nine (9) buffers are needed. Effective use of
active queue management is likely to require an even larger number of
buffers.
Implementors should consider the possibility that a host will be
directly connected to a low-speed link when choosing default TCP
receive window sizes.
Application developers should not attempt to manually manage network
bandwidth using socket buffer sizes as only in very rare
circumstances will an application be able to choose a suitable value
for the socket buffer size to obtain good network performance.
Limited Transmit [RFC3042] should be implemented in all end hosts as
it assists in triggering fast retransmit when the congestion window is
small.
All of the mechanisms described above are stable standards-track RFCs
(at Proposed Standard status, as of this writing).
In addition, implementors may wish to consider TCP buffer auto-
tuning, especially when the host system is likely to be used with a
wide variety of access link speeds. This is not a standards-track
TCP mechanism but, as it is an operating system implementation issue,
it does not need to be standardized.
Of the above mechanisms, only Header Compression (for IP and TCP) may
cease to work in the presence of end-to-end IPSEC. However,
[RFC3095] does allow compressing the ESP header.
Dawkins, et al. Best Current Practice [Page 11]
RFC 3150 PILC - Slow Links July 2001
4.0 Topics For Further Work
In addition to the standards-track mechanisms discussed above, there
are still opportunities to improve performance over low-speed links.
"Sending fewer bits" is an obvious response to slow link speeds. The
now-defunct HTTP-NG proposal [HTTP-NG] replaced the text-based HTTP
header representation with a binary representation for compactness.
However, HTTP-NG is not moving forward and HTTP/1.1 is not being
enhanced to include a more compact HTTP header representation.
Instead, the Wireless Application Protocol (WAP) Forum has opted for
the XML-based Wireless Session Protocol [WSP], which includes a
compact header encoding mechanism.
It would be nice to agree on a more compact header representation
that will be used by all WWW communities, not only the wireless WAN
community. Indeed, general XML content encodings have been proposed
[Millau], although they are not yet widely adopted.
We note that TCP options which change from segment to segment
effectively disable header compression schemes deployed today,
because there's no way to indicate that some fields in the header are
unchanged from the previous segment, while other fields are not. The
Robust Header Compression working group is developing such schemes
for TCP options such as timestamps and selective acknowledgements.
Hopefully, documents subsequent to [RFC3095] will define such
specifications.
Another effort worth following is that of 'Delta Encoding'. Here,
clients that request a slightly modified version of some previously
cached resource would receive a succinct description of the
differences, rather than the entire resource [HTTP-DELTA].
5.0 Security Considerations
All recommendations included in this document are stable standards-
track RFCs (at Proposed Standard status, as of this writing) or
otherwise do not suggest any changes to any protocol. With the
exception of Van Jacobson compression [RFC1144] and [RFC2507,
RFC2508, RFC2509], all other mechanisms are applicable to TCP
connections protected by end-to-end IPSec. This includes ROHC
[RFC3095], albeit partially, because even though it can compress the
outermost ESP header to some extent, encryption still renders any
payload data uncompressible (including any subsequent protocol
headers).
Dawkins, et al. Best Current Practice [Page 12]
RFC 3150 PILC - Slow Links July 2001
6.0 IANA Considerations
This document is a pointer to other, existing IETF standards. There
are no new IANA considerations.
7.0 Acknowledgements
This recommendation has grown out of "Long Thin Networks" [RFC2757],
which in turn benefited from work done in the IETF TCPSAT working
group.
8.0 References
[AlPa99] Mark Allman and Vern Paxson, "On Estimating End-to-End
Network Path Properties", in ACM SIGCOMM 99 Proceedings,
1999.
[HTTP-DELTA] J. Mogul, et al., "Delta encoding in HTTP", Work in
Progress.
[HTTP-NG] Mike Spreitzer, Bill Janssen, "HTTP 'Next Generation'",
9th International WWW Conference, May, 2000. Also
available as: http://www.www9.org/w9cdrom/60/60.html
[Millau] Marc Girardot, Neel Sundaresan, "Millau: an encoding
format for efficient representation and exchange of XML
over the Web", 9th International WWW Conference, May,
2000. Also available as:
http://www.www9.org/w9cdrom/154/154.html
[PAX97] Paxson, V., "End-to-End Internet Packet Dynamics", 1997,
in SIGCOMM 97 Proceedings, available as:
http://www.acm.org/sigcomm/ccr/archive/ccr-toc/ccr-toc-
97.html
[RED93] Floyd, S., and Jacobson, V., Random Early Detection
gateways for Congestion Avoidance, IEEE/ACM Transactions
on Networking, V.1 N.4, August 1993, pp. 397-413. Also
available from http://ftp.ee.lbl.gov/floyd/red.html.
[RFC1144] Jacobson, V., "Compressing TCP/IP Headers for Low-Speed
Serial Links", RFC 1144, February 1990.
Dawkins, et al. Best Current Practice [Page 13]
RFC 3150 PILC - Slow Links July 2001
[RFC1323] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions
for High Performance", RFC 1323, May 1992.
[RFC2246] Dierks, T. and C. Allen, "The TLS Protocol: Version
1.0", RFC 2246, January 1999.
[RFC2309] Braden, R., Clark, D., Crowcroft, J., Davie, B.,
Deering, S., Estrin, D., Floyd, S., Jacobson, V.,
Minshall, G., Partridge, C., Peterson, L., Ramakrishnan,
K., Shenker, S., Wroclawski, J. and L. Zhang,
"Recommendations on Queue Management and Congestion
Avoidance in the Internet", RFC 2309, April 1998.
[RFC2393] Shacham, A., Monsour, R., Pereira, R. and M. Thomas, "IP
Payload Compression Protocol (IPComp)", RFC 2393,
December 1998.
[RFC2401] Kent, S. and R. Atkinson, "Security Architecture for the
Internet Protocol", RFC 2401, November 1998.
[RFC2416] Shepard, T. and C. Partridge, "When TCP Starts Up With
Four Packets Into Only Three Buffers", RFC 2416,
September 1998.
[RFC2507] Degermark, M., Nordgren, B. and S. Pink, "IP Header
Compression", RFC 2507, February 1999.
[RFC2508] Casner, S. and V. Jacobson. "Compressing IP/UDP/RTP
Headers for Low-Speed Serial Links", RFC 2508, February
1999.
[RFC2509] Engan, M., Casner, S. and C. Bormann, "IP Header
Compression over PPP", RFC 2509, February 1999.
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[RFC2616] Fielding, R., Gettys, J., Mogul, J., Frystyk, H.,
Masinter, L., Leach, P. and T. Berners-Lee, "Hypertext
Transfer Protocol -- HTTP/1.1", RFC 2616, June 1999.
[RFC2757] Montenegro, G., Dawkins, S., Kojo, M., Magret, V., and
N. Vaidya, "Long Thin Networks", RFC 2757, January 2000.
[RFC3042] Allman, M., Balakrishnan, H. and S. Floyd, "Enhancing
TCP's Loss Recovery Using Limited Transmit", RFC 3042,
January 2001.
Dawkins, et al. Best Current Practice [Page 14]
RFC 3150 PILC - Slow Links July 2001
[RFC3095] Bormann, C., Burmeister, C., Degermark, M., Fukushima,
H., Hannu, H., Jonsson, L-E., Hakenberg, R., Koren, T.,
Le, K., Liu, Z., Martensson, A., Miyazaki, A., Svanbro,
K., Wiebke, T., Yoshimura, T. and H. Zheng, "RObust
Header Compression (ROHC): Framework and four Profiles:
RTP, UDP ESP and uncompressed", RFC 3095, July 2001.
[SMM98] Jeffrey Semke, Matthew Mathis, and Jamshid Mahdavi,
"Automatic TCP Buffer Tuning", in ACM SIGCOMM 98
Proceedings 1998. Available from
http://www.acm.org/sigcomm/sigcomm98/tp/abs_26.html.
[SSL] Alan O. Freier, Philip Karlton, Paul C. Kocher, The SSL
Protocol: Version 3.0, March 1996. (Expired Internet-
Draft, available from
http://home.netscape.com/eng/ssl3/ssl-toc.html)
[TCPB98] Hari Balakrishnan, Venkata N. Padmanabhan, Srinivasan
Seshan, Mark Stemm, Randy H. Katz, "TCP Behavior of a
Busy Internet Server: Analysis and Improvements", IEEE
Infocom, March 1998. Available from:
http://www.cs.berkeley.edu/~hari/papers/infocom98.ps.gz
[TCPF98] Dong Lin and H.T. Kung, "TCP Fast Recovery Strategies:
Analysis and Improvements", IEEE Infocom, March 1998.
Available from:
http://www.eecs.harvard.edu/networking/papers/ infocom-
tcp-final-198.pdf
[WSP] Wireless Application Protocol Forum, "WAP Wireless
Session Protocol Specification", approved 4 May, 2000,
available from
http://www1.wapforum.org/tech/documents/WAP-203-WSP-
20000504-a.pdf. (informative reference).
Dawkins, et al. Best Current Practice [Page 15]
RFC 3150 PILC - Slow Links July 2001
Authors' Addresses
Questions about this document may be directed to:
Spencer Dawkins
Fujitsu Network Communications
2801 Telecom Parkway
Richardson, Texas 75082
Phone: +1-972-479-3782
EMail: spencer.dawkins@fnc.fujitsu.com
Gabriel Montenegro
Sun Microsystems Laboratories, Europe
29, chemin du Vieux Chene
38240 Meylan, FRANCE
Phone: +33 476 18 80 45
EMail: gab@sun.com
Markku Kojo
Department of Computer Science
University of Helsinki
P.O. Box 26 (Teollisuuskatu 23)
FIN-00014 HELSINKI
Finland
Phone: +358-9-1914-4179
Fax: +358-9-1914-4441
EMail: kojo@cs.helsinki.fi
Vincent Magret
Alcatel Internetworking, Inc.
26801 W. Agoura road
Calabasas, CA, 91301
Phone: +1 818 878 4485
EMail: vincent.magret@alcatel.com
Dawkins, et al. Best Current Practice [Page 16]
RFC 3150 PILC - Slow Links July 2001
Full Copyright Statement
Copyright (C) The Internet Society (2001). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Dawkins, et al. Best Current Practice [Page 17]

View File

@ -0,0 +1,899 @@
Network Working Group S. Dawkins
Request for Comments: 3155 G. Montenegro
BCP: 50 M. Kojo
Category: Best Current Practice V. Magret
N. Vaidya
August 2001
End-to-end Performance Implications of Links with Errors
Status of this Memo
This document specifies an Internet Best Current Practices for the
Internet Community, and requests discussion and suggestions for
improvements. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2001). All Rights Reserved.
Abstract
This document discusses the specific TCP mechanisms that are
problematic in environments with high uncorrected error rates, and
discusses what can be done to mitigate the problems without
introducing intermediate devices into the connection.
Table of Contents
1.0 Introduction ............................................. 2
1.1 Should you be reading this recommendation? ........... 3
1.2 Relationship of this recommendation to PEPs ........... 4
1.3 Relationship of this recommendation to Link Layer
Mechanisms............................................. 4
2.0 Errors and Interactions with TCP Mechanisms .............. 5
2.1 Slow Start and Congestion Avoidance [RFC2581] ......... 5
2.2 Fast Retransmit and Fast Recovery [RFC2581] ........... 6
2.3 Selective Acknowledgements [RFC2018, RFC2883] ......... 7
3.0 Summary of Recommendations ............................... 8
4.0 Topics For Further Work .................................. 9
4.1 Achieving, and maintaining, large windows ............. 10
5.0 Security Considerations .................................. 11
6.0 IANA Considerations ...................................... 11
7.0 Acknowledgements ......................................... 11
References ................................................... 11
Authors' Addresses ........................................... 14
Full Copyright Statement ..................................... 16
Dawkins, et al. Best Current Practice [Page 1]
RFC 3155 PILC - Links with Errors August 2001
1.0 Introduction
The rapidly-growing Internet is being accessed by an increasingly
wide range of devices over an increasingly wide variety of links. At
least some of these links do not provide the degree of reliability
that hosts expect, and this expansion into unreliable links causes
some Internet protocols, especially TCP [RFC793], to perform poorly.
Specifically, TCP congestion control [RFC2581], while appropriate for
connections that lose traffic primarily because of congestion and
buffer exhaustion, interacts badly with uncorrected errors when TCP
connections traverse links with high uncorrected error rates. The
result is that sending TCPs may spend an excessive amount of time
waiting for acknowledgements that do not arrive, and then, although
these losses are not due to congestion-related buffer exhaustion, the
sending TCP transmits at substantially reduced traffic levels as it
probes the network to determine "safe" traffic levels.
This document does not address issues with other transport protocols,
for example, UDP.
Congestion avoidance in the Internet is based on an assumption that
most packet losses are due to congestion. TCP's congestion avoidance
strategy treats the absence of acknowledgement as a congestion
signal. This has worked well since it was introduced in 1988 [VJ-
DCAC], because most links and subnets have relatively low error rates
in normal operation, and congestion is the primary cause of loss in
these environments. However, links and subnets that do not enjoy low
uncorrected error rates are becoming more prevalent in parts of the
Internet. In particular, these include terrestrial and satellite
wireless links. Users relying on traffic traversing these links may
see poor performance because their TCP connections are spending
excessive time in congestion avoidance and/or slow start procedures
triggered by packet losses due to transmission errors.
The recommendations in this document aim at improving utilization of
available path capacity over such high error-rate links in ways that
do not threaten the stability of the Internet.
Applications use TCP in very different ways, and these have
interactions with TCP's behavior [RFC2861]. Nevertheless, it is
possible to make some basic assumptions about TCP flows.
Accordingly, the mechanisms discussed here are applicable to all uses
of TCP, albeit in varying degrees according to different scenarios
(as noted where appropriate).
Dawkins, et al. Best Current Practice [Page 2]
RFC 3155 PILC - Links with Errors August 2001
This recommendation is based on the explicit assumption that major
changes to the entire installed base of routers and hosts are not a
practical possibility. This constrains any changes to hosts that are
directly affected by errored links.
1.1 Should you be reading this recommendation?
All known subnetwork technologies provide an "imperfect" subnetwork
service - the bit error rate is non-zero. But there's no obvious way
for end stations to tell the difference between packets discarded due
to congestion and losses due to transmission errors.
If a directly-attached subnetwork is reporting transmission errors to
a host, these reports matter, but we can't rely on explicit
transmission error reports to both hosts.
Another way of deciding if a subnetwork should be considered to have
a "high error rate" is by appealing to mathematics.
An approximate formula for the TCP Reno response function is given in
[PFTK98]:
T = s / ( RTT*sqrt(2p/3) + tRTO*(3*sqrt(3p/8))*p*(1 + 32p**2) )
where
T = the sending rate in bytes per second
s = the packet size in bytes
RTT = round-trip time in seconds
tRTO = TCP retransmit timeout value in seconds
p = steady-state packet loss rate
If one plugs in an observed packet loss rate, does the math and then
sees predicted bandwidth utilization that is greater than the link
speed, the connection will not benefit from recommendations in this
document, because the level of packet losses being encountered won't
affect the ability of TCP to utilize the link. If, however, the
predicted bandwidth is less than the link speed, packet losses are
affecting the ability of TCP to utilize the link.
If further investigation reveals a subnetwork with significant
transmission error rates, the recommendations in this document will
improve the ability of TCP to utilize the link.
Dawkins, et al. Best Current Practice [Page 3]
RFC 3155 PILC - Links with Errors August 2001
A few caveats are in order, when doing this calculation:
(1) the RTT is the end-to-end RTT, not the link RTT.
(2) Max(1.0, 4*RTT) can be substituted as a simplification for
tRTO.
(3) losses may be bursty - a loss rate measured over an interval
that includes multiple bursty loss events may understate the
impact of these loss events on the sending rate.
1.2 Relationship of this recommendation to PEPs
This document discusses end-to-end mechanisms that do not require
TCP-level awareness by intermediate nodes. This places severe
limitations on what the end nodes can know about the nature of losses
that are occurring between the end nodes. Attempts to apply
heuristics to distinguish between congestion and transmission error
have not been successful [BV97, BV98, BV98a]. This restriction is
relaxed in an informational document on Performance Enhancing Proxies
(PEPs) [RFC3135]. Because PEPs can be placed on boundaries where
network characteristics change dramatically, PEPs have an additional
opportunity to improve performance over links with uncorrected
errors.
However, generalized use of PEPs contravenes the end-to-end principle
and is highly undesirable given their deleterious implications, which
include the following: lack of fate sharing (a PEP adds a third point
of failure besides the endpoints themselves), end-to-end reliability
and diagnostics, preventing end-to-end security (particularly network
layer security such as IPsec), mobility (handoffs are much more
complex because state must be transferred), asymmetric routing (PEPs
typically require being on both the forward and reverse paths of a
connection), scalability (PEPs add more state to maintain), QoS
transparency and guarantees.
Not every type of PEP has all the drawbacks listed above.
Nevertheless, the use of PEPs may have very serious consequences
which must be weighed carefully.
1.3 Relationship of this recommendation to Link Layer Mechanisms
This recommendation is for use with TCP over subnetwork technologies
(link layers) that have already been deployed. Subnetworks that are
intended to carry Internet protocols, but have not been completely
specified are the subject of a best common practices (BCP) document
which has been developed or is under development by the Performance
Dawkins, et al. Best Current Practice [Page 4]
RFC 3155 PILC - Links with Errors August 2001
Implications of Link Characteristics WG (PILC) [PILC-WEB]. This last
document is aimed at designers who still have the opportunity to
reduce the number of uncorrected errors TCP will encounter.
2.0 Errors and Interactions with TCP Mechanisms
A TCP sender adapts its use of network path capacity based on
feedback from the TCP receiver. As TCP is not able to distinguish
between losses due to congestion and losses due to uncorrected
errors, it is not able to accurately determine available path
capacity in the presence of significant uncorrected errors.
2.1 Slow Start and Congestion Avoidance [RFC2581]
Slow Start and Congestion Avoidance [RFC2581] are essential to the
current stability of the Internet. These mechanisms were designed to
accommodate networks that do not provide explicit congestion
notification. Although experimental mechanisms such as [RFC2481] are
moving in the direction of explicit congestion notification, the
effect of ECN on ECN-aware TCPs is essentially the same as the effect
of implicit congestion notification through congestion-related loss,
except that ECN provides this notification before packets are lost,
and must then be retransmitted.
TCP connections experiencing high error rates on their paths interact
badly with Slow Start and with Congestion Avoidance, because high
error rates make the interpretation of losses ambiguous - the sender
cannot know whether detected losses are due to congestion or to data
corruption. TCP makes the "safe" choice and assumes that the losses
are due to congestion.
- Whenever sending TCPs receive three out-of-order
acknowledgements, they assume the network is mildly congested
and invoke fast retransmit/fast recovery (described below).
- Whenever TCP's retransmission timer expires, the sender assumes
that the network is congested and invokes slow start.
- Less-reliable link layers often use small link MTUs. This
slows the rate of increase in the sender's window size during
slow start, because the sender's window is increased in units
of segments. Small link MTUs alone don't improve reliability.
Path MTU discovery [RFC1191] must also be used to prevent
fragmentation. Path MTU discovery allows the most rapid
opening of the sender's window size during slow start, but a
number of round trips may still be required to open the window
completely.
Dawkins, et al. Best Current Practice [Page 5]
RFC 3155 PILC - Links with Errors August 2001
Recommendation: Any standards-conformant TCP will implement Slow
Start and Congestion Avoidance, which are MUSTs in STD 3 [RFC1122].
Recommendations in this document will not interfere with these
mechanisms.
2.2 Fast Retransmit and Fast Recovery [RFC2581]
TCP provides reliable delivery of data as a byte-stream to an
application, so that when a segment is lost (whether due to either
congestion or transmission loss), the receiver TCP implementation
must wait to deliver data to the receiving application until the
missing data is received. The receiver TCP implementation detects
missing segments by segments arriving with out-of-order sequence
numbers.
TCPs should immediately send an acknowledgement when data is received
out-of-order [RFC2581], providing the next expected sequence number
with no delay, so that the sender can retransmit the required data as
quickly as possible and the receiver can resume delivery of data to
the receiving application. When an acknowledgement carries the same
expected sequence number as an acknowledgement that has already been
sent for the last in-order segment received, these acknowledgements
are called "duplicate ACKs".
Because IP networks are allowed to reorder packets, the receiver may
send duplicate acknowledgments for segments that arrive out of order
due to routing changes, link-level retransmission, etc. When a TCP
sender receives three duplicate ACKs, fast retransmit [RFC2581]
allows it to infer that a segment was lost. The sender retransmits
what it considers to be this lost segment without waiting for the
full retransmission timeout, thus saving time.
After a fast retransmit, a sender halves its congestion window and
invokes the fast recovery [RFC2581] algorithm, whereby it invokes
congestion avoidance from a halved congestion window, but does not
invoke slow start from a one-segment congestion window as it would do
after a retransmission timeout. As the sender is still receiving
dupacks, it knows the receiver is receiving packets sent, so the full
reduction after a timeout when no communication has been received is
not called for. This relatively safe optimization also saves time.
It is important to be realistic about the maximum throughput that TCP
can have over a connection that traverses a high error-rate link. In
general, TCP will increase its congestion window beyond the delay-
bandwidth product. TCP's congestion avoidance strategy is additive-
increase, multiplicative-decrease, which means that if additional
errors are encountered before the congestion window recovers
completely from a 50-percent reduction, the effect can be a "downward
Dawkins, et al. Best Current Practice [Page 6]
RFC 3155 PILC - Links with Errors August 2001
spiral" of the congestion window due to additional 50-percent
reductions. Even using Fast Retransmit/Fast Recovery, the sender
will halve the congestion window each time a window contains one or
more segments that are lost, and will re-open the window by one
additional segment for each congestion window's worth of
acknowledgements received.
If a connection's path traverses a link that loses one or more
segments during this recovery period, the one-half reduction takes
place again, this time on a reduced congestion window - and this
downward spiral will continue to hold the congestion window below
path capacity until the connection is able to recover completely by
additive increase without experiencing loss.
Of course, no downward spiral occurs if the error rate is constantly
high and the congestion window always remains small; the
multiplicative-increase "slow start" will be exited early, and the
congestion window remains low for the duration of the TCP connection.
In links with high error rates, the TCP window may remain rather
small for long periods of time.
Not all causes of small windows are related to errors. For example,
HTTP/1.0 commonly closes TCP connections to indicate boundaries
between requested resources. This means that these applications are
constantly closing "trained" TCP connections and opening "untrained"
TCP connections which will execute slow start, beginning with one or
two segments. This can happen even with HTTP/1.1, if webmasters
configure their HTTP/1.1 servers to close connections instead of
waiting to see if the connection will be useful again.
A small window - especially a window of less than four segments -
effectively prevents the sender from taking advantage of Fast
Retransmits. Moreover, efficient recovery from multiple losses
within a single window requires adoption of new proposals (NewReno
[RFC2582]).
Recommendation: Implement Fast Retransmit and Fast Recovery at this
time. This is a widely-implemented optimization and is currently at
Proposed Standard level. [RFC2488] recommends implementation of Fast
Retransmit/Fast Recovery in satellite environments.
2.3 Selective Acknowledgements [RFC2018, RFC2883]
Selective Acknowledgements [RFC2018] allow the repair of multiple
segment losses per window without requiring one (or more) round-trips
per loss.
Dawkins, et al. Best Current Practice [Page 7]
RFC 3155 PILC - Links with Errors August 2001
[RFC2883] proposes a minor extension to SACK that allows receiving
TCPs to provide more information about the order of delivery of
segments, allowing "more robust operation in an environment of
reordered packets, ACK loss, packet replication, and/or early
retransmit timeouts". Unless explicitly stated otherwise, in this
document, "Selective Acknowledgements" (or "SACK") refers to the
combination of [RFC2018] and [RFC2883].
Selective acknowledgments are most useful in LFNs ("Long Fat
Networks") because of the long round trip times that may be
encountered in these environments, according to Section 1.1 of
[RFC1323], and are especially useful if large windows are required,
because there is a higher probability of multiple segment losses per
window.
On the other hand, if error rates are generally low but occasionally
higher due to channel conditions, TCP will have the opportunity to
increase its window to larger values during periods of improved
channel conditions between bursts of errors. When bursts of errors
occur, multiple losses within a window are likely to occur. In this
case, SACK would provide benefits in speeding the recovery and
preventing unnecessary reduction of the window size.
Recommendation: Implement SACK as specified in [RFC2018] and updated
by [RFC2883], both Proposed Standards. In cases where SACK cannot be
enabled for both sides of a connection, TCP senders may use NewReno
[RFC2582] to better handle partial ACKs and multiple losses within a
single window.
3.0 Summary of Recommendations
The Internet does not provide a widely-available loss feedback
mechanism that allows TCP to distinguish between congestion loss and
transmission error. Because congestion affects all traffic on a path
while transmission loss affects only the specific traffic
encountering uncorrected errors, avoiding congestion has to take
precedence over quickly repairing transmission errors. This means
that the best that can be achieved without new feedback mechanisms is
minimizing the amount of time that is spent unnecessarily in
congestion avoidance.
The Fast Retransmit/Fast Recovery mechanism allows quick repair of
loss without giving up the safety of congestion avoidance. In order
for Fast Retransmit/Fast Recovery to work, the window size must be
large enough to force the receiver to send three duplicate
acknowledgments before the retransmission timeout interval expires,
forcing full TCP slow-start.
Dawkins, et al. Best Current Practice [Page 8]
RFC 3155 PILC - Links with Errors August 2001
Selective Acknowledgements (SACK) extend the benefit of Fast
Retransmit/Fast Recovery to situations where multiple segment losses
in the window need to be repaired more quickly than can be
accomplished by executing Fast Retransmit for each segment loss, only
to discover the next segment loss.
These mechanisms are not limited to wireless environments. They are
usable in all environments.
4.0 Topics For Further Work
"Limited Transmit" [RFC3042] has been specified as an optimization
extending Fast Retransmit/Fast Recovery for TCP connections with
small congestion windows that will not trigger three duplicate
acknowledgments. This specification is deemed safe, and it also
provides benefits for TCP connections that experience a large amount
of packet (data or ACK) loss. Implementors should evaluate this
standards track specification for TCP in loss environments.
Delayed Duplicate Acknowledgements [MV97, VMPM99] attempts to prevent
TCP-level retransmission when link-level retransmission is still in
progress, adding additional traffic to the network. This proposal is
worthy of additional study, but is not recommended at this time,
because we don't know how to calculate appropriate amounts of delay
for an arbitrary network topology.
It is not possible to use explicit congestion notification [RFC2481]
as a surrogate for explicit transmission error notification (no
matter how much we wish it was!). Some mechanism to provide explicit
notification of transmission error would be very helpful. This might
be more easily provided in a PEP environment, especially when the PEP
is the "first hop" in a connection path, because current checksum
mechanisms do not distinguish between transmission error to a payload
and transmission error to the header. Furthermore, if the header is
damaged, sending explicit transmission error notification to the
right endpoint is problematic.
Losses that take place on the ACK stream, especially while a TCP is
learning network characteristics, can make the data stream quite
bursty (resulting in losses on the data stream, as well). Several
ways of limiting this burstiness have been proposed, including TCP
transmit pacing at the sender and ACK rate control within the
network.
"Appropriate Byte Counting" (ABC) [ALL99], has been proposed as a way
of opening the congestion window based on the number of bytes that
have been successfully transferred to the receiver, giving more
appropriate behavior for application protocols that initiate
Dawkins, et al. Best Current Practice [Page 9]
RFC 3155 PILC - Links with Errors August 2001
connections with relatively short packets. For SMTP [RFC2821], for
instance, the client might send a short HELO packet, a short MAIL
packet, one or more short RCPT packets, and a short DATA packet -
followed by the entire mail body sent as maximum-length packets. An
ABC TCP sender would not use ACKs for each of these short packets to
increase the congestion window to allow additional full-length
packets. ABC is worthy of additional study, but is not recommended
at this time, because ABC can lead to increased burstiness when
acknowledgments are lost.
4.1 Achieving, and maintaining, large windows
The recommendations described in this document will aid TCPs in
injecting packets into ERRORed connections as fast as possible
without destabilizing the Internet, and so optimizing the use of
available bandwidth.
In addition to these TCP-level recommendations, there is still
additional work to do at the application level, especially with the
dominant application protocol on the World Wide Web, HTTP.
HTTP/1.0 (and earlier versions) closes TCP connections to signal a
receiver that all of a requested resource had been transmitted.
Because WWW objects tend to be small in size [MOGUL], TCPs carrying
HTTP/1.0 traffic experience difficulty in "training" on available
path capacity (a substantial portion of the transfer has already
happened by the time TCP exits slow start).
Several HTTP modifications have been introduced to improve this
interaction with TCP ("persistent connections" in HTTP/1.0, with
improvements in HTTP/1.1 [RFC2616]). For a variety of reasons, many
HTTP interactions are still HTTP/1.0-style - relatively short-lived.
Proposals which reuse TCP congestion information across connections,
like TCP Control Block Interdependence [RFC2140], or the more recent
Congestion Manager [BS00] proposal, will have the effect of making
multiple parallel connections impact the network as if they were a
single connection, "trained" after a single startup transient. These
proposals are critical to the long-term stability of the Internet,
because today's users always have the choice of clicking on the
"reload" button in their browsers and cutting off TCP's exponential
backoff - replacing connections which are building knowledge of the
available bandwidth with connections with no knowledge at all.
Dawkins, et al. Best Current Practice [Page 10]
RFC 3155 PILC - Links with Errors August 2001
5.0 Security Considerations
A potential vulnerability introduced by Fast Retransmit/Fast Recovery
is (as pointed out in [RFC2581]) that an attacker may force TCP
connections to grind to a halt, or, more dangerously, behave more
aggressively. The latter possibility may lead to congestion
collapse, at least in some regions of the network.
Selective acknowledgments are believed to neither strengthen nor
weaken TCP's current security properties [RFC2018].
Given that the recommendations in this document are performed on an
end-to-end basis, they continue working even in the presence of end-
to-end IPsec. This is in direct contrast with mechanisms such as
PEPs, which are implemented in intermediate nodes (section 1.2).
6.0 IANA Considerations
This document is a pointer to other, existing IETF standards. There
are no new IANA considerations.
7.0 Acknowledgements
This recommendation has grown out of RFC 2757, "Long Thin Networks",
which was in turn based on work done in the IETF TCPSAT working
group. The authors are indebted to the active members of the PILC
working group. In particular, Mark Allman and Lloyd Wood gave us
copious and insightful feedback, and Dan Grossman and Jamshid Mahdavi
provided text replacements.
References
[ALL99] M. Allman, "TCP Byte Counting Refinements," ACM Computer
Communication Review, Volume 29, Number 3, July 1999.
http://www.acm.org/sigcomm/ccr/archive/ccr-toc/ccr-toc-
99.html
[BS00] Balakrishnan, H. and S. Seshan, "The Congestion Manager",
RFC 3124, June 2001.
[BV97] S. Biaz and N. Vaidya, "Using End-to-end Statistics to
Distinguish Congestion and Corruption Losses: A Negative
Result," Texas A&M University, Technical Report 97-009,
August 18, 1997.
Dawkins, et al. Best Current Practice [Page 11]
RFC 3155 PILC - Links with Errors August 2001
[BV98] S. Biaz and N. Vaidya, "Sender-Based heuristics for
Distinguishing Congestion Losses from Wireless
Transmission Losses," Texas A&M University, Technical
Report 98-013, June 1998.
[BV98a] S. Biaz and N. Vaidya, "Discriminating Congestion Losses
from Wireless Losses using Inter-Arrival Times at the
Receiver," Texas A&M University, Technical Report 98-014,
June 1998.
[MOGUL] "The Case for Persistent-Connection HTTP", J. C. Mogul,
Research Report 95/4, May 1995. Available as
http://www.research.digital.com/wrl/techreports/abstracts/
95.4.html
[MV97] M. Mehta and N. Vaidya, "Delayed Duplicate-
Acknowledgements: A Proposal to Improve Performance of
TCP on Wireless Links," Texas A&M University, December 24,
1997. Available at
http://www.cs.tamu.edu/faculty/vaidya/mobile.html
[PILC-WEB] http://pilc.grc.nasa.gov/
[PFTK98] Padhye, J., Firoiu, V., Towsley, D. and J.Kurose, "TCP
Throughput: A simple model and its empirical validation",
SIGCOMM Symposium on Communications Architectures and
Protocols, August 1998.
[RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC
793, September 1981.
[RFC2821] Klensin, J., Editor, "Simple Mail Transfer Protocol", RFC
2821, April 2001.
[RFC1122] Braden, R., "Requirements for Internet Hosts --
Communication Layers", STD 3, RFC 1122, October 1989.
[RFC1191] Mogul, J. and S. Deering, "Path MTU Discovery", RFC 1191,
November 1990.
[RFC1323] Jacobson, V., Braden, R. and D. Borman. "TCP Extensions
for High Performance", RFC 1323, May 1992.
[RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow "TCP
Selective Acknowledgment Options", RFC 2018, October 1996.
[RFC2140] Touch, J., "TCP Control Block Interdependence", RFC 2140,
April 1997.
Dawkins, et al. Best Current Practice [Page 12]
RFC 3155 PILC - Links with Errors August 2001
[RFC2309] Braden, B., Clark, D., Crowcroft, J., Davie, B., Deering,
S., Estrin, D., Floyd, S., Jacobson, V., Minshall, G.,
Partridge, C., Peterson, L., Ramakrishnan, K., Shenker,
S., Wroclawski, J. and L. Zhang, "Recommendations on Queue
Management and Congestion Avoidance in the Internet", RFC
2309, April 1998.
[RFC2481] Ramakrishnan K. and S. Floyd, "A Proposal to add Explicit
Congestion Notification (ECN) to IP", RFC 2481, January
1999.
[RFC2488] Allman, M., Glover, D. and L. Sanchez. "Enhancing TCP Over
Satellite Channels using Standard Mechanisms", BCP 28, RFC
2488, January 1999.
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[RFC2582] Floyd, S. and T. Henderson, "The NewReno Modification to
TCP's Fast Recovery Algorithm", RFC 2582, April 1999.
[RFC2616] Fielding, R., Gettys, J., Mogul, J., Frystyk, H.,
Masinter, L., Leach P. and T. Berners-Lee, "Hypertext
Transfer Protocol -- HTTP/1.1", RFC 2616, June 1999.
[RFC2861] Handley, M., Padhye, J. and S. Floyd, "TCP Congestion
Window Validation", RFC 2861, June 2000.
[RFC2883] Floyd, S., Mahdavi, J., Mathis, M. and M. Podolsky, "An
Extension to the Selective Acknowledgement (SACK) Option
for TCP", RFC 2883, August 1999.
[RFC2923] Lahey, K., "TCP Problems with Path MTU Discovery", RFC
2923, September 2000.
[RFC3042] Allman, M., Balakrishnan, H. and S. Floyd, "Enhancing
TCP's Loss Recovery Using Limited Transmit", RFC 3042,
January, 2001.
[RFC3135] Border, J., Kojo, M., Griner, J., Montenegro, G. and Z.
Shelby, "Performance Enhancing Proxies Intended to
Mitigate Link-Related Degradations", RFC 3135, June 2001.
[VJ-DCAC] Jacobson, V., "Dynamic Congestion Avoidance / Control" e-
mail dated February 11, 1988, available from
http://www.kohala.com/~rstevens/vanj.88feb11.txt
Dawkins, et al. Best Current Practice [Page 13]
RFC 3155 PILC - Links with Errors August 2001
[VMPM99] N. Vaidya, M. Mehta, C. Perkins, and G. Montenegro,
"Delayed Duplicate Acknowledgements: A TCP-Unaware
Approach to Improve Performance of TCP over Wireless,"
Technical Report 99-003, Computer Science Dept., Texas A&M
University, February 1999. Also, to appear in Journal of
Wireless Communications and Wireless Computing (Special
Issue on Reliable Transport Protocols for Mobile
Computing).
Authors' Addresses
Questions about this document may be directed to:
Spencer Dawkins
Fujitsu Network Communications
2801 Telecom Parkway
Richardson, Texas 75082
Phone: +1-972-479-3782
EMail: spencer.dawkins@fnc.fujitsu.com
Gabriel E. Montenegro
Sun Microsystems
Laboratories, Europe
29, chemin du Vieux Chene
38240 Meylan
FRANCE
Phone: +33 476 18 80 45
EMail: gab@sun.com
Markku Kojo
Department of Computer Science
University of Helsinki
P.O. Box 26 (Teollisuuskatu 23)
FIN-00014 HELSINKI
Finland
Phone: +358-9-1914-4179
EMail: kojo@cs.helsinki.fi
Dawkins, et al. Best Current Practice [Page 14]
RFC 3155 PILC - Links with Errors August 2001
Vincent Magret
Alcatel Internetworking, Inc.
26801 W. Agoura road
Calabasas, CA, 91301
Phone: +1 818 878 4485
EMail: vincent.magret@alcatel.com
Nitin H. Vaidya
458 Coordinated Science Laboratory, MC-228
1308 West Main Street
Urbana, IL 61801
Phone: 217-265-5414
E-mail: nhv@crhc.uiuc.edu
Dawkins, et al. Best Current Practice [Page 15]
RFC 3155 PILC - Links with Errors August 2001
Full Copyright Statement
Copyright (C) The Internet Society (2001). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Dawkins, et al. Best Current Practice [Page 16]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,843 @@
Network Working Group M. Allman
Request for Comments: 3390 BBN/NASA GRC
Obsoletes: 2414 S. Floyd
Updates: 2581 ICIR
Category: Standards Track C. Partridge
BBN Technologies
October 2002
Increasing TCP's Initial Window
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2002). All Rights Reserved.
Abstract
This document specifies an optional standard for TCP to increase the
permitted initial window from one or two segment(s) to roughly 4K
bytes, replacing RFC 2414. It discusses the advantages and
disadvantages of the higher initial window, and includes discussion
of experiments and simulations showing that the higher initial window
does not lead to congestion collapse. Finally, this document
provides guidance on implementation issues.
Terminology
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
document are to be interpreted as described in RFC 2119 [RFC2119].
1. TCP Modification
This document obsoletes [RFC2414] and updates [RFC2581] and specifies
an increase in the permitted upper bound for TCP's initial window
from one or two segment(s) to between two and four segments. In most
cases, this change results in an upper bound on the initial window of
roughly 4K bytes (although given a large segment size, the permitted
initial window of two segments may be significantly larger than 4K
bytes).
Allman, et. al. Standards Track [Page 1]
RFC 3390 Increasing TCP's Initial Window October 2002
The upper bound for the initial window is given more precisely in
(1):
min (4*MSS, max (2*MSS, 4380 bytes)) (1)
Note: Sending a 1500 byte packet indicates a maximum segment size
(MSS) of 1460 bytes (assuming no IP or TCP options). Therefore,
limiting the initial window to 4380 bytes allows the sender to
transmit three segments initially in the common case when using 1500
byte packets.
Equivalently, the upper bound for the initial window size is based on
the MSS, as follows:
If (MSS <= 1095 bytes)
then win <= 4 * MSS;
If (1095 bytes < MSS < 2190 bytes)
then win <= 4380;
If (2190 bytes <= MSS)
then win <= 2 * MSS;
This increased initial window is optional: a TCP MAY start with a
larger initial window. However, we expect that most general-purpose
TCP implementations would choose to use the larger initial congestion
window given in equation (1) above.
This upper bound for the initial window size represents a change from
RFC 2581 [RFC2581], which specified that the congestion window be
initialized to one or two segments.
This change applies to the initial window of the connection in the
first round trip time (RTT) of data transmission following the TCP
three-way handshake. Neither the SYN/ACK nor its acknowledgment
(ACK) in the three-way handshake should increase the initial window
size above that outlined in equation (1). If the SYN or SYN/ACK is
lost, the initial window used by a sender after a correctly
transmitted SYN MUST be one segment consisting of MSS bytes.
TCP implementations use slow start in as many as three different
ways: (1) to start a new connection (the initial window); (2) to
restart transmission after a long idle period (the restart window);
and (3) to restart transmission after a retransmit timeout (the loss
window). The change specified in this document affects the value of
the initial window. Optionally, a TCP MAY set the restart window to
the minimum of the value used for the initial window and the current
value of cwnd (in other words, using a larger value for the restart
window should never increase the size of cwnd). These changes do NOT
change the loss window, which must remain 1 segment of MSS bytes (to
Allman, et. al. Standards Track [Page 2]
RFC 3390 Increasing TCP's Initial Window October 2002
permit the lowest possible window size in the case of severe
congestion).
2. Implementation Issues
When larger initial windows are implemented along with Path MTU
Discovery [RFC1191], and the MSS being used is found to be too large,
the congestion window `cwnd' SHOULD be reduced to prevent large
bursts of smaller segments. Specifically, `cwnd' SHOULD be reduced
by the ratio of the old segment size to the new segment size.
When larger initial windows are implemented along with Path MTU
Discovery [RFC1191], alternatives are to set the "Don't Fragment"
(DF) bit in all segments in the initial window, or to set the "Don't
Fragment" (DF) bit in one of the segments. It is an open question as
to which of these two alternatives is best; we would hope that
implementation experiences will shed light on this question. In the
first case of setting the DF bit in all segments, if the initial
packets are too large, then all of the initial packets will be
dropped in the network. In the second case of setting the DF bit in
only one segment, if the initial packets are too large, then all but
one of the initial packets will be fragmented in the network. When
the second case is followed, setting the DF bit in the last segment
in the initial window provides the least chance for needless
retransmissions when the initial segment size is found to be too
large, because it minimizes the chances of duplicate ACKs triggering
a Fast Retransmit. However, more attention needs to be paid to the
interaction between larger initial windows and Path MTU Discovery.
The larger initial window specified in this document is not intended
as encouragement for web browsers to open multiple simultaneous TCP
connections, all with large initial windows. When web browsers open
simultaneous TCP connections to the same destination, they are
working against TCP's congestion control mechanisms [FF99],
regardless of the size of the initial window. Combining this
behavior with larger initial windows further increases the unfairness
to other traffic in the network. We suggest the use of HTTP/1.1
[RFC2068] (persistent TCP connections and pipelining) as a way to
achieve better performance of web transfers.
3. Advantages of Larger Initial Windows
1. When the initial window is one segment, a receiver employing
delayed ACKs [RFC1122] is forced to wait for a timeout before
generating an ACK. With an initial window of at least two
segments, the receiver will generate an ACK after the second data
segment arrives. This eliminates the wait on the timeout (often
up to 200 msec, and possibly up to 500 msec [RFC1122]).
Allman, et. al. Standards Track [Page 3]
RFC 3390 Increasing TCP's Initial Window October 2002
2. For connections transmitting only a small amount of data, a
larger initial window reduces the transmission time (assuming at
most moderate segment drop rates). For many email (SMTP [Pos82])
and web page (HTTP [RFC1945, RFC2068]) transfers that are less
than 4K bytes, the larger initial window would reduce the data
transfer time to a single RTT.
3. For connections that will be able to use large congestion
windows, this modification eliminates up to three RTTs and a
delayed ACK timeout during the initial slow-start phase. This
will be of particular benefit for high-bandwidth large-
propagation-delay TCP connections, such as those over satellite
links.
4. Disadvantages of Larger Initial Windows for the Individual
Connection
In high-congestion environments, particularly for routers that have a
bias against bursty traffic (as in the typical Drop Tail router
queues), a TCP connection can sometimes be better off starting with
an initial window of one segment. There are scenarios where a TCP
connection slow-starting from an initial window of one segment might
not have segments dropped, while a TCP connection starting with an
initial window of four segments might experience unnecessary
retransmits due to the inability of the router to handle small
bursts. This could result in an unnecessary retransmit timeout. For
a large-window connection that is able to recover without a
retransmit timeout, this could result in an unnecessarily-early
transition from the slow-start to the congestion-avoidance phase of
the window increase algorithm. These premature segment drops are
unlikely to occur in uncongested networks with sufficient buffering
or in moderately-congested networks where the congested router uses
active queue management (such as Random Early Detection [FJ93,
RFC2309]).
Some TCP connections will receive better performance with the larger
initial window even if the burstiness of the initial window results
in premature segment drops. This will be true if (1) the TCP
connection recovers from the segment drop without a retransmit
timeout, and (2) the TCP connection is ultimately limited to a small
congestion window by either network congestion or by the receiver's
advertised window.
5. Disadvantages of Larger Initial Windows for the Network
In terms of the potential for congestion collapse, we consider two
separate potential dangers for the network. The first danger would
be a scenario where a large number of segments on congested links
Allman, et. al. Standards Track [Page 4]
RFC 3390 Increasing TCP's Initial Window October 2002
were duplicate segments that had already been received at the
receiver. The second danger would be a scenario where a large number
of segments on congested links were segments that would be dropped
later in the network before reaching their final destination.
In terms of the negative effect on other traffic in the network, a
potential disadvantage of larger initial windows would be that they
increase the general packet drop rate in the network. We discuss
these three issues below.
Duplicate segments:
As described in the previous section, the larger initial window
could occasionally result in a segment dropped from the initial
window, when that segment might not have been dropped if the
sender had slow-started from an initial window of one segment.
However, Appendix A shows that even in this case, the larger
initial window would not result in the transmission of a large
number of duplicate segments.
Segments dropped later in the network:
How much would the larger initial window for TCP increase the
number of segments on congested links that would be dropped
before reaching their final destination? This is a problem that
can only occur for connections with multiple congested links,
where some segments might use scarce bandwidth on the first
congested link along the path, only to be dropped later along the
path.
First, many of the TCP connections will have only one congested
link along the path. Segments dropped from these connections do
not "waste" scarce bandwidth, and do not contribute to congestion
collapse.
However, some network paths will have multiple congested links,
and segments dropped from the initial window could use scarce
bandwidth along the earlier congested links before ultimately
being dropped on subsequent congested links. To the extent that
the drop rate is independent of the initial window used by TCP
segments, the problem of congested links carrying segments that
will be dropped before reaching their destination will be similar
for TCP connections that start by sending four segments or one
segment.
Allman, et. al. Standards Track [Page 5]
RFC 3390 Increasing TCP's Initial Window October 2002
An increased packet drop rate:
For a network with a high segment drop rate, increasing the TCP
initial window could increase the segment drop rate even further.
This is in part because routers with Drop Tail queue management
have difficulties with bursty traffic in times of congestion.
However, given uncorrelated arrivals for TCP connections, the
larger TCP initial window should not significantly increase the
segment drop rate. Simulation-based explorations of these issues
are discussed in Section 7.2.
These potential dangers for the network are explored in simulations
and experiments described in the section below. Our judgment is that
while there are dangers of congestion collapse in the current
Internet (see [FF99] for a discussion of the dangers of congestion
collapse from an increased deployment of UDP connections without
end-to-end congestion control), there is no such danger to the
network from increasing the TCP initial window to 4K bytes.
6. Interactions with the Retransmission Timer
Using a larger initial burst of data can exacerbate existing problems
with spurious retransmit timeouts on low-bandwidth paths, assuming
the standard algorithm for determining the TCP retransmission timeout
(RTO) [RFC2988]. The problem is that across low-bandwidth network
paths on which the transmission time of a packet is a large portion
of the round-trip time, the small packets used to establish a TCP
connection do not seed the RTO estimator appropriately. When the
first window of data packets is transmitted, the sender's retransmit
timer could expire before the acknowledgments for those packets are
received. As each acknowledgment arrives, the retransmit timer is
generally reset. Thus, the retransmit timer will not expire as long
as an acknowledgment arrives at least once a second, given the one-
second minimum on the RTO recommended in RFC 2988.
For instance, consider a 9.6 Kbps link. The initial RTT measurement
will be on the order of 67 msec, if we simply consider the
transmission time of 2 packets (the SYN and SYN-ACK), each consisting
of 40 bytes. Using the RTO estimator given in [RFC2988], this yields
an initial RTO of 201 msec (67 + 4*(67/2)). However, we round the
RTO to 1 second as specified in RFC 2988. Then assume we send an
initial window of one or more 1500-byte packets (1460 data bytes plus
overhead). Each packet will take on the order of 1.25 seconds to
transmit. Therefore, the RTO will fire before the ACK for the first
packet returns, causing a spurious timeout. In this case, a larger
initial window of three or four packets exacerbates the problems
caused by this spurious timeout.
Allman, et. al. Standards Track [Page 6]
RFC 3390 Increasing TCP's Initial Window October 2002
One way to deal with this problem is to make the RTO algorithm more
conservative. During the initial window of data, for instance, the
RTO could be updated for each acknowledgment received. In addition,
if the retransmit timer expires for some packet lost in the first
window of data, we could leave the exponential-backoff of the
retransmit timer engaged until at least one valid RTT measurement,
that involves a data packet, is received.
Another method would be to refrain from taking an RTT sample during
connection establishment, leaving the default RTO in place until TCP
takes a sample from a data segment and the corresponding ACK. While
this method likely helps prevent spurious retransmits, it also may
slow the data transfer down if loss occurs before the RTO is seeded.
The use of limited transmit [RFC3042] to aid a TCP connection in
recovering from loss using fast retransmit rather than the RTO timer
mitigates the performance degradation caused by using the high
default RTO during the initial window of data transmission.
This specification leaves the decision about what to do (if anything)
with regards to the RTO, when using a larger initial window, to the
implementer. However, the RECOMMENDED approach is to refrain from
sampling the RTT during the three-way handshake, keeping the default
RTO in place until an RTT sample involving a data packet is taken.
In addition, it is RECOMMENDED that TCPs use limited transmit
[RFC3042].
7. Typical Levels of Burstiness for TCP Traffic.
Larger TCP initial windows would not dramatically increase the
burstiness of TCP traffic in the Internet today, because such traffic
is already fairly bursty. Bursts of two and three segments are
already typical of TCP [Flo97]; a delayed ACK (covering two
previously unacknowledged segments) received during congestion
avoidance causes the congestion window to slide and two segments to
be sent. The same delayed ACK received during slow start causes the
window to slide by two segments and then be incremented by one
segment, resulting in a three-segment burst. While not necessarily
typical, bursts of four and five segments for TCP are not rare.
Assuming delayed ACKs, a single dropped ACK causes the subsequent ACK
to cover four previously unacknowledged segments. During congestion
avoidance this leads to a four-segment burst, and during slow start a
five-segment burst is generated.
There are also changes in progress that reduce the performance
problems posed by moderate traffic bursts. One such change is the
deployment of higher-speed links in some parts of the network, where
a burst of 4K bytes can represent a small quantity of data. A second
change, for routers with sufficient buffering, is the deployment of
Allman, et. al. Standards Track [Page 7]
RFC 3390 Increasing TCP's Initial Window October 2002
queue management mechanisms such as RED, which is designed to be
tolerant of transient traffic bursts.
8. Simulations and Experimental Results
8.1 Studies of TCP Connections using the Larger Initial Window
This section surveys simulations and experiments that explore the
effect of larger initial windows on TCP connections. The first set
of experiments explores performance over satellite links. Larger
initial windows have been shown to improve the performance of TCP
connections over satellite channels [All97b]. In this study, an
initial window of four segments (512 byte MSS) resulted in throughput
improvements of up to 30% (depending upon transfer size). [KAGT98]
shows that the use of larger initial windows results in a decrease in
transfer time in HTTP tests over the ACTS satellite system. A study
involving simulations of a large number of HTTP transactions over
hybrid fiber coax (HFC) indicates that the use of larger initial
windows decreases the time required to load WWW pages [Nic98].
A second set of experiments explored TCP performance over dialup
modem links. In experiments over a 28.8 Kbps dialup channel [All97a,
AHO98], a four-segment initial window decreased the transfer time of
a 16KB file by roughly 10%, with no accompanying increase in the drop
rate. A simulation study [RFC2416] investigated the effects of using
a larger initial window on a host connected by a slow modem link and
a router with a 3 packet buffer. The study concluded that for the
scenario investigated, the use of larger initial windows was not
harmful to TCP performance.
Finally, [All00] illustrates that the percentage of connections at a
particular web server that experience loss in the initial window of
data transmission increases with the size of the initial congestion
window. However, the increase is in line with what would be expected
from sending a larger burst into the network.
8.2 Studies of Networks using Larger Initial Windows
This section surveys simulations and experiments investigating the
impact of the larger window on other TCP connections sharing the
path. Experiments in [All97a, AHO98] show that for 16 KB transfers
to 100 Internet hosts, four-segment initial windows resulted in a
small increase in the drop rate of 0.04 segments/transfer. While the
drop rate increased slightly, the transfer time was reduced by
roughly 25% for transfers using the four-segment (512 byte MSS)
initial window when compared to an initial window of one segment.
Allman, et. al. Standards Track [Page 8]
RFC 3390 Increasing TCP's Initial Window October 2002
A simulation study in [RFC2415] explores the impact of a larger
initial window on competing network traffic. In this investigation,
HTTP and FTP flows share a single congested gateway (where the number
of HTTP and FTP flows varies from one simulation set to another).
For each simulation set, the paper examines aggregate link
utilization and packet drop rates, median web page delay, and network
power for the FTP transfers. The larger initial window generally
resulted in increased throughput, slightly-increased packet drop
rates, and an increase in overall network power. With the exception
of one scenario, the larger initial window resulted in an increase in
the drop rate of less than 1% above the loss rate experienced when
using a one-segment initial window; in this scenario, the drop rate
increased from 3.5% with one-segment initial windows, to 4.5% with
four-segment initial windows. The overall conclusions were that
increasing the TCP initial window to three packets (or 4380 bytes)
helps to improve perceived performance.
Morris [Mor97] investigated larger initial windows in a highly
congested network with transfers of 20K in size. The loss rate in
networks where all TCP connections use an initial window of four
segments is shown to be 1-2% greater than in a network where all
connections use an initial window of one segment. This relationship
held in scenarios where the loss rates with one-segment initial
windows ranged from 1% to 11%. In addition, in networks where
connections used an initial window of four segments, TCP connections
spent more time waiting for the retransmit timer (RTO) to expire to
resend a segment than was spent using an initial window of one
segment. The time spent waiting for the RTO timer to expire
represents idle time when no useful work was being accomplished for
that connection. These results show that in a very congested
environment, where each connection's share of the bottleneck
bandwidth is close to one segment, using a larger initial window can
cause a perceptible increase in both loss rates and retransmit
timeouts.
9. Security Considerations
This document discusses the initial congestion window permitted for
TCP connections. Changing this value does not raise any known new
security issues with TCP.
10. Conclusion
This document specifies a small change to TCP that will likely be
beneficial to short-lived TCP connections and those over links with
long RTTs (saving several RTTs during the initial slow-start phase).
Allman, et. al. Standards Track [Page 9]
RFC 3390 Increasing TCP's Initial Window October 2002
11. Acknowledgments
We would like to acknowledge Vern Paxson, Tim Shepard, members of the
End-to-End-Interest Mailing List, and members of the IETF TCP
Implementation Working Group for continuing discussions of these
issues and for feedback on this document.
12. References
[AHO98] Mark Allman, Chris Hayes, and Shawn Ostermann, An
Evaluation of TCP with Larger Initial Windows, March 1998.
ACM Computer Communication Review, 28(3), July 1998. URL
"http://roland.lerc.nasa.gov/~mallman/papers/initwin.ps".
[All97a] Mark Allman. An Evaluation of TCP with Larger Initial
Windows. 40th IETF Meeting -- TCP Implementations WG.
December, 1997. Washington, DC.
[All97b] Mark Allman. Improving TCP Performance Over Satellite
Channels. Master's thesis, Ohio University, June 1997.
[All00] Mark Allman. A Web Server's View of the Transport Layer.
ACM Computer Communication Review, 30(5), October 2000.
[FF96] Fall, K., and Floyd, S., Simulation-based Comparisons of
Tahoe, Reno, and SACK TCP. Computer Communication Review,
26(3), July 1996.
[FF99] Sally Floyd, Kevin Fall. Promoting the Use of End-to-End
Congestion Control in the Internet. IEEE/ACM Transactions
on Networking, August 1999. URL
"http://www.icir.org/floyd/end2end-paper.html".
[FJ93] Floyd, S., and Jacobson, V., Random Early Detection
gateways for Congestion Avoidance. IEEE/ACM Transactions on
Networking, V.1 N.4, August 1993, p. 397-413.
[Flo94] Floyd, S., TCP and Explicit Congestion Notification.
Computer Communication Review, 24(5):10-23, October 1994.
[Flo96] Floyd, S., Issues of TCP with SACK. Technical report,
January 1996. Available from http://www-
nrg.ee.lbl.gov/floyd/.
[Flo97] Floyd, S., Increasing TCP's Initial Window. Viewgraphs,
40th IETF Meeting - TCP Implementations WG. December, 1997.
URL "ftp://ftp.ee.lbl.gov/talks/sf-tcp-ietf97.ps".
Allman, et. al. Standards Track [Page 10]
RFC 3390 Increasing TCP's Initial Window October 2002
[KAGT98] Hans Kruse, Mark Allman, Jim Griner, Diepchi Tran. HTTP
Page Transfer Rates Over Geo-Stationary Satellite Links.
March 1998. Proceedings of the Sixth International
Conference on Telecommunication Systems. URL
"http://roland.lerc.nasa.gov/~mallman/papers/nash98.ps".
[Mor97] Robert Morris. Private communication, 1997. Cited for
acknowledgement purposes only.
[Nic98] Kathleen Nichols. Improving Network Simulation With
Feedback, Proceedings of LCN 98, October 1998. URL
"http://www.computer.org/proceedings/lcn/8810/8810toc.htm".
[Pos82] Postel, J., "Simple Mail Transfer Protocol", STD 10, RFC
821, August 1982.
[RFC1122] Braden, R., "Requirements for Internet Hosts --
Communication Layers", STD 3, RFC 1122, October 1989.
[RFC1191] Mogul, J. and S. Deering, "Path MTU Discovery", RFC 1191,
November 1990.
[RFC1945] Berners-Lee, T., Fielding, R. and H. Nielsen, "Hypertext
Transfer Protocol -- HTTP/1.0", RFC 1945, May 1996.
[RFC2068] Fielding, R., Mogul, J., Gettys, J., Frystyk, H. and T.
Berners-Lee, "Hypertext Transfer Protocol -- HTTP/1.1", RFC
2068, January 1997.
[RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
Requirement Levels", BCP 14, RFC 2119, March 1997.
[RFC2309] Braden, B., Clark, D., Crowcroft, J., Davie, B., Deering,
S., Estrin, D., Floyd, S., Jacobson, V., Minshall, G.,
Partridge, C., Peterson, L., Ramakrishnan, K., Shenker, S.,
Wroclawski, J. and L. Zhang, "Recommendations on Queue
Management and Congestion Avoidance in the Internet", RFC
2309, April 1998.
[RFC2414] Allman, M., Floyd, S. and C. Partridge, "Increasing TCP's
Initial Window", RFC 2414, September 1998.
[RFC2415] Poduri, K. and K. Nichols, "Simulation Studies of Increased
Initial TCP Window Size", RFC 2415, September 1998.
[RFC2416] Shepard, T. and C. Partridge, "When TCP Starts Up With Four
Packets Into Only Three Buffers", RFC 2416, September 1998.
Allman, et. al. Standards Track [Page 11]
RFC 3390 Increasing TCP's Initial Window October 2002
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
[RFC2821] Klensin, J., "Simple Mail Transfer Protocol", RFC 2821,
April 2001.
[RFC2988] Paxson, V. and M. Allman, "Computing TCP's Retransmission
Timer", RFC 2988, November 2000.
[RFC3042] Allman, M., Balakrishnan, H. and S. Floyd, "Enhancing TCP's
Loss Recovery Using Limited Transmit", RFC 3042, January
2001.
[RFC3168] Ramakrishnan, K.K., Floyd, S. and D. Black, "The Addition
of Explicit Congestion Notification (ECN) to IP", RFC 3168,
September 2001.
Allman, et. al. Standards Track [Page 12]
RFC 3390 Increasing TCP's Initial Window October 2002
Appendix A - Duplicate Segments
In the current environment (without Explicit Congestion Notification
[Flo94] [RFC3168]), all TCPs use segment drops as indications from
the network about the limits of available bandwidth. We argue here
that the change to a larger initial window should not result in the
sender retransmitting a large number of duplicate segments that have
already arrived at the receiver.
If one segment is dropped from the initial window, there are three
different ways for TCP to recover: (1) Slow-starting from a window of
one segment, as is done after a retransmit timeout, or after Fast
Retransmit in Tahoe TCP; (2) Fast Recovery without selective
acknowledgments (SACK), as is done after three duplicate ACKs in Reno
TCP; and (3) Fast Recovery with SACK, for TCP where both the sender
and the receiver support the SACK option [MMFR96]. In all three
cases, if a single segment is dropped from the initial window, no
duplicate segments (i.e., segments that have already been received at
the receiver) are transmitted. Note that for a TCP sending four
512-byte segments in the initial window, a single segment drop will
not require a retransmit timeout, but can be recovered by using the
Fast Retransmit algorithm (unless the retransmit timer expires
prematurely). In addition, a single segment dropped from an initial
window of three segments might be repaired using the fast retransmit
algorithm, depending on which segment is dropped and whether or not
delayed ACKs are used. For example, dropping the first segment of a
three segment initial window will always require waiting for a
timeout, in the absence of Limited Transmit [RFC3042]. However,
dropping the third segment will always allow recovery via the fast
retransmit algorithm, as long as no ACKs are lost.
Next we consider scenarios where the initial window contains two to
four segments, and at least two of those segments are dropped. If
all segments in the initial window are dropped, then clearly no
duplicate segments are retransmitted, as the receiver has not yet
received any segments. (It is still a possibility that these dropped
segments used scarce bandwidth on the way to their drop point; this
issue was discussed in Section 5.)
When two segments are dropped from an initial window of three
segments, the sender will only send a duplicate segment if the first
two of the three segments were dropped, and the sender does not
receive a packet with the SACK option acknowledging the third
segment.
When two segments are dropped from an initial window of four
segments, an examination of the six possible scenarios (which we
don't go through here) shows that, depending on the position of the
Allman, et. al. Standards Track [Page 13]
RFC 3390 Increasing TCP's Initial Window October 2002
dropped packets, in the absence of SACK the sender might send one
duplicate segment. There are no scenarios in which the sender sends
two duplicate segments.
When three segments are dropped from an initial window of four
segments, then, in the absence of SACK, it is possible that one
duplicate segment will be sent, depending on the position of the
dropped segments.
The summary is that in the absence of SACK, there are some scenarios
with multiple segment drops from the initial window where one
duplicate segment will be transmitted. There are no scenarios in
which more than one duplicate segment will be transmitted. Our
conclusion is that the number of duplicate segments transmitted as a
result of a larger initial window should be small.
Author's Addresses
Mark Allman
BBN Technologies/NASA Glenn Research Center
21000 Brookpark Rd
MS 54-5
Cleveland, OH 44135
EMail: mallman@bbn.com
http://roland.lerc.nasa.gov/~mallman/
Sally Floyd
ICSI Center for Internet Research
1947 Center St, Suite 600
Berkeley, CA 94704
Phone: +1 (510) 666-2989
EMail: floyd@icir.org
http://www.icir.org/floyd/
Craig Partridge
BBN Technologies
10 Moulton St
Cambridge, MA 02138
EMail: craig@bbn.com
Allman, et. al. Standards Track [Page 14]
RFC 3390 Increasing TCP's Initial Window October 2002
Full Copyright Statement
Copyright (C) The Internet Society (2002). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Allman, et. al. Standards Track [Page 15]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,563 @@
Network Working Group M. Allman
Request for Comments: 3465 BBN/NASA GRC
Category: Experimental February 2003
TCP Congestion Control with Appropriate Byte Counting (ABC)
Status of this Memo
This memo defines an Experimental Protocol for the Internet
community. It does not specify an Internet standard of any kind.
Discussion and suggestions for improvement are requested.
Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2003). All Rights Reserved.
Abstract
This document proposes a small modification to the way TCP increases
its congestion window. Rather than the traditional method of
increasing the congestion window by a constant amount for each
arriving acknowledgment, the document suggests basing the increase on
the number of previously unacknowledged bytes each ACK covers. This
change improves the performance of TCP, as well as closes a security
hole TCP receivers can use to induce the sender into increasing the
sending rate too rapidly.
Terminology
Much of the language in this document is taken from [RFC2581].
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
document are to be interpreted as described in [RFC2119].
1 Introduction
This document proposes a modification to the algorithm for increasing
TCP's congestion window (cwnd) that improves both performance and
security. Rather than increasing a TCP's congestion window based on
the number of acknowledgments (ACKs) that arrive at the data sender
(per the current specification [RFC2581]), the congestion window is
increased based on the number of bytes acknowledged by the arriving
ACKs. The algorithm improves performance by mitigating the impact of
delayed ACKs on the growth of cwnd. At the same time, the algorithm
provides cwnd growth in direct relation to the probed capacity of a
Allman Experimental [Page 1]
RFC 3465 TCP Congestion Control with ABC February 2003
network path, therefore providing a more measured response to ACKs
that cover only small amounts of data (less than a full segment size)
than ACK counting. This more appropriate cwnd growth can both improve
performance and prevent inappropriate cwnd growth in
response to a misbehaving receiver. On the other hand, in some cases
the modified cwnd growth algorithm causes larger bursts of segments
to be sent into the network. In some cases this can lead to a non-
negligible increase in the drop rate and reduced performance (see
section 4 for a larger discussion of the issues).
This document is organized as follows. Section 2 outlines the
modified algorithm for increasing TCP's congestion window. Section 3
discusses the advantages of using the modified algorithm. Section 4
discusses the disadvantages of the approach outlined in this
document. Section 5 outlines some of the fairness issues that must
be considered for the modified algorithm. Section 6 discusses
security considerations.
Statement of Intent
This specification contains an algorithm improving the performance
of TCP which is understood to be effective and safe, but which has
not been widely deployed. One goal of publication as an
Experimental RFC is to be prudent, and encourage use and
deployment prior to publication in the standards track. It is the
intent of the Transport Area to re-submit this specification as an
IETF Proposed Standard in the future, after more experience has
been gained.
2 A Modified Algorithm for Increasing the Congestion Window
As originally outlined in [Jac88] and specified in [RFC2581], TCP
uses two algorithms for increasing the congestion window. During
steady-state, TCP uses the Congestion Avoidance algorithm to linearly
increase the value of cwnd. At the beginning of a transfer, after a
retransmission timeout or after a long idle period (in some
implementations), TCP uses the Slow Start algorithm to increase cwnd
exponentially. According to RFC 2581, slow start bases the cwnd
increase on the number of incoming acknowledgments. During
congestion avoidance RFC 2581 allows more latitude in increasing
cwnd, but traditionally implementations have based the increase on
the number of arriving ACKs. In the following two subsections, we
detail modifications to these algorithms to increase cwnd based on
the number of bytes being acknowledged by each arriving ACK, rather
than by the number of ACKs that arrive. We call these changes
"Appropriate Byte Counting" (ABC) [All99].
Allman Experimental [Page 2]
RFC 3465 TCP Congestion Control with ABC February 2003
2.1 Congestion Avoidance
RFC 2581 specifies that cwnd should be increased by 1 segment per
round-trip time (RTT) during the congestion avoidance phase of a
transfer. Traditionally, TCPs have approximated this increase by
increasing cwnd by 1/cwnd for each arriving ACK. This algorithm
opens cwnd by roughly 1 segment per RTT if the receiver ACKs each
incoming segment and no ACK loss occurs. However, if the receiver
implements delayed ACKs [Bra89], the receiver returns roughly half as
many ACKs, which causes the sender to open cwnd more conservatively
(by approximately 1 segment every second RTT). The approach that
this document suggests is to store the number of bytes that have been
ACKed in a "bytes_acked" variable in the TCP control block. When
bytes_acked becomes greater than or equal to the value of the
congestion window, bytes_acked is reduced by the value of cwnd.
Next, cwnd is incremented by a full-sized segment (SMSS). The
algorithm suggested above is specifically allowed by RFC 2581 during
congestion avoidance because it opens the window by at most 1 segment
per RTT.
2.2 Slow Start
RFC 2581 states that the sender increments the congestion window by
at most 1*SMSS bytes for each arriving acknowledgment during slow
start. This document proposes that a TCP sender SHOULD increase cwnd
by the number of previously unacknowledged bytes ACKed by each
incoming acknowledgment, provided the increase is not more than L
bytes. Choosing the limit on the increase, L, is discussed in the
next subsection. When the number of previously unacknowledged bytes
ACKed is less than or equal to 1*SMSS bytes, or L is less than or
equal to 1*SMSS bytes, this proposal is no more aggressive (and
possibly less aggressive) than allowed by RFC 2581. However,
increasing cwnd by more than 1*SMSS bytes in response to a single ACK
is more aggressive than allowed by RFC 2581. The more aggressive
version of the slow start algorithm still falls within the spirit of
the principles outlined in [Jac88] (i.e., of no more than doubling
the cwnd per RTT), and this document proposes ABC for experimentation
in shared networks, provided an appropriate limit is applied (see
next section).
2.3 Choosing the Limit
The limit, L, chosen for the cwnd increase during slow start,
controls the aggressiveness of the algorithm. Choosing L=1*SMSS
bytes provides behavior that is no more aggressive than allowed by
RFC 2581. However, ABC with L=1*SMSS bytes is more conservative in a
Allman Experimental [Page 3]
RFC 3465 TCP Congestion Control with ABC February 2003
number of key ways (as discussed in the next section) and therefore,
this document suggests that even though with L=1*SMSS bytes TCP
stacks will see little performance change, ABC SHOULD be used.
A very large L could potentially lead to large line-rate bursts of
traffic in the face of a large amount of ACK loss or in the case when
the receiver sends "stretch ACKs" (ACKs for more than the two full-
sized segments allowed by the delayed ACK algorithm) [Pax97].
This document specifies that TCP implementations MAY use L=2*SMSS
bytes and MUST NOT use L > 2*SMSS bytes. This choice balances
between being conservative (L=1*SMSS bytes) and being potentially
very aggressive. In addition, L=2*SMSS bytes exactly balances the
negative impact of the delayed ACK algorithm (as discussed in more
detail in section 3.2). Note that when L=2*SMSS bytes cwnd growth is
roughly the same as the case when the standard algorithms are used in
conjunction with a receiver that transmits an ACK for each incoming
segment [All98] (assuming no or small amounts of ACK loss in both
cases).
The exception to the above suggestion is during a slow start phase
that follows a retransmission timeout (RTO). In this situation, a
TCP MUST use L=1*SMSS as specified in RFC 2581 since ACKs for large
amounts of previously unacknowledged data are common during this
phase of a transfer. These ACKs do not necessarily indicate how much
data has left the network in the last RTT, and therefore ABC cannot
accurately determine how much to increase cwnd. As an example, say
segment N is dropped by the network, and segments N+1 and N+2 arrive
successfully at the receiver. The sender will receive only two
duplicate ACKs and therefore must rely on the retransmission timer
(RTO) to detect the loss. When the RTO expires, segment N is
retransmitted. The ACK sent in response to the retransmission will
be for segment N+2. However, this ACK does not indicate that three
segments have left the network in the last RTT, but rather only a
single segment left the network. Therefore, the appropriate cwnd
increment is at most 1*SMSS bytes.
2.4 RTO Implications
[Jac88] shows that increases in cwnd of more than a factor of two in
succeeding RTTs can cause spurious retransmissions on slow links
where the bandwidth dominates the RTT, assuming the RTO estimator
given in [Jac88] and [RFC2988]. ABC stays within this limit of no
more than doubling cwnd in successive RTTs by capping the increase
(no matter what L is employed) by the number of previously
unacknowledged bytes covered by each incoming ACK.
Allman Experimental [Page 4]
RFC 3465 TCP Congestion Control with ABC February 2003
3 Advantages
This section outlines several advantages of using the ABC algorithm
to increase cwnd, rather than the standard ACK counting algorithm
given in [RFC2581].
3.1 More Appropriate Congestion Window Increase
The ABC algorithm outlined in section 2 increases TCP's cwnd in
proportion to the amount of data actually sent into the network. ACK
counting, on the other hand, increments cwnd by a constant upon the
arrival of each ACK. For instance, consider an interactive
connection (e.g., ssh or telnet) in which ACKs generally cover only a
few bytes of data, but cwnd is increased by 1*SMSS bytes for each ACK
received. When a large amount of data needs to be transmitted (e.g.,
displaying a large file) the data is sent in one large burst because
the cwnd grows by 1*SMSS bytes per ACK rather than based on the
actual amount of capacity used. Such a line-rate burst of data can
potentially cause a large amount of segment loss.
Congestion Window Validation (CWV) [RFC2861] addresses the above
problem as well. CWV limits the amount of unused cwnd a TCP
connection can accumulate. ABC can be used in conjunction with CWV
to obtain an accurate measure of the network path.
3.2 Mitigate the Impact of Delayed ACKs and Lost ACKs
Delayed ACKs [RFC1122,RFC2581] allow a TCP receiver to refrain from
sending an ACK for each incoming segment. However, a receiver SHOULD
send an ACK for every second full-sized segment that arrives.
Furthermore, a receiver MUST NOT withhold an ACK for more than 500
ms. By reducing the number of ACKs sent to the data originator the
receiver is slowing the growth of the congestion window under an ACK
counting system. Using ABC with L=2*SMSS bytes can roughly negate
the negative impact imposed by delayed ACKs by allowing cwnd to be
increased for ACKs that are withheld by the receiver. This allows
the congestion window to grow in a manner similar to the case when
the receiver ACKs each incoming segment, but without adding extra
traffic to the network. Simulation studies have shown increased
throughput when a TCP sender uses ABC when compared to the standard
ACK counting algorithm [All99], especially for short transfers that
never leave the initial slow start period.
Note that delayed ACKs should not be an issue during slow start-based
loss recovery, as RFC 2581 recommends that receivers should not delay
ACKs that cover out-of-order segments. Therefore, as discussed
above, ABC with L > 1*SMSS bytes is inappropriate for such slow start
based loss recovery and MUST NOT be used.
Allman Experimental [Page 5]
RFC 3465 TCP Congestion Control with ABC February 2003
Note: In the case when an entire window of data is lost, a TCP
receiver will likely generate delayed ACKs and an L > 1*SMSS bytes
would be safe. However, detecting this scenario is difficult.
Therefore to keep ABC conservative, this document mandates that L
MUST NOT be > 1*SMSS bytes in any slow start-based loss recovery.
ACK loss can also retard the growth of a congestion window that
increases based on the number of ACKs that arrive. When counting
ACKs, dropped ACKs represent forever-missed opportunities to increase
cwnd. Using ABC with L > 1*SMSS bytes allows the sender to mitigate
the effect of lost ACKs.
3.3 Prevents Attacks from Misbehaving Receivers
[SCWA99] outlines several methods for a receiver to induce a TCP
sender into violating congestion control and transmitting data at a
potentially inappropriate rate. One of the outlined attacks is "ACK
Division". This scheme involves the receiver sending multiple ACKs
for each incoming data segment, each ACKing only a small portion of
the original TCP data segment. Since TCP senders have traditionally
used ACK counting to increase cwnd, ACK division causes
inappropriately rapid cwnd growth and, in turn, a potentially
inappropriate sending rate. A TCP sender that uses ABC can prevent
this attack from being used to undermine standard congestion control
because the cwnd increase is based on the number of bytes ACKed,
rather than the number of ACKs received.
To prevent misbehaving receivers from inducing inappropriate sender
behavior, this document suggests TCP implementations use ABC, even if
L=1*SMSS bytes (i.e., not allowing ABC to provide more aggressive
cwnd growth than allowed by RFC 2581).
4 Disadvantages
The main disadvantages of using ABC with L=2*SMSS bytes are an
increase in the burstiness of TCP and a small increase in the overall
loss rate. [All98] discusses the two ways that ABC increases the
burstiness of the TCP sender. First, the "micro burstiness" of the
connection is increased. In other words, the number of segments sent
in response to each incoming ACK is increased by at most 1 segment
when using ABC with L=2*SMSS bytes in conjunction with a receiver
that is sending delayed ACKs. During slow start this translates into
an increase from sending 2 back-to-back segments to sending 3 back-
to-back packets in response to an ACK for a single packet. Or, an
increase from 3 packets to 4 packets when receiving a delayed ACK for
two outstanding packets. Note that ACK loss can cause larger bursts.
However, ABC only increases the burst size by at most 1*SMSS bytes
per ACK received when compared to the standard behavior. This slight
Allman Experimental [Page 6]
RFC 3465 TCP Congestion Control with ABC February 2003
increase in the burstiness should only cause problems for devices
that have very small buffers. In addition, ABC increases the "macro
burstiness" of the TCP sender in response to delayed ACKs in slow
start. Rather than increasing cwnd by roughly 1.5 times per RTT, ABC
roughly doubles the congestion window every RTT. However, doubling
cwnd every RTT fits within the spirit of slow start, as originally
outlined [Jac88].
With the increased burstiness comes a modest increase in the loss
rate for a TCP connection employing ABC (see the next section for a
short discussion on the fairness of ABC to non-ABC flows). The
additional loss can be directly attributable to the increased
aggressiveness of ABC. During slow start cwnd is increased more
rapidly. Therefore when loss occurs cwnd is larger and more drops
are likely. Similarly, a congestion avoidance cycle takes roughly
half as long when using ABC and delayed ACKs when compared to an ACK
counting implementation. In other words, a TCP sender reaches the
capacity of the network path, drops a packet and reduces the
congestion window by half roughly twice as often when using ABC.
However, as discussed above, in spite of the additional loss an ABC
TCP sender generally obtains better overall performance than a non-
ABC TCP [All99].
Due to the increase in the packet drop rate we suggest ABC be
implemented in conjunction with selective acknowledgments [RFC2018].
5 Fairness Considerations
[All99] presents several simple simulations conducted to measure the
impact of ABC on competing traffic (both ABC and non-ABC). The
experiments show that while ABC increases the drop rate for the
connection using ABC, competing traffic is not greatly affected. The
experiments show that standard TCP and ABC both obtain roughly the
same throughput, regardless of the variant of the competing traffic.
The simulations also reaffirm that ABC outperforms non-ABC TCP in an
environment with varying types of TCP connections. On the other
hand, the simulations presented in [All99] are not necessarily
realistic. Therefore we are encouraging more experimentation in the
Internet.
6 Security Considerations
As discussed in section 3.3, ABC protects a TCP sender from a
misbehaving receiver that induces the sender into transmitting at an
inappropriate rate with an "ACK division" attack. This, in turn,
protects the network from an overly aggressive sender.
Allman Experimental [Page 7]
RFC 3465 TCP Congestion Control with ABC February 2003
7 Conclusions
This document RECOMMENDS that all TCP stacks be modified to use ABC
with L=1*SMSS bytes. This change does not increase the
aggressiveness of TCP. Furthermore, simulations of ABC with L=2*SMSS
bytes show a promising performance improvement that we encourage
researchers to experiment with in the Internet.
Acknowledgments
This document has benefited from discussions with and encouragement
from Sally Floyd. Van Jacobson and Reiner Ludwig provided valuable
input on the implications of byte counting on the RTO. Reiner Ludwig
and Kostas Pentikousis provided valuable feedback on a draft of this
document.
Normative References
[RFC1122] Braden, R., Ed., "Requirements for Internet Hosts --
Communication Layers", STD 3, RFC 1122, October 1989.
[RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
Requirement Levels", BCP 14, RFC 2119, March 1997.
[RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
Informative References
[All98] Mark Allman. On the Generation and Use of TCP
Acknowledgments. ACM Computer Communication Review, 29(3),
July 1998.
[All99] Mark Allman. TCP Byte Counting Refinements. ACM Computer
Communication Review, 29(3), July 1999.
[Jac88] Van Jacobson. Congestion Avoidance and Control. ACM
SIGCOMM 1988.
[Pax97] Vern Paxson. Automated Packet Trace Analysis of TCP
Implementations. ACM SIGCOMM, September 1997.
[RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP
Selective Acknowledgment Options", RFC 2018, October 1996.
[RFC2861] Handley, M., Padhye, J. and S. Floyd, "TCP Congestion
Window Validation", RFC 2861, June 2000.
Allman Experimental [Page 8]
RFC 3465 TCP Congestion Control with ABC February 2003
[SCWA99] Stefan Savage, Neal Cardwell, David Wetherall, Tom
Anderson. TCP Congestion Control with a Misbehaving
Receiver. ACM Computer Communication Review, 29(5),
October 1999.
Author's Address
Mark Allman
BBN Technologies/NASA Glenn Research Center
Lewis Field
21000 Brookpark Rd. MS 54-5
Cleveland, OH 44135
Fax: 216-433-8705
Phone: 216-433-6586
EMail: mallman@bbn.com
http://roland.grc.nasa.gov/~mallman
Allman Experimental [Page 9]
RFC 3465 TCP Congestion Control with ABC February 2003
Full Copyright Statement
Copyright (C) The Internet Society (2003). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Allman Experimental [Page 10]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,731 @@
Network Working Group E. Blanton
Request for Comments: 3517 Purdue University
Category: Standards Track M. Allman
BBN/NASA GRC
K. Fall
Intel Research
L. Wang
University of Kentucky
April 2003
A Conservative Selective Acknowledgment (SACK)-based
Loss Recovery Algorithm for TCP
Status of this Memo
This document specifies an Internet standards track protocol for the
Internet community, and requests discussion and suggestions for
improvements. Please refer to the current edition of the "Internet
Official Protocol Standards" (STD 1) for the standardization state
and status of this protocol. Distribution of this memo is unlimited.
Copyright Notice
Copyright (C) The Internet Society (2003). All Rights Reserved.
Abstract
This document presents a conservative loss recovery algorithm for TCP
that is based on the use of the selective acknowledgment (SACK) TCP
option. The algorithm presented in this document conforms to the
spirit of the current congestion control specification (RFC 2581),
but allows TCP senders to recover more effectively when multiple
segments are lost from a single flight of data.
Terminology
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
document are to be interpreted as described in BCP 14, RFC 2119
[RFC2119].
Blanton, et al. Standards Track [Page 1]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
1 Introduction
This document presents a conservative loss recovery algorithm for TCP
that is based on the use of the selective acknowledgment (SACK) TCP
option. While the TCP SACK [RFC2018] is being steadily deployed in
the Internet [All00], there is evidence that hosts are not using the
SACK information when making retransmission and congestion control
decisions [PF01]. The goal of this document is to outline one
straightforward method for TCP implementations to use SACK
information to increase performance.
[RFC2581] allows advanced loss recovery algorithms to be used by TCP
[RFC793] provided that they follow the spirit of TCP's congestion
control algorithms [RFC2581, RFC2914]. [RFC2582] outlines one such
advanced recovery algorithm called NewReno. This document outlines a
loss recovery algorithm that uses the SACK [RFC2018] TCP option to
enhance TCP's loss recovery. The algorithm outlined in this
document, heavily based on the algorithm detailed in [FF96], is a
conservative replacement of the fast recovery algorithm [Jac90,
RFC2581]. The algorithm specified in this document is a
straightforward SACK-based loss recovery strategy that follows the
guidelines set in [RFC2581] and can safely be used in TCP
implementations. Alternate SACK-based loss recovery methods can be
used in TCP as implementers see fit (as long as the alternate
algorithms follow the guidelines provided in [RFC2581]). Please
note, however, that the SACK-based decisions in this document (such
as what segments are to be sent at what time) are largely decoupled
from the congestion control algorithms, and as such can be treated as
separate issues if so desired.
2 Definitions
The reader is expected to be familiar with the definitions given in
[RFC2581].
The reader is assumed to be familiar with selective acknowledgments
as specified in [RFC2018].
For the purposes of explaining the SACK-based loss recovery algorithm
we define four variables that a TCP sender stores:
"HighACK" is the sequence number of the highest byte of data that
has been cumulatively ACKed at a given point.
"HighData" is the highest sequence number transmitted at a given
point.
Blanton, et al. Standards Track [Page 2]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
"HighRxt" is the highest sequence number which has been
retransmitted during the current loss recovery phase.
"Pipe" is a sender's estimate of the number of bytes outstanding
in the network. This is used during recovery for limiting the
sender's sending rate. The pipe variable allows TCP to use a
fundamentally different congestion control than specified in
[RFC2581]. The algorithm is often referred to as the "pipe
algorithm".
For the purposes of this specification we define a "duplicate
acknowledgment" as a segment that arrives with no data and an
acknowledgment (ACK) number that is equal to the current value of
HighACK, as described in [RFC2581].
We define a variable "DupThresh" that holds the number of duplicate
acknowledgments required to trigger a retransmission. Per [RFC2581]
this threshold is defined to be 3 duplicate acknowledgments.
However, implementers should consult any updates to [RFC2581] to
determine the current value for DupThresh (or method for determining
its value).
Finally, a range of sequence numbers [A,B] is said to "cover"
sequence number S if A <= S <= B.
3 Keeping Track of SACK Information
For a TCP sender to implement the algorithm defined in the next
section it must keep a data structure to store incoming selective
acknowledgment information on a per connection basis. Such a data
structure is commonly called the "scoreboard". The specifics of the
scoreboard data structure are out of scope for this document (as long
as the implementation can perform all functions required by this
specification).
Note that this document refers to keeping account of (marking)
individual octets of data transferred across a TCP connection. A
real-world implementation of the scoreboard would likely prefer to
manage this data as sequence number ranges. The algorithms presented
here allow this, but require arbitrary sequence number ranges to be
marked as having been selectively acknowledged.
Blanton, et al. Standards Track [Page 3]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
4 Processing and Acting Upon SACK Information
For the purposes of the algorithm defined in this document the
scoreboard SHOULD implement the following functions:
Update ():
Given the information provided in an ACK, each octet that is
cumulatively ACKed or SACKed should be marked accordingly in the
scoreboard data structure, and the total number of octets SACKed
should be recorded.
Note: SACK information is advisory and therefore SACKed data MUST
NOT be removed from TCP's retransmission buffer until the data is
cumulatively acknowledged [RFC2018].
IsLost (SeqNum):
This routine returns whether the given sequence number is
considered to be lost. The routine returns true when either
DupThresh discontiguous SACKed sequences have arrived above
'SeqNum' or (DupThresh * SMSS) bytes with sequence numbers greater
than 'SeqNum' have been SACKed. Otherwise, the routine returns
false.
SetPipe ():
This routine traverses the sequence space from HighACK to HighData
and MUST set the "pipe" variable to an estimate of the number of
octets that are currently in transit between the TCP sender and
the TCP receiver. After initializing pipe to zero the following
steps are taken for each octet 'S1' in the sequence space between
HighACK and HighData that has not been SACKed:
(a) If IsLost (S1) returns false:
Pipe is incremented by 1 octet.
The effect of this condition is that pipe is incremented for
packets that have not been SACKed and have not been determined
to have been lost (i.e., those segments that are still assumed
to be in the network).
(b) If S1 <= HighRxt:
Pipe is incremented by 1 octet.
Blanton, et al. Standards Track [Page 4]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
The effect of this condition is that pipe is incremented for
the retransmission of the octet.
Note that octets retransmitted without being considered lost are
counted twice by the above mechanism.
NextSeg ():
This routine uses the scoreboard data structure maintained by the
Update() function to determine what to transmit based on the SACK
information that has arrived from the data receiver (and hence
been marked in the scoreboard). NextSeg () MUST return the
sequence number range of the next segment that is to be
transmitted, per the following rules:
(1) If there exists a smallest unSACKed sequence number 'S2' that
meets the following three criteria for determining loss, the
sequence range of one segment of up to SMSS octets starting
with S2 MUST be returned.
(1.a) S2 is greater than HighRxt.
(1.b) S2 is less than the highest octet covered by any
received SACK.
(1.c) IsLost (S2) returns true.
(2) If no sequence number 'S2' per rule (1) exists but there
exists available unsent data and the receiver's advertised
window allows, the sequence range of one segment of up to SMSS
octets of previously unsent data starting with sequence number
HighData+1 MUST be returned.
(3) If the conditions for rules (1) and (2) fail, but there exists
an unSACKed sequence number 'S3' that meets the criteria for
detecting loss given in steps (1.a) and (1.b) above
(specifically excluding step (1.c)) then one segment of up to
SMSS octets starting with S3 MAY be returned.
Note that rule (3) is a sort of retransmission "last resort".
It allows for retransmission of sequence numbers even when the
sender has less certainty that a segment has been lost than with
rule (1). Retransmitting segments via rule (3) will help
sustain TCP's ACK clock and therefore can potentially help
avoid retransmission timeouts. However, in sending these
segments the sender has two copies of the same data considered
to be in the network (and also in the Pipe estimate). When an
ACK or SACK arrives covering this retransmitted segment, the
Blanton, et al. Standards Track [Page 5]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
sender cannot be sure exactly how much data left the network
(one of the two transmissions of the packet or both
transmissions of the packet). Therefore the sender may
underestimate Pipe by considering both segments to have left
the network when it is possible that only one of the two has.
We believe that the triggering of rule (3) will be rare and
that the implications are likely limited to corner cases
relative to the entire recovery algorithm. Therefore we leave
the decision of whether or not to use rule (3) to
implementors.
(4) If the conditions for each of (1), (2), and (3) are not met,
then NextSeg () MUST indicate failure, and no segment is
returned.
Note: The SACK-based loss recovery algorithm outlined in this
document requires more computational resources than previous TCP loss
recovery strategies. However, we believe the scoreboard data
structure can be implemented in a reasonably efficient manner (both
in terms of computation complexity and memory usage) in most TCP
implementations.
5 Algorithm Details
Upon the receipt of any ACK containing SACK information, the
scoreboard MUST be updated via the Update () routine.
Upon the receipt of the first (DupThresh - 1) duplicate ACKs, the
scoreboard is to be updated as normal. Note: The first and second
duplicate ACKs can also be used to trigger the transmission of
previously unsent segments using the Limited Transmit algorithm
[RFC3042].
When a TCP sender receives the duplicate ACK corresponding to
DupThresh ACKs, the scoreboard MUST be updated with the new SACK
information (via Update ()). If no previous loss event has occurred
on the connection or the cumulative acknowledgment point is beyond
the last value of RecoveryPoint, a loss recovery phase SHOULD be
initiated, per the fast retransmit algorithm outlined in [RFC2581].
The following steps MUST be taken:
(1) RecoveryPoint = HighData
When the TCP sender receives a cumulative ACK for this data octet
the loss recovery phase is terminated.
Blanton, et al. Standards Track [Page 6]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
(2) ssthresh = cwnd = (FlightSize / 2)
The congestion window (cwnd) and slow start threshold (ssthresh)
are reduced to half of FlightSize per [RFC2581].
(3) Retransmit the first data segment presumed dropped -- the segment
starting with sequence number HighACK + 1. To prevent repeated
retransmission of the same data, set HighRxt to the highest
sequence number in the retransmitted segment.
(4) Run SetPipe ()
Set a "pipe" variable to the number of outstanding octets
currently "in the pipe"; this is the data which has been sent by
the TCP sender but for which no cumulative or selective
acknowledgment has been received and the data has not been
determined to have been dropped in the network. It is assumed
that the data is still traversing the network path.
(5) In order to take advantage of potential additional available
cwnd, proceed to step (C) below.
Once a TCP is in the loss recovery phase the following procedure MUST
be used for each arriving ACK:
(A) An incoming cumulative ACK for a sequence number greater than
RecoveryPoint signals the end of loss recovery and the loss
recovery phase MUST be terminated. Any information contained in
the scoreboard for sequence numbers greater than the new value of
HighACK SHOULD NOT be cleared when leaving the loss recovery
phase.
(B) Upon receipt of an ACK that does not cover RecoveryPoint the
following actions MUST be taken:
(B.1) Use Update () to record the new SACK information conveyed
by the incoming ACK.
(B.2) Use SetPipe () to re-calculate the number of octets still
in the network.
(C) If cwnd - pipe >= 1 SMSS the sender SHOULD transmit one or more
segments as follows:
(C.1) The scoreboard MUST be queried via NextSeg () for the
sequence number range of the next segment to transmit (if any),
Blanton, et al. Standards Track [Page 7]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
and the given segment sent. If NextSeg () returns failure (no
data to send) return without sending anything (i.e., terminate
steps C.1 -- C.5).
(C.2) If any of the data octets sent in (C.1) are below HighData,
HighRxt MUST be set to the highest sequence number of the
retransmitted segment.
(C.3) If any of the data octets sent in (C.1) are above HighData,
HighData must be updated to reflect the transmission of
previously unsent data.
(C.4) The estimate of the amount of data outstanding in the
network must be updated by incrementing pipe by the number of
octets transmitted in (C.1).
(C.5) If cwnd - pipe >= 1 SMSS, return to (C.1)
5.1 Retransmission Timeouts
In order to avoid memory deadlocks, the TCP receiver is allowed to
discard data that has already been selectively acknowledged. As a
result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
information gathered from a receiver upon a retransmission timeout
"since the timeout might indicate that the data receiver has
reneged." Additionally, a TCP sender MUST "ignore prior SACK
information in determining which data to retransmit." However, a
SACK TCP sender SHOULD still use all SACK information made available
during the slow start phase of loss recovery following an RTO.
If an RTO occurs during loss recovery as specified in this document,
RecoveryPoint MUST be set to HighData. Further, the new value of
RecoveryPoint MUST be preserved and the loss recovery algorithm
outlined in this document MUST be terminated. In addition, a new
recovery phase (as described in section 5) MUST NOT be initiated
until HighACK is greater than or equal to the new value of
RecoveryPoint.
As described in Sections 4 and 5, Update () SHOULD continue to be
used appropriately upon receipt of ACKs. This will allow the slow
start recovery period to benefit from all available information
provided by the receiver, despite the fact that SACK information was
expunged due to the RTO.
If there are segments missing from the receiver's buffer following
processing of the retransmitted segment, the corresponding ACK will
contain SACK information. In this case, a TCP sender SHOULD use this
SACK information when determining what data should be sent in each
Blanton, et al. Standards Track [Page 8]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
segment of the slow start. The exact algorithm for this selection is
not specified in this document (specifically NextSeg () is
inappropriate during slow start after an RTO). A relatively
straightforward approach to "filling in" the sequence space reported
as missing should be reasonable.
6 Managing the RTO Timer
The standard TCP RTO estimator is defined in [RFC2988]. Due to the
fact that the SACK algorithm in this document can have an impact on
the behavior of the estimator, implementers may wish to consider how
the timer is managed. [RFC2988] calls for the RTO timer to be
re-armed each time an ACK arrives that advances the cumulative ACK
point. Because the algorithm presented in this document can keep the
ACK clock going through a fairly significant loss event,
(comparatively longer than the algorithm described in [RFC2581]), on
some networks the loss event could last longer than the RTO. In this
case the RTO timer would expire prematurely and a segment that need
not be retransmitted would be resent.
Therefore we give implementers the latitude to use the standard
[RFC2988] style RTO management; optionally, a more careful variant
that re-arms the RTO timer on each retransmission that is sent during
recovery MAY be used. This provides a more conservative timer than
specified in [RFC2988], and so may not always be an attractive
alternative. However, in some cases it may prevent needless
retransmissions, go-back-N transmission and further reduction of the
congestion window.
7 Research
The algorithm specified in this document is analyzed in [FF96], which
shows that the above algorithm is effective in reducing transfer time
over standard TCP Reno [RFC2581] when multiple segments are dropped
from a window of data (especially as the number of drops increases).
[AHKO97] shows that the algorithm defined in this document can
greatly improve throughput in connections traversing satellite
channels.
8 Security Considerations
The algorithm presented in this paper shares security considerations
with [RFC2581]. A key difference is that an algorithm based on SACKs
is more robust against attackers forging duplicate ACKs to force the
TCP sender to reduce cwnd. With SACKs, TCP senders have an
additional check on whether or not a particular ACK is legitimate.
While not fool-proof, SACK does provide some amount of protection in
this area.
Blanton, et al. Standards Track [Page 9]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
Acknowledgments
The authors wish to thank Sally Floyd for encouraging this document
and commenting on early drafts. The algorithm described in this
document is loosely based on an algorithm outlined by Kevin Fall and
Sally Floyd in [FF96], although the authors of this document assume
responsibility for any mistakes in the above text. Murali Bashyam,
Ken Calvert, Tom Henderson, Reiner Ludwig, Jamshid Mahdavi, Matt
Mathis, Shawn Ostermann, Vern Paxson and Venkat Venkatsubra provided
valuable feedback on earlier versions of this document. We thank
Matt Mathis and Jamshid Mahdavi for implementing the scoreboard in ns
and hence guiding our thinking in keeping track of SACK state.
The first author would like to thank Ohio University and the Ohio
University Internetworking Research Group for supporting the bulk of
his work on this project.
Normative References
[RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC
793, September 1981.
[RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP
Selective Acknowledgment Options", RFC 2018, October 1996.
[RFC2026] Bradner, S., "The Internet Standards Process -- Revision
3", BCP 9, RFC 2026, October 1996.
[RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
Requirement Levels", BCP 14, RFC 2119, March 1997.
[RFC2581] Allman, M., Paxson, V. and R. Stevens, "TCP Congestion
Control", RFC 2581, April 1999.
Informative References
[AHKO97] Mark Allman, Chris Hayes, Hans Kruse, Shawn Ostermann. TCP
Performance Over Satellite Links. Proceedings of the Fifth
International Conference on Telecommunications Systems,
Nashville, TN, March, 1997.
[All00] Mark Allman. A Web Server's View of the Transport Layer.
ACM Computer Communication Review, 30(5), October 2000.
[FF96] Kevin Fall and Sally Floyd. Simulation-based Comparisons
of Tahoe, Reno and SACK TCP. Computer Communication
Review, July 1996.
Blanton, et al. Standards Track [Page 10]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
[Jac90] Van Jacobson. Modified TCP Congestion Avoidance Algorithm.
Technical Report, LBL, April 1990.
[PF01] Jitendra Padhye, Sally Floyd. Identifying the TCP Behavior
of Web Servers, ACM SIGCOMM, August 2001.
[RFC2582] Floyd, S. and T. Henderson, "The NewReno Modification to
TCP's Fast Recovery Algorithm", RFC 2582, April 1999.
[RFC2914] Floyd, S., "Congestion Control Principles", BCP 41, RFC
2914, September 2000.
[RFC2988] Paxson, V. and M. Allman, "Computing TCP's Retransmission
Timer", RFC 2988, November 2000.
[RFC3042] Allman, M., Balakrishnan, H., and S. Floyd, "Enhancing TCP's
Loss Recovery Using Limited Transmit", RFC 3042, January
2001.
Intellectual Property Rights Notice
The IETF takes no position regarding the validity or scope of any
intellectual property or other rights that might be claimed to
pertain to the implementation or use of the technology described in
this document or the extent to which any license under such rights
might or might not be available; neither does it represent that it
has made any effort to identify any such rights. Information on the
IETF's procedures with respect to rights in standards-track and
standards-related documentation can be found in BCP-11. Copies of
claims of rights made available for publication and any assurances of
licenses to be made available, or the result of an attempt made to
obtain a general license or permission for the use of such
proprietary rights by implementors or users of this specification can
be obtained from the IETF Secretariat.
The IETF invites any interested party to bring to its attention any
copyrights, patents or patent applications, or other proprietary
rights which may cover technology that may be required to practice
this standard. Please address the information to the IETF Executive
Director.
Blanton, et al. Standards Track [Page 11]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
Authors' Addresses
Ethan Blanton
Purdue University Computer Sciences
1398 Computer Science Building
West Lafayette, IN 47907
EMail: eblanton@cs.purdue.edu
Mark Allman
BBN Technologies/NASA Glenn Research Center
Lewis Field
21000 Brookpark Rd. MS 54-5
Cleveland, OH 44135
Phone: 216-433-6586
Fax: 216-433-8705
EMail: mallman@bbn.com
http://roland.grc.nasa.gov/~mallman
Kevin Fall
Intel Research
2150 Shattuck Ave., PH Suite
Berkeley, CA 94704
EMail: kfall@intel-research.net
Lili Wang
Laboratory for Advanced Networking
210 Hardymon Building
University of Kentucky
Lexington, KY 40506-0495
EMail: lwang0@uky.edu
Blanton, et al. Standards Track [Page 12]
RFC 3517 SACK-based Loss Recovery for TCP April 2003
Full Copyright Statement
Copyright (C) The Internet Society (2003). All Rights Reserved.
This document and translations of it may be copied and furnished to
others, and derivative works that comment on or otherwise explain it
or assist in its implementation may be prepared, copied, published
and distributed, in whole or in part, without restriction of any
kind, provided that the above copyright notice and this paragraph are
included on all such copies and derivative works. However, this
document itself may not be modified in any way, such as by removing
the copyright notice or references to the Internet Society or other
Internet organizations, except as needed for the purpose of
developing Internet standards in which case the procedures for
copyrights defined in the Internet Standards process must be
followed, or as required to translate it into languages other than
English.
The limited permissions granted above are perpetual and will not be
revoked by the Internet Society or its successors or assigns.
This document and the information contained herein is provided on an
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Acknowledgement
Funding for the RFC Editor function is currently provided by the
Internet Society.
Blanton, et al. Standards Track [Page 13]

Some files were not shown because too many files have changed in this diff Show More