Doug / smarc-fsl-linux-kernel

1

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

20

21

#include <linux/mm.h>

21

#include <linux/mm.h>

22

#include <linux/module.h>

22

#include <linux/module.h>

23

#include <linux/sysctl.h>

23

#include <linux/sysctl.h>

24

#include <linux/workqueue.h>

24

#include <linux/workqueue.h>

25

#include <net/tcp.h>

25

#include <net/tcp.h>

26

#include <net/inet_common.h>

26

#include <net/inet_common.h>

27

#include <net/xfrm.h>

27

#include <net/xfrm.h>

28

29

#ifdef CONFIG_SYSCTL

29

#ifdef CONFIG_SYSCTL

30

#define SYNC_INIT 0 /* let the user enable it */

30

#define SYNC_INIT 0 /* let the user enable it */

31

#else

31

#else

32

#define SYNC_INIT 1

32

#define SYNC_INIT 1

33

#endif

33

#endif

34

35

int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;

35

int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;

36

EXPORT_SYMBOL(sysctl_tcp_syncookies);

36

EXPORT_SYMBOL(sysctl_tcp_syncookies);

37

38

int sysctl_tcp_abort_on_overflow __read_mostly;

38

int sysctl_tcp_abort_on_overflow __read_mostly;

39

40

struct inet_timewait_death_row tcp_death_row = {

40

struct inet_timewait_death_row tcp_death_row = {

41

.sysctl_max_tw_buckets = NR_FILE * 2,

41

.sysctl_max_tw_buckets = NR_FILE * 2,

42

.period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,

42

.period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,

43

.death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),

43

.death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),

44

.hashinfo = &tcp_hashinfo,

44

.hashinfo = &tcp_hashinfo,

45

.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,

45

.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,

46

(unsigned long)&tcp_death_row),

46

(unsigned long)&tcp_death_row),

47

.twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,

47

.twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,

48

inet_twdr_twkill_work),

48

inet_twdr_twkill_work),

49

/* Short-time timewait calendar */

49

/* Short-time timewait calendar */

50

51

.twcal_hand = -1,

51

.twcal_hand = -1,

52

.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,

52

.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,

53

(unsigned long)&tcp_death_row),

53

(unsigned long)&tcp_death_row),

54

};

54

};

55

56

EXPORT_SYMBOL_GPL(tcp_death_row);

56

EXPORT_SYMBOL_GPL(tcp_death_row);

57

58

/*
 * Check whether the segment [seq, end_seq) is acceptable for the window
 * [s_win, e_win).
 *
 * Returns non-zero when:
 *  - the segment starts exactly at the left window edge, or
 *  - it ends after the left edge and starts before the right edge, or
 *  - it is a zero-length segment sitting exactly on the right edge.
 * Returns 0 otherwise.  Comparisons go through before()/after().
 */
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return 1;
	if (after(end_seq, s_win) && before(seq, e_win))
		return 1;
	return (seq == e_win && seq == end_seq);
}

66

67

/*

67

/*

68

* * Main purpose of TIME-WAIT state is to close connection gracefully,

68

* * Main purpose of TIME-WAIT state is to close connection gracefully,

69

* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN

69

* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN

70

* (and, probably, tail of data) and one or more our ACKs are lost.

70

* (and, probably, tail of data) and one or more our ACKs are lost.

71

* * What is TIME-WAIT timeout? It is associated with maximal packet

71

* * What is TIME-WAIT timeout? It is associated with maximal packet

72

* lifetime in the internet, which results in wrong conclusion, that

72

* lifetime in the internet, which results in wrong conclusion, that

73

* it is set to catch "old duplicate segments" wandering out of their path.

73

* it is set to catch "old duplicate segments" wandering out of their path.

74

* It is not quite correct. This timeout is calculated so that it exceeds

74

* It is not quite correct. This timeout is calculated so that it exceeds

75

* maximal retransmission timeout enough to allow to lose one (or more)

75

* maximal retransmission timeout enough to allow to lose one (or more)

76

* segments sent by peer and our ACKs. This time may be calculated from RTO.

76

* segments sent by peer and our ACKs. This time may be calculated from RTO.

77

* * When TIME-WAIT socket receives RST, it means that another end

77

* * When TIME-WAIT socket receives RST, it means that another end

78

* finally closed and we are allowed to kill TIME-WAIT too.

78

* finally closed and we are allowed to kill TIME-WAIT too.

79

* * Second purpose of TIME-WAIT is catching old duplicate segments.

79

* * Second purpose of TIME-WAIT is catching old duplicate segments.

80

* Well, certainly it is pure paranoia, but if we load TIME-WAIT

80

* Well, certainly it is pure paranoia, but if we load TIME-WAIT

81

* with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.

81

* with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.

82

* * If we invented some more clever way to catch duplicates

82

* * If we invented some more clever way to catch duplicates

83

* (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.

83

* (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.

84

*

84

*

85

* The algorithm below is based on FORMAL INTERPRETATION of RFCs.

85

* The algorithm below is based on FORMAL INTERPRETATION of RFCs.

86

* When you compare it to RFCs, please, read section SEGMENT ARRIVES

86

* When you compare it to RFCs, please, read section SEGMENT ARRIVES

87

* from the very beginning.

87

* from the very beginning.

88

*

88

*

89

* NOTE. With recycling (and later with fin-wait-2) TW bucket

89

* NOTE. With recycling (and later with fin-wait-2) TW bucket

90

* is _not_ stateless. It means, that strictly speaking we must

90

* is _not_ stateless. It means, that strictly speaking we must

91

* spinlock it. I do not want! Well, probability of misbehaviour

91

* spinlock it. I do not want! Well, probability of misbehaviour

92

* is ridiculously low and, seems, we could use some mb() tricks

92

* is ridiculously low and, seems, we could use some mb() tricks

93

* to avoid misread sequence numbers, states etc. --ANK

93

* to avoid misread sequence numbers, states etc. --ANK

94

*/

94

*/

95

enum tcp_tw_status

95

enum tcp_tw_status

96

tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,

96

tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,

97

const struct tcphdr *th)

97

const struct tcphdr *th)

98

{

98

{

99

struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

99

struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

100

struct tcp_options_received tmp_opt;

100

struct tcp_options_received tmp_opt;

101

int paws_reject = 0;

101

int paws_reject = 0;

102

103

tmp_opt.saw_tstamp = 0;

103

tmp_opt.saw_tstamp = 0;

104

if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {

104

if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {

105

tcp_parse_options(skb, &tmp_opt, 0);

105

tcp_parse_options(skb, &tmp_opt, 0);

106

107

if (tmp_opt.saw_tstamp) {

107

if (tmp_opt.saw_tstamp) {

108

tmp_opt.ts_recent = tcptw->tw_ts_recent;

108

tmp_opt.ts_recent = tcptw->tw_ts_recent;

109

tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;

109

tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;

110

paws_reject = tcp_paws_reject(&tmp_opt, th->rst);

110

paws_reject = tcp_paws_reject(&tmp_opt, th->rst);

111

}

111

}

112

}

112

}

113

114

if (tw->tw_substate == TCP_FIN_WAIT2) {

114

if (tw->tw_substate == TCP_FIN_WAIT2) {

115

/* Just repeat all the checks of tcp_rcv_state_process() */

115

/* Just repeat all the checks of tcp_rcv_state_process() */

116

117

/* Out of window, send ACK */

117

/* Out of window, send ACK */

118

if (paws_reject ||

118

if (paws_reject ||

119

!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,

119

!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,

120

tcptw->tw_rcv_nxt,

120

tcptw->tw_rcv_nxt,

121

tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))

121

tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))

122

return TCP_TW_ACK;

122

return TCP_TW_ACK;

123

124

if (th->rst)

124

if (th->rst)

125

goto kill;

125

goto kill;

126

127

if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))

127

if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))

128

goto kill_with_rst;

128

goto kill_with_rst;

129

130

/* Dup ACK? */

130

/* Dup ACK? */

131

if (!th->ack ||

131

if (!th->ack ||

132

!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||

132

!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||

133

TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {

133

TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {

134

inet_twsk_put(tw);

134

inet_twsk_put(tw);

135

return TCP_TW_SUCCESS;

135

return TCP_TW_SUCCESS;

136

}

136

}

137

138

/* New data or FIN. If new data arrive after half-duplex close,

138

/* New data or FIN. If new data arrive after half-duplex close,

139

* reset.

139

* reset.

140

*/

140

*/

141

if (!th->fin ||

141

if (!th->fin ||

142

TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {

142

TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {

143

kill_with_rst:

143

kill_with_rst:

144

inet_twsk_deschedule(tw, &tcp_death_row);

144

inet_twsk_deschedule(tw, &tcp_death_row);

145

inet_twsk_put(tw);

145

inet_twsk_put(tw);

146

return TCP_TW_RST;

146

return TCP_TW_RST;

147

}

147

}

148

149

/* FIN arrived, enter true time-wait state. */

149

/* FIN arrived, enter true time-wait state. */

150

tw->tw_substate = TCP_TIME_WAIT;

150

tw->tw_substate = TCP_TIME_WAIT;

151

tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;

151

tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;

152

if (tmp_opt.saw_tstamp) {

152

if (tmp_opt.saw_tstamp) {

153

tcptw->tw_ts_recent_stamp = get_seconds();

153

tcptw->tw_ts_recent_stamp = get_seconds();

154

tcptw->tw_ts_recent = tmp_opt.rcv_tsval;

154

tcptw->tw_ts_recent = tmp_opt.rcv_tsval;

155

}

155

}

156

157

/* I am shamed, but failed to make it more elegant.

157

/* I am shamed, but failed to make it more elegant.

158

* Yes, it is direct reference to IP, which is impossible

158

* Yes, it is direct reference to IP, which is impossible

159

* to generalize to IPv6. Taking into account that IPv6

159

* to generalize to IPv6. Taking into account that IPv6

160

* do not understand recycling in any case, it not

160

* do not understand recycling in any case, it not

161

* a big problem in practice. --ANK */

161

* a big problem in practice. --ANK */

162

if (tw->tw_family == AF_INET &&

162

if (tw->tw_family == AF_INET &&

163

tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&

163

tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&

164

tcp_v4_tw_remember_stamp(tw))

164

tcp_v4_tw_remember_stamp(tw))

165

inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,

165

inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,

166

TCP_TIMEWAIT_LEN);

166

TCP_TIMEWAIT_LEN);

167

else

167

else

168

inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,

168

inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,

169

TCP_TIMEWAIT_LEN);

169

TCP_TIMEWAIT_LEN);

170

return TCP_TW_ACK;

170

return TCP_TW_ACK;

171

}

171

}

172

173

/*

173

/*

174

* Now real TIME-WAIT state.

174

* Now real TIME-WAIT state.

175

*

175

*

176

* RFC 1122:

176

* RFC 1122:

177

* "When a connection is [...] on TIME-WAIT state [...]

177

* "When a connection is [...] on TIME-WAIT state [...]

178

* [a TCP] MAY accept a new SYN from the remote TCP to

178

* [a TCP] MAY accept a new SYN from the remote TCP to

179

* reopen the connection directly, if it:

179

* reopen the connection directly, if it:

180

*

180

*

181

* (1) assigns its initial sequence number for the new

181

* (1) assigns its initial sequence number for the new

182

* connection to be larger than the largest sequence

182

* connection to be larger than the largest sequence

183

* number it used on the previous connection incarnation,

183

* number it used on the previous connection incarnation,

184

* and

184

* and

185

*

185

*

186

* (2) returns to TIME-WAIT state if the SYN turns out

186

* (2) returns to TIME-WAIT state if the SYN turns out

187

* to be an old duplicate".

187

* to be an old duplicate".

188

*/

188

*/

189

190

if (!paws_reject &&

190

if (!paws_reject &&

191

(TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&

191

(TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&

192

(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {

192

(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {

193

/* In window segment, it may be only reset or bare ack. */

193

/* In window segment, it may be only reset or bare ack. */

194

195

if (th->rst) {

195

if (th->rst) {

196

/* This is TIME_WAIT assassination, in two flavors.

196

/* This is TIME_WAIT assassination, in two flavors.

197

* Oh well... nobody has a sufficient solution to this

197

* Oh well... nobody has a sufficient solution to this

198

* protocol bug yet.

198

* protocol bug yet.

199

*/

199

*/

200

if (sysctl_tcp_rfc1337 == 0) {

200

if (sysctl_tcp_rfc1337 == 0) {

201

kill:

201

kill:

202

inet_twsk_deschedule(tw, &tcp_death_row);

202

inet_twsk_deschedule(tw, &tcp_death_row);

203

inet_twsk_put(tw);

203

inet_twsk_put(tw);

204

return TCP_TW_SUCCESS;

204

return TCP_TW_SUCCESS;

205

}

205

}

206

}

206

}

207

inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,

207

inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,

208

TCP_TIMEWAIT_LEN);

208

TCP_TIMEWAIT_LEN);

209

210

if (tmp_opt.saw_tstamp) {

210

if (tmp_opt.saw_tstamp) {

211

tcptw->tw_ts_recent = tmp_opt.rcv_tsval;

211

tcptw->tw_ts_recent = tmp_opt.rcv_tsval;

212

tcptw->tw_ts_recent_stamp = get_seconds();

212

tcptw->tw_ts_recent_stamp = get_seconds();

213

}

213

}

214

215

inet_twsk_put(tw);

215

inet_twsk_put(tw);

216

return TCP_TW_SUCCESS;

216

return TCP_TW_SUCCESS;

217

}

217

}

218

219

/* Out of window segment.

219

/* Out of window segment.

220

221

All the segments are ACKed immediately.

221

All the segments are ACKed immediately.

222

223

The only exception is new SYN. We accept it, if it is

223

The only exception is new SYN. We accept it, if it is

224

not old duplicate and we are not in danger to be killed

224

not old duplicate and we are not in danger to be killed

225

by delayed old duplicates. RFC check is that it has

225

by delayed old duplicates. RFC check is that it has

226

newer sequence number works at rates <40Mbit/sec.

226

newer sequence number works at rates <40Mbit/sec.

227

However, if paws works, it is reliable AND even more,

227

However, if paws works, it is reliable AND even more,

228

we even may relax silly seq space cutoff.

228

we even may relax silly seq space cutoff.

229

230

RED-PEN: we violate main RFC requirement, if this SYN will appear

230

RED-PEN: we violate main RFC requirement, if this SYN will appear

231

old duplicate (i.e. we receive RST in reply to SYN-ACK),

231

old duplicate (i.e. we receive RST in reply to SYN-ACK),

232

we must return socket to time-wait state. It is not good,

232

we must return socket to time-wait state. It is not good,

233

but not fatal yet.

233

but not fatal yet.

234

*/

234

*/

235

236

if (th->syn && !th->rst && !th->ack && !paws_reject &&

236

if (th->syn && !th->rst && !th->ack && !paws_reject &&

237

(after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||

237

(after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||

238

(tmp_opt.saw_tstamp &&

238

(tmp_opt.saw_tstamp &&

239

(s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {

239

(s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {

240

u32 isn = tcptw->tw_snd_nxt + 65535 + 2;

240

u32 isn = tcptw->tw_snd_nxt + 65535 + 2;

241

if (isn == 0)

241

if (isn == 0)

242

isn++;

242

isn++;

243

TCP_SKB_CB(skb)->when = isn;

243

TCP_SKB_CB(skb)->when = isn;

244

return TCP_TW_SYN;

244

return TCP_TW_SYN;

245

}

245

}

246

247

if (paws_reject)

247

if (paws_reject)

248

NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

248

NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

249

250

if (!th->rst) {

250

if (!th->rst) {

251

/* In this case we must reset the TIMEWAIT timer.

251

/* In this case we must reset the TIMEWAIT timer.

252

*

252

*

253

* If it is ACKless SYN it may be both old duplicate

253

* If it is ACKless SYN it may be both old duplicate

254

* and new good SYN with random sequence number <rcv_nxt.

254

* and new good SYN with random sequence number <rcv_nxt.

255

* Do not reschedule in the last case.

255

* Do not reschedule in the last case.

256

*/

256

*/

257

if (paws_reject || th->ack)

257

if (paws_reject || th->ack)

258

inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,

258

inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,

259

TCP_TIMEWAIT_LEN);

259

TCP_TIMEWAIT_LEN);

260

261

/* Send ACK. Note, we do not put the bucket,

261

/* Send ACK. Note, we do not put the bucket,

262

* it will be released by caller.

262

* it will be released by caller.

263

*/

263

*/

264

return TCP_TW_ACK;

264

return TCP_TW_ACK;

265

}

265

}

266

inet_twsk_put(tw);

266

inet_twsk_put(tw);

267

return TCP_TW_SUCCESS;

267

return TCP_TW_SUCCESS;

268

}

268

}

269

270

/*

270

/*

271

* Move a socket to time-wait or dead fin-wait-2 state.

271

* Move a socket to time-wait or dead fin-wait-2 state.

272

*/

272

*/

273

void tcp_time_wait(struct sock *sk, int state, int timeo)

273

void tcp_time_wait(struct sock *sk, int state, int timeo)

274

{

274

{

275

struct inet_timewait_sock *tw = NULL;

275

struct inet_timewait_sock *tw = NULL;

276

const struct inet_connection_sock *icsk = inet_csk(sk);

276

const struct inet_connection_sock *icsk = inet_csk(sk);

277

const struct tcp_sock *tp = tcp_sk(sk);

277

const struct tcp_sock *tp = tcp_sk(sk);

278

int recycle_ok = 0;

278

int recycle_ok = 0;

279

280

if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)

280

if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)

281

recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);

281

recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);

282

283

if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)

283

if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)

284

tw = inet_twsk_alloc(sk, state);

284

tw = inet_twsk_alloc(sk, state);

285

286

if (tw != NULL) {

286

if (tw != NULL) {

287

struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

287

struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

288

const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

288

const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

289

290

tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;

290

tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;

291

tcptw->tw_rcv_nxt = tp->rcv_nxt;

291

tcptw->tw_rcv_nxt = tp->rcv_nxt;

292

tcptw->tw_snd_nxt = tp->snd_nxt;

292

tcptw->tw_snd_nxt = tp->snd_nxt;

293

tcptw->tw_rcv_wnd = tcp_receive_window(tp);

293

tcptw->tw_rcv_wnd = tcp_receive_window(tp);

294

tcptw->tw_ts_recent = tp->rx_opt.ts_recent;

294

tcptw->tw_ts_recent = tp->rx_opt.ts_recent;

295

tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

295

tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

296

297

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)

297

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)

298

if (tw->tw_family == PF_INET6) {

298

if (tw->tw_family == PF_INET6) {

299

struct ipv6_pinfo *np = inet6_sk(sk);

299

struct ipv6_pinfo *np = inet6_sk(sk);

300

struct inet6_timewait_sock *tw6;

300

struct inet6_timewait_sock *tw6;

301

302

tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);

302

tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);

303

tw6 = inet6_twsk((struct sock *)tw);

303

tw6 = inet6_twsk((struct sock *)tw);

304

ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);

304

ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);

305

ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);

305

ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);

306

tw->tw_ipv6only = np->ipv6only;

306

tw->tw_ipv6only = np->ipv6only;

307

}

307

}

308

#endif

308

#endif

309

310

#ifdef CONFIG_TCP_MD5SIG

310

#ifdef CONFIG_TCP_MD5SIG

311

/*

311

/*

312

* The timewait bucket does not have the key DB from the

312

* The timewait bucket does not have the key DB from the

313

* sock structure. We just make a quick copy of the

313

* sock structure. We just make a quick copy of the

314

* md5 key being used (if indeed we are using one)

314

* md5 key being used (if indeed we are using one)

315

* so the timewait ack generating code has the key.

315

* so the timewait ack generating code has the key.

316

*/

316

*/

317

do {

317

do {

318

struct tcp_md5sig_key *key;

318

struct tcp_md5sig_key *key;

319

memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));

319

memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));

320

tcptw->tw_md5_keylen = 0;

320

tcptw->tw_md5_keylen = 0;

321

key = tp->af_specific->md5_lookup(sk, sk);

321

key = tp->af_specific->md5_lookup(sk, sk);

322

if (key != NULL) {

322

if (key != NULL) {

323

memcpy(&tcptw->tw_md5_key, key->key, key->keylen);

323

memcpy(&tcptw->tw_md5_key, key->key, key->keylen);

324

tcptw->tw_md5_keylen = key->keylen;

324

tcptw->tw_md5_keylen = key->keylen;

325

if (tcp_alloc_md5sig_pool(sk) == NULL)

325

if (tcp_alloc_md5sig_pool(sk) == NULL)

326

BUG();

326

BUG();

327

}

327

}

328

} while (0);

328

} while (0);

329

#endif

329

#endif

330

331

/* Linkage updates. */

331

/* Linkage updates. */

332

__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

332

__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

333

334

/* Get the TIME_WAIT timeout firing. */

334

/* Get the TIME_WAIT timeout firing. */

335

if (timeo < rto)

335

if (timeo < rto)

336

timeo = rto;

336

timeo = rto;

337

338

if (recycle_ok) {

338

if (recycle_ok) {

339

tw->tw_timeout = rto;

339

tw->tw_timeout = rto;

340

} else {

340

} else {

341

tw->tw_timeout = TCP_TIMEWAIT_LEN;

341

tw->tw_timeout = TCP_TIMEWAIT_LEN;

342

if (state == TCP_TIME_WAIT)

342

if (state == TCP_TIME_WAIT)

343

timeo = TCP_TIMEWAIT_LEN;

343

timeo = TCP_TIMEWAIT_LEN;

344

}

344

}

345

346

inet_twsk_schedule(tw, &tcp_death_row, timeo,

346

inet_twsk_schedule(tw, &tcp_death_row, timeo,

347

TCP_TIMEWAIT_LEN);

347

TCP_TIMEWAIT_LEN);

348

inet_twsk_put(tw);

348

inet_twsk_put(tw);

349

} else {

349

} else {

350

/* Sorry, if we're out of memory, just CLOSE this

350

/* Sorry, if we're out of memory, just CLOSE this

351

* socket up. We've got bigger problems than

351

* socket up. We've got bigger problems than

352

* non-graceful socket closings.

352

* non-graceful socket closings.

353

*/

353

*/

354

LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");

354

LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");

355

}

355

}

356

357

tcp_update_metrics(sk);

357

tcp_update_metrics(sk);

358

tcp_done(sk);

358

tcp_done(sk);

359

}

359

}

360

361

/*
 * Destructor for a TCP time-wait socket.
 *
 * With CONFIG_TCP_MD5SIG, calls tcp_free_md5sig_pool() when the bucket
 * holds an MD5 key (tw_md5_keylen != 0).  Without CONFIG_TCP_MD5SIG the
 * function is a no-op.
 */
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
	if (twsk->tw_md5_keylen)
		tcp_free_md5sig_pool();
#endif
}

EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

371

372

static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,

372

static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,

373

struct request_sock *req)

373

struct request_sock *req)

374

{

374

{

375

tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;

375

tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;

376

}

376

}

377

378

/* This is not only more efficient than what we used to do, it eliminates

378

/* This is not only more efficient than what we used to do, it eliminates

379

* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM

379

* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM

380

*

380

*

381

* Actually, we could lots of memory writes here. tp of listening

381

* Actually, we could lots of memory writes here. tp of listening

382

* socket contains all necessary default parameters.

382

* socket contains all necessary default parameters.

383

*/

383

*/

384

struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)

384

struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)

385

{

385

{

386

struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);

386

struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);

387

388

if (newsk != NULL) {

388

if (newsk != NULL) {

389

const struct inet_request_sock *ireq = inet_rsk(req);

389

const struct inet_request_sock *ireq = inet_rsk(req);

390

struct tcp_request_sock *treq = tcp_rsk(req);

390

struct tcp_request_sock *treq = tcp_rsk(req);

391

struct inet_connection_sock *newicsk = inet_csk(newsk);

391

struct inet_connection_sock *newicsk = inet_csk(newsk);

392

struct tcp_sock *newtp;

392

struct tcp_sock *newtp;

393

394

/* Now setup tcp_sock */

394

/* Now setup tcp_sock */

395

newtp = tcp_sk(newsk);

395

newtp = tcp_sk(newsk);

396

newtp->pred_flags = 0;

396

newtp->pred_flags = 0;

397

newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;

397

newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;

398

newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;

398

newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;

399

newtp->snd_up = treq->snt_isn + 1;

399

newtp->snd_up = treq->snt_isn + 1;

400

401

tcp_prequeue_init(newtp);

401

tcp_prequeue_init(newtp);

402

403

tcp_init_wl(newtp, treq->rcv_isn);

403

tcp_init_wl(newtp, treq->rcv_isn);

404

405

newtp->srtt = 0;

405

newtp->srtt = 0;

406

newtp->mdev = TCP_TIMEOUT_INIT;

406

newtp->mdev = TCP_TIMEOUT_INIT;

407

newicsk->icsk_rto = TCP_TIMEOUT_INIT;

407

newicsk->icsk_rto = TCP_TIMEOUT_INIT;

408

409

newtp->packets_out = 0;

409

newtp->packets_out = 0;

410

newtp->retrans_out = 0;

410

newtp->retrans_out = 0;

411

newtp->sacked_out = 0;

411

newtp->sacked_out = 0;

412

newtp->fackets_out = 0;

412

newtp->fackets_out = 0;

413

newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;

413

newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;

414

415

/* So many TCP implementations out there (incorrectly) count the

415

/* So many TCP implementations out there (incorrectly) count the

416

* initial SYN frame in their delayed-ACK and congestion control

416

* initial SYN frame in their delayed-ACK and congestion control

417

* algorithms that we must have the following bandaid to talk

417

* algorithms that we must have the following bandaid to talk

418

* efficiently to them. -DaveM

418

* efficiently to them. -DaveM

419

*/

419

*/

420

newtp->snd_cwnd = 2;

420

newtp->snd_cwnd = 2;

421

newtp->snd_cwnd_cnt = 0;

421

newtp->snd_cwnd_cnt = 0;

422

newtp->bytes_acked = 0;

422

newtp->bytes_acked = 0;

423

424

newtp->frto_counter = 0;

424

newtp->frto_counter = 0;

425

newtp->frto_highmark = 0;

425

newtp->frto_highmark = 0;

426

427

newicsk->icsk_ca_ops = &tcp_init_congestion_ops;

427

newicsk->icsk_ca_ops = &tcp_init_congestion_ops;

428

429

tcp_set_ca_state(newsk, TCP_CA_Open);

429

tcp_set_ca_state(newsk, TCP_CA_Open);

430

tcp_init_xmit_timers(newsk);

430

tcp_init_xmit_timers(newsk);

431

skb_queue_head_init(&newtp->out_of_order_queue);

431

skb_queue_head_init(&newtp->out_of_order_queue);

432

newtp->write_seq = treq->snt_isn + 1;

432

newtp->write_seq = treq->snt_isn + 1;

433

newtp->pushed_seq = newtp->write_seq;

433

newtp->pushed_seq = newtp->write_seq;

434

435

newtp->rx_opt.saw_tstamp = 0;

435

newtp->rx_opt.saw_tstamp = 0;

436

437

newtp->rx_opt.dsack = 0;

437

newtp->rx_opt.dsack = 0;

438

newtp->rx_opt.num_sacks = 0;

438

newtp->rx_opt.num_sacks = 0;

439

440

newtp->urg_data = 0;

440

newtp->urg_data = 0;

441

442

if (sock_flag(newsk, SOCK_KEEPOPEN))

442

if (sock_flag(newsk, SOCK_KEEPOPEN))

443

inet_csk_reset_keepalive_timer(newsk,

443

inet_csk_reset_keepalive_timer(newsk,

444

keepalive_time_when(newtp));

444

keepalive_time_when(newtp));

445

446

newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;

446

newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;

447

if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {

447

if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {

448

if (sysctl_tcp_fack)

448

if (sysctl_tcp_fack)

449

tcp_enable_fack(newtp);

449

tcp_enable_fack(newtp);

450

}

450

}

451

newtp->window_clamp = req->window_clamp;

451

newtp->window_clamp = req->window_clamp;

452

newtp->rcv_ssthresh = req->rcv_wnd;

452

newtp->rcv_ssthresh = req->rcv_wnd;

453

newtp->rcv_wnd = req->rcv_wnd;

453

newtp->rcv_wnd = req->rcv_wnd;

454

newtp->rx_opt.wscale_ok = ireq->wscale_ok;

454

newtp->rx_opt.wscale_ok = ireq->wscale_ok;

455

if (newtp->rx_opt.wscale_ok) {

455

if (newtp->rx_opt.wscale_ok) {

456

newtp->rx_opt.snd_wscale = ireq->snd_wscale;

456

newtp->rx_opt.snd_wscale = ireq->snd_wscale;

457

newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;

457

newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;

458

} else {

458

} else {

459

newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;

459

newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;

460

newtp->window_clamp = min(newtp->window_clamp, 65535U);

460

newtp->window_clamp = min(newtp->window_clamp, 65535U);

461

}

461

}

462

newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<

462

newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<

463

newtp->rx_opt.snd_wscale);

463

newtp->rx_opt.snd_wscale);

464

newtp->max_window = newtp->snd_wnd;

464

newtp->max_window = newtp->snd_wnd;

465

466

if (newtp->rx_opt.tstamp_ok) {

466

if (newtp->rx_opt.tstamp_ok) {

467

newtp->rx_opt.ts_recent = req->ts_recent;

467

newtp->rx_opt.ts_recent = req->ts_recent;

468

newtp->rx_opt.ts_recent_stamp = get_seconds();

468

newtp->rx_opt.ts_recent_stamp = get_seconds();

469

newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;

469

newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;

470

} else {

470

} else {

471

newtp->rx_opt.ts_recent_stamp = 0;

471

newtp->rx_opt.ts_recent_stamp = 0;

472

newtp->tcp_header_len = sizeof(struct tcphdr);

472

newtp->tcp_header_len = sizeof(struct tcphdr);

473

}

473

}

474

#ifdef CONFIG_TCP_MD5SIG

474

#ifdef CONFIG_TCP_MD5SIG

475

newtp->md5sig_info = NULL; /*XXX*/

475

newtp->md5sig_info = NULL; /*XXX*/

476

if (newtp->af_specific->md5_lookup(sk, newsk))

476

if (newtp->af_specific->md5_lookup(sk, newsk))

477

newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;

477

newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;

478

#endif

478

#endif

479

if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)

479

if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)

480

newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;

480

newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;

481

newtp->rx_opt.mss_clamp = req->mss;

481

newtp->rx_opt.mss_clamp = req->mss;

482

TCP_ECN_openreq_child(newtp, req);

482

TCP_ECN_openreq_child(newtp, req);

483

484

TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);

484

TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);

485

}

485

}

486

return newsk;

486

return newsk;

487

}

487

}

488

489

/*

489

/*

490

* Process an incoming packet for SYN_RECV sockets represented

490

* Process an incoming packet for SYN_RECV sockets represented

491

* as a request_sock.

491

* as a request_sock.

492

*/

492

*/

493

494

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,

494

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,

495

struct request_sock *req,

495

struct request_sock *req,

496

struct request_sock **prev)

496

struct request_sock **prev)

497

{

497

{

498

const struct tcphdr *th = tcp_hdr(skb);

498

const struct tcphdr *th = tcp_hdr(skb);

499

__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);

499

__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);

500

int paws_reject = 0;

500

int paws_reject = 0;

501

struct tcp_options_received tmp_opt;

501

struct tcp_options_received tmp_opt;

502

struct sock *child;

502

struct sock *child;

503

504

tmp_opt.saw_tstamp = 0;

504

tmp_opt.saw_tstamp = 0;

505

if (th->doff > (sizeof(struct tcphdr)>>2)) {

505

if (th->doff > (sizeof(struct tcphdr)>>2)) {

506

tcp_parse_options(skb, &tmp_opt, 0);

506

tcp_parse_options(skb, &tmp_opt, 0);

507

508

if (tmp_opt.saw_tstamp) {

508

if (tmp_opt.saw_tstamp) {

509

tmp_opt.ts_recent = req->ts_recent;

509

tmp_opt.ts_recent = req->ts_recent;

510

/* We do not store true stamp, but it is not required,

510

/* We do not store true stamp, but it is not required,

511

* it can be estimated (approximately)

511

* it can be estimated (approximately)

512

* from another data.

512

* from another data.

513

*/

513

*/

514

tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);

514

tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);

515

paws_reject = tcp_paws_reject(&tmp_opt, th->rst);

515

paws_reject = tcp_paws_reject(&tmp_opt, th->rst);

516

}

516

}

517

}

517

}

518

519

/* Check for pure retransmitted SYN. */

519

/* Check for pure retransmitted SYN. */

520

if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&

520

if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&

521

flg == TCP_FLAG_SYN &&

521

flg == TCP_FLAG_SYN &&

522

!paws_reject) {

522

!paws_reject) {

523

/*

523

/*

524

* RFC793 draws (Incorrectly! It was fixed in RFC1122)

524

* RFC793 draws (Incorrectly! It was fixed in RFC1122)

525

* this case on figure 6 and figure 8, but formal

525

* this case on figure 6 and figure 8, but formal

526

* protocol description says NOTHING.

526

* protocol description says NOTHING.

527

* To be more exact, it says that we should send ACK,

527

* To be more exact, it says that we should send ACK,

528

* because this segment (at least, if it has no data)

528

* because this segment (at least, if it has no data)

529

* is out of window.

529

* is out of window.

530

*

530

*

531

* CONCLUSION: RFC793 (even with RFC1122) DOES NOT

531

* CONCLUSION: RFC793 (even with RFC1122) DOES NOT

532

* describe SYN-RECV state. All the description

532

* describe SYN-RECV state. All the description

533

* is wrong, we cannot believe to it and should

533

* is wrong, we cannot believe to it and should

534

* rely only on common sense and implementation

534

* rely only on common sense and implementation

535

* experience.

535

* experience.

536

*

536

*

537

* Enforce "SYN-ACK" according to figure 8, figure 6

537

* Enforce "SYN-ACK" according to figure 8, figure 6

538

* of RFC793, fixed by RFC1122.

538

* of RFC793, fixed by RFC1122.

539

*/

539

*/

540

req->rsk_ops->rtx_syn_ack(sk, req);

540

req->rsk_ops->rtx_syn_ack(sk, req);

541

return NULL;

541

return NULL;

542

}

542

}

543

544

/* Further reproduces section "SEGMENT ARRIVES"

544

/* Further reproduces section "SEGMENT ARRIVES"

545

for state SYN-RECEIVED of RFC793.

545

for state SYN-RECEIVED of RFC793.

546

It is broken, however, it does not work only

546

It is broken, however, it does not work only

547

when SYNs are crossed.

547

when SYNs are crossed.

548

549

You would think that SYN crossing is impossible here, since

549

You would think that SYN crossing is impossible here, since

550

we should have a SYN_SENT socket (from connect()) on our end,

550

we should have a SYN_SENT socket (from connect()) on our end,

551

but this is not true if the crossed SYNs were sent to both

551

but this is not true if the crossed SYNs were sent to both

552

ends by a malicious third party. We must defend against this,

552

ends by a malicious third party. We must defend against this,

553

and to do that we first verify the ACK (as per RFC793, page

553

and to do that we first verify the ACK (as per RFC793, page

554

36) and reset if it is invalid. Is this a true full defense?

554

36) and reset if it is invalid. Is this a true full defense?

555

To convince ourselves, let us consider a way in which the ACK

555

To convince ourselves, let us consider a way in which the ACK

556

test can still pass in this 'malicious crossed SYNs' case.

556

test can still pass in this 'malicious crossed SYNs' case.

557

Malicious sender sends identical SYNs (and thus identical sequence

557

Malicious sender sends identical SYNs (and thus identical sequence

558

numbers) to both A and B:

558

numbers) to both A and B:

559

560

A: gets SYN, seq=7

560

A: gets SYN, seq=7

561

B: gets SYN, seq=7

561

B: gets SYN, seq=7

562

563

By our good fortune, both A and B select the same initial

563

By our good fortune, both A and B select the same initial

564

send sequence number of seven :-)

564

send sequence number of seven :-)

565

566

A: sends SYN|ACK, seq=7, ack_seq=8

566

A: sends SYN|ACK, seq=7, ack_seq=8

567

B: sends SYN|ACK, seq=7, ack_seq=8

567

B: sends SYN|ACK, seq=7, ack_seq=8

568

569

So we are now A eating this SYN|ACK, ACK test passes. So

569

So we are now A eating this SYN|ACK, ACK test passes. So

570

does sequence test, SYN is truncated, and thus we consider

570

does sequence test, SYN is truncated, and thus we consider

571

it a bare ACK.

571

it a bare ACK.

572

573

If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this

573

If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this

574

bare ACK. Otherwise, we create an established connection. Both

574

bare ACK. Otherwise, we create an established connection. Both

575

ends (listening sockets) accept the new incoming connection and try

575

ends (listening sockets) accept the new incoming connection and try

576

to talk to each other. 8-)

576

to talk to each other. 8-)

577

578

Note: This case is both harmless, and rare. Possibility is about the

578

Note: This case is both harmless, and rare. Possibility is about the

579

same as us discovering intelligent life on another plant tomorrow.

579

same as us discovering intelligent life on another plant tomorrow.

580

581

But generally, we should (RFC lies!) to accept ACK

581

But generally, we should (RFC lies!) to accept ACK

582

from SYNACK both here and in tcp_rcv_state_process().

582

from SYNACK both here and in tcp_rcv_state_process().

583

tcp_rcv_state_process() does not, hence, we do not too.

583

tcp_rcv_state_process() does not, hence, we do not too.

584

585

Note that the case is absolutely generic:

585

Note that the case is absolutely generic:

586

we cannot optimize anything here without

586

we cannot optimize anything here without

587

violating protocol. All the checks must be made

587

violating protocol. All the checks must be made

588

before attempt to create socket.

588

before attempt to create socket.

589

*/

589

*/

590

591

/* RFC793 page 36: "If the connection is in any non-synchronized state ...

591

/* RFC793 page 36: "If the connection is in any non-synchronized state ...

592

* and the incoming segment acknowledges something not yet

592

* and the incoming segment acknowledges something not yet

593

* sent (the segment carries an unacceptable ACK) ...

593

* sent (the segment carries an unacceptable ACK) ...

594

* a reset is sent."

594

* a reset is sent."

595

*

595

*

596

* Invalid ACK: reset will be sent by listening socket

596

* Invalid ACK: reset will be sent by listening socket

597

*/

597

*/

598

if ((flg & TCP_FLAG_ACK) &&

598

if ((flg & TCP_FLAG_ACK) &&

599

(TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))

599

(TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))

600

return sk;

600

return sk;

601

602

/* Also, it would be not so bad idea to check rcv_tsecr, which

602

/* Also, it would be not so bad idea to check rcv_tsecr, which

603

* is essentially ACK extension and too early or too late values

603

* is essentially ACK extension and too early or too late values

604

* should cause reset in unsynchronized states.

604

* should cause reset in unsynchronized states.

605

*/

605

*/

606

607

/* RFC793: "first check sequence number". */

607

/* RFC793: "first check sequence number". */

608

609

if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,

609

if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,

610

tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {

610

tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {

611

/* Out of window: send ACK and drop. */

611

/* Out of window: send ACK and drop. */

612

if (!(flg & TCP_FLAG_RST))

612

if (!(flg & TCP_FLAG_RST))

613

req->rsk_ops->send_ack(sk, skb, req);

613

req->rsk_ops->send_ack(sk, skb, req);

614

if (paws_reject)

614

if (paws_reject)

615

NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);

615

NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);

616

return NULL;

616

return NULL;

617

}

617

}

618

619

/* In sequence, PAWS is OK. */

619

/* In sequence, PAWS is OK. */

620

621

if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))

621

if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))

622

req->ts_recent = tmp_opt.rcv_tsval;

622

req->ts_recent = tmp_opt.rcv_tsval;

623

624

if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {

624

if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {

625

/* Truncate SYN, it is out of window starting

625

/* Truncate SYN, it is out of window starting

626

at tcp_rsk(req)->rcv_isn + 1. */

626

at tcp_rsk(req)->rcv_isn + 1. */

627

flg &= ~TCP_FLAG_SYN;

627

flg &= ~TCP_FLAG_SYN;

628

}

628

}

629

630

/* RFC793: "second check the RST bit" and

630

/* RFC793: "second check the RST bit" and

631

* "fourth, check the SYN bit"

631

* "fourth, check the SYN bit"

632

*/

632

*/

633

if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {

633

if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {

634

TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

634

TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

635

goto embryonic_reset;

635

goto embryonic_reset;

636

}

636

}

637

638

/* ACK sequence verified above, just make sure ACK is

638

/* ACK sequence verified above, just make sure ACK is

639

* set. If ACK not set, just silently drop the packet.

639

* set. If ACK not set, just silently drop the packet.

640

*/

640

*/

641

if (!(flg & TCP_FLAG_ACK))

641

if (!(flg & TCP_FLAG_ACK))

642

return NULL;

642

return NULL;

643

644

/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */

644

/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */

645

if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&

645

if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&

646

TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {

646

TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {

647

inet_csk(sk)->icsk_accept_queue.rskq_defer_accept--;

647

inet_rsk(req)->acked = 1;

648

inet_rsk(req)->acked = 1;

648

return NULL;

649

return NULL;

649

}

650

}

650

651

/* OK, ACK is valid, create big socket and

652

/* OK, ACK is valid, create big socket and

652

* feed this segment to it. It will repeat all

653

* feed this segment to it. It will repeat all

653

* the tests. THIS SEGMENT MUST MOVE SOCKET TO

654

* the tests. THIS SEGMENT MUST MOVE SOCKET TO

654

* ESTABLISHED STATE. If it will be dropped after

655

* ESTABLISHED STATE. If it will be dropped after

655

* socket is created, wait for troubles.

656

* socket is created, wait for troubles.

656

*/

657

*/

657

child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);

658

child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);

658

if (child == NULL)

659

if (child == NULL)

659

goto listen_overflow;

660

goto listen_overflow;

660

661

inet_csk_reqsk_queue_unlink(sk, req, prev);

662

inet_csk_reqsk_queue_unlink(sk, req, prev);

662

inet_csk_reqsk_queue_removed(sk, req);

663

inet_csk_reqsk_queue_removed(sk, req);

663

664

inet_csk_reqsk_queue_add(sk, req, child);

665

inet_csk_reqsk_queue_add(sk, req, child);

665

return child;

666

return child;

666

667

listen_overflow:

668

listen_overflow:

668

if (!sysctl_tcp_abort_on_overflow) {

669

if (!sysctl_tcp_abort_on_overflow) {

669

inet_rsk(req)->acked = 1;

670

inet_rsk(req)->acked = 1;

670

return NULL;

671

return NULL;

671

}

672

}

672

673

embryonic_reset:

674

embryonic_reset:

674

NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);

675

NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);

675

if (!(flg & TCP_FLAG_RST))

676

if (!(flg & TCP_FLAG_RST))

676

req->rsk_ops->send_reset(sk, skb);

677

req->rsk_ops->send_reset(sk, skb);

677

678

inet_csk_reqsk_queue_drop(sk, req, prev);

679

inet_csk_reqsk_queue_drop(sk, req, prev);

679

return NULL;

680

return NULL;

680

}

681

}

681

682

/*

683

/*

683

* Queue segment on the new socket if the new socket is active,

684

* Queue segment on the new socket if the new socket is active,

684

* otherwise we just shortcircuit this and continue with

685

* otherwise we just shortcircuit this and continue with

685

* the new socket.

686

* the new socket.

686

*/

687

*/

687

688

int tcp_child_process(struct sock *parent, struct sock *child,

689

int tcp_child_process(struct sock *parent, struct sock *child,

689

struct sk_buff *skb)

690

struct sk_buff *skb)

690

{

691

{

691

int ret = 0;

692

int ret = 0;

692

int state = child->sk_state;

693

int state = child->sk_state;

693

694

if (!sock_owned_by_user(child)) {

695

if (!sock_owned_by_user(child)) {

695

ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),

696

ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),

696

skb->len);

697

skb->len);

697

/* Wakeup parent, send SIGIO */

698

/* Wakeup parent, send SIGIO */

698

if (state == TCP_SYN_RECV && child->sk_state != state)

699

if (state == TCP_SYN_RECV && child->sk_state != state)

699

parent->sk_data_ready(parent, 0);

700

parent->sk_data_ready(parent, 0);

700

} else {

701

} else {

701

/* Alas, it is possible again, because we do lookup

702

/* Alas, it is possible again, because we do lookup

702

* in main socket hash table and lock on listening

703

* in main socket hash table and lock on listening

703

* socket does not protect us more.

704

* socket does not protect us more.

704

*/

705

*/

705

sk_add_backlog(child, skb);

706

sk_add_backlog(child, skb);

706

}

707

}

707

708

bh_unlock_sock(child);

709

bh_unlock_sock(child);

709

sock_put(child);

710

sock_put(child);

710

return ret;

711

return ret;

711

}

712

}

712

713

EXPORT_SYMBOL(tcp_check_req);

714

EXPORT_SYMBOL(tcp_check_req);

714

EXPORT_SYMBOL(tcp_child_process);

715

EXPORT_SYMBOL(tcp_child_process);

715

EXPORT_SYMBOL(tcp_create_openreq_child);

716

EXPORT_SYMBOL(tcp_create_openreq_child);

716

EXPORT_SYMBOL(tcp_timewait_state_process);

717

EXPORT_SYMBOL(tcp_timewait_state_process);

717

718

GITLAB

tcp: fix tcp_defer_accept to consider the timeout

 /*
  * INET		An implementation of the TCP/IP protocol suite for the LINUX
  *		operating system.  INET is implemented using the  BSD Socket
  *		interface as the means of communication with the user level.
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
  * Authors:	Ross Biro
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
  *		Florian La Roche, <flla@stud.uni-sb.de>
  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  *		Matthew Dillon, <dillon@apollo.west.oic.com>
  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  *		Jorge Cwik, <jorge@laser.satlink.net>
  */
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/workqueue.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
 /*
  * SYNC_INIT is the boot-time value of sysctl_tcp_syncookies: 0 when
  * CONFIG_SYSCTL is defined, 1 otherwise.
  */
 #ifdef CONFIG_SYSCTL
 #define SYNC_INIT 0 /* let the user enable it */
 #else
 #define SYNC_INIT 1
 #endif
 int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
 EXPORT_SYMBOL(sysctl_tcp_syncookies);
 /*
  * Consulted by tcp_check_req() when creating the child socket fails:
  * when zero the request is only marked as acked and the segment is
  * dropped, otherwise processing continues on the embryonic reset path.
  */
 int sysctl_tcp_abort_on_overflow __read_mostly;
 /*
  * Death row shared by all TCP timewait buckets.  It is the object handed to
  * inet_twsk_schedule() and inet_twsk_deschedule() throughout this file, and
  * tcp_time_wait() compares its tw_count against sysctl_max_tw_buckets before
  * allocating a new bucket.
  */
 struct inet_timewait_death_row tcp_death_row = {
 	/* Upper bound on buckets checked in tcp_time_wait(). */
 	.sysctl_max_tw_buckets = NR_FILE * 2,
 	/* TIME-WAIT length divided evenly over the kill slots. */
 	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
 	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
 	.hashinfo	= &tcp_hashinfo,
 	/* Timer callback receives this death row as its argument. */
 	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
 					    (unsigned long)&tcp_death_row),
 	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
 					     inet_twdr_twkill_work),
 /* Short-time timewait calendar */
 	.twcal_hand	= -1,
 	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
 					    (unsigned long)&tcp_death_row),
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 /*
  * Decide whether the segment [seq, end_seq] is acceptable for the window
  * [s_win, e_win].  Returns 1 when acceptable and 0 otherwise.
  *
  * A segment is acceptable when any of the following holds:
  *  - it starts exactly at the left edge of the window;
  *  - it ends after the left edge and starts before the right edge;
  *  - it is empty and sits exactly on the right edge.
  */
 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
 	return seq == s_win ||
 	       (after(end_seq, s_win) && before(seq, e_win)) ||
 	       (seq == e_win && seq == end_seq);
 }
 /*
  * * Main purpose of TIME-WAIT state is to close connection gracefully,
  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
  *   (and, probably, tail of data) and one or more our ACKs are lost.
  * * What is TIME-WAIT timeout? It is associated with maximal packet
  *   lifetime in the internet, which results in wrong conclusion, that
  *   it is set to catch "old duplicate segments" wandering out of their path.
  *   It is not quite correct. This timeout is calculated so that it exceeds
  *   maximal retransmission timeout enough to allow to lose one (or more)
  *   segments sent by peer and our ACKs. This time may be calculated from RTO.
  * * When TIME-WAIT socket receives RST, it means that another end
  *   finally closed and we are allowed to kill TIME-WAIT too.
  * * Second purpose of TIME-WAIT is catching old duplicate segments.
  *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
  *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
  * * If we invented some more clever way to catch duplicates
  *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
  *
  * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
  * When you compare it to RFCs, please, read section SEGMENT ARRIVES
  * from the very beginning.
  *
  * NOTE. With recycling (and later with fin-wait-2) TW bucket
  * is _not_ stateless. It means, that strictly speaking we must
  * spinlock it. I do not want! Well, probability of misbehaviour
  * is ridiculously low and, seems, we could use some mb() tricks
  * to avoid misread sequence numbers, states etc.  --ANK
  */
 /*
  * Process a segment received for a timewait bucket.
  *
  * @tw:  timewait bucket being processed
  * @skb: received segment
  * @th:  TCP header (presumably the header of @skb - confirm with callers)
  *
  * Returns TCP_TW_SUCCESS, TCP_TW_ACK, TCP_TW_RST or TCP_TW_SYN.
  *
  * Reference handling as implemented below: every path that returns
  * TCP_TW_SUCCESS or TCP_TW_RST calls inet_twsk_put() on @tw first; the
  * paths that return TCP_TW_ACK or TCP_TW_SYN do not.
  */
 enum tcp_tw_status
 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 			   const struct tcphdr *th)
 {
 	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 	struct tcp_options_received tmp_opt;
 	int paws_reject = 0;
 	tmp_opt.saw_tstamp = 0;
 	/* Options are parsed only when the header is longer than the bare
 	 * TCP header and a timestamp has been recorded for this bucket.
 	 */
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
 		tcp_parse_options(skb, &tmp_opt, 0);
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
 			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
 		}
 	}
 	if (tw->tw_substate == TCP_FIN_WAIT2) {
 		/* Just repeat all the checks of tcp_rcv_state_process() */
 		/* Out of window, send ACK */
 		if (paws_reject ||
 		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
 				   tcptw->tw_rcv_nxt,
 				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
 			return TCP_TW_ACK;
 		/* "kill" lives in the TIME-WAIT branch further down. */
 		if (th->rst)
 			goto kill;
 		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
 			goto kill_with_rst;
 		/* Dup ACK? */
 		if (!th->ack ||
 		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
 		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
 			inet_twsk_put(tw);
 			return TCP_TW_SUCCESS;
 		}
 		/* New data or FIN. If new data arrive after half-duplex close,
 		 * reset.
 		 */
 		if (!th->fin ||
 		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
 			/* Also entered by goto for a SYN at or past tw_rcv_nxt. */
 kill_with_rst:
 			inet_twsk_deschedule(tw, &tcp_death_row);
 			inet_twsk_put(tw);
 			return TCP_TW_RST;
 		}
 		/* FIN arrived, enter true time-wait state. */
 		tw->tw_substate	  = TCP_TIME_WAIT;
 		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (tmp_opt.saw_tstamp) {
 			tcptw->tw_ts_recent_stamp = get_seconds();
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 		}
 		/* I am shamed, but failed to make it more elegant.
 		 * Yes, it is direct reference to IP, which is impossible
 		 * to generalize to IPv6. Taking into account that IPv6
 		 * do not understand recycling in any case, it not
 		 * a big problem in practice. --ANK */
 		/* The bucket's own tw_timeout is used only when recycling
 		 * applies and the stamp was remembered; otherwise the full
 		 * TCP_TIMEWAIT_LEN is scheduled.
 		 */
 		if (tw->tw_family == AF_INET &&
 		    tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
 		    tcp_v4_tw_remember_stamp(tw))
 			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
 					   TCP_TIMEWAIT_LEN);
 		else
 			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
 					   TCP_TIMEWAIT_LEN);
 		return TCP_TW_ACK;
 	}
 	/*
 	 *	Now real TIME-WAIT state.
 	 *
 	 *	RFC 1122:
 	 *	"When a connection is [...] on TIME-WAIT state [...]
 	 *	[a TCP] MAY accept a new SYN from the remote TCP to
 	 *	reopen the connection directly, if it:
 	 *
 	 *	(1)  assigns its initial sequence number for the new
 	 *	connection to be larger than the largest sequence
 	 *	number it used on the previous connection incarnation,
 	 *	and
 	 *
 	 *	(2)  returns to TIME-WAIT state if the SYN turns out
 	 *	to be an old duplicate".
 	 */
 	if (!paws_reject &&
 	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
 	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
 		/* In window segment, it may be only reset or bare ack. */
 		if (th->rst) {
 			/* This is TIME_WAIT assassination, in two flavors.
 			 * Oh well... nobody has a sufficient solution to this
 			 * protocol bug yet.
 			 */
 			if (sysctl_tcp_rfc1337 == 0) {
 				/* Also entered by goto from the FIN_WAIT2
 				 * branch when an in-window RST arrives.
 				 */
 kill:
 				inet_twsk_deschedule(tw, &tcp_death_row);
 				inet_twsk_put(tw);
 				return TCP_TW_SUCCESS;
 			}
 		}
 		/* Bare ACK, or RST with sysctl_tcp_rfc1337 set: restart the
 		 * timer and refresh the stored timestamp.
 		 */
 		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
 				   TCP_TIMEWAIT_LEN);
 		if (tmp_opt.saw_tstamp) {
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 			tcptw->tw_ts_recent_stamp = get_seconds();
 		}
 		inet_twsk_put(tw);
 		return TCP_TW_SUCCESS;
 	}
 	/* Out of window segment.
 	   All the segments are ACKed immediately.
 	   The only exception is new SYN. We accept it, if it is
 	   not old duplicate and we are not in danger to be killed
 	   by delayed old duplicates. RFC check is that it has
 	   newer sequence number works at rates <40Mbit/sec.
 	   However, if paws works, it is reliable AND even more,
 	   we even may relax silly seq space cutoff.
 	   RED-PEN: we violate main RFC requirement, if this SYN will appear
 	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
 	   we must return socket to time-wait state. It is not good,
 	   but not fatal yet.
 	 */
 	if (th->syn && !th->rst && !th->ack && !paws_reject &&
 	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
 	     (tmp_opt.saw_tstamp &&
 	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
 		/* Candidate ISN lies beyond the old tw_snd_nxt; zero is
 		 * skipped.  It is stored in the skb control block's "when"
 		 * field - presumably read back by whoever handles
 		 * TCP_TW_SYN (confirm against the caller).
 		 */
 		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
 		if (isn == 0)
 			isn++;
 		TCP_SKB_CB(skb)->when = isn;
 		return TCP_TW_SYN;
 	}
 	if (paws_reject)
 		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
 	if (!th->rst) {
 		/* In this case we must reset the TIMEWAIT timer.
 		 *
 		 * If it is ACKless SYN it may be both old duplicate
 		 * and new good SYN with random sequence number <rcv_nxt.
 		 * Do not reschedule in the last case.
 		 */
 		if (paws_reject || th->ack)
 			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
 					   TCP_TIMEWAIT_LEN);
 		/* Send ACK. Note, we do not put the bucket,
 		 * it will be released by caller.
 		 */
 		return TCP_TW_ACK;
 	}
 	/* Out-of-window RST: nothing to send, just drop our reference. */
 	inet_twsk_put(tw);
 	return TCP_TW_SUCCESS;
 }
 /*
  * Move a socket to time-wait or dead fin-wait-2 state.
  */
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
 	struct inet_timewait_sock *tw = NULL;
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int recycle_ok = 0;
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
 		tw = inet_twsk_alloc(sk, state);
 	if (tw != NULL) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
 		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
 			struct inet6_timewait_sock *tw6;
 			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
 			tw6 = inet6_twsk((struct sock *)tw);
 			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
 			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
 			tw->tw_ipv6only = np->ipv6only;
 		}
 #endif
 #ifdef CONFIG_TCP_MD5SIG
 		/*
 		 * The timewait bucket does not have the key DB from the
 		 * sock structure. We just make a quick copy of the
 		 * md5 key being used (if indeed we are using one)
 		 * so the timewait ack generating code has the key.
 		 */
 		do {
 			struct tcp_md5sig_key *key;
 			memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
 			tcptw->tw_md5_keylen = 0;
 			key = tp->af_specific->md5_lookup(sk, sk);
 			if (key != NULL) {
 				memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
 				tcptw->tw_md5_keylen = key->keylen;
 				if (tcp_alloc_md5sig_pool(sk) == NULL)
 					BUG();
 			}
 		} while (0);
 #endif
 		/* Linkage updates. */
 		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
 		/* Get the TIME_WAIT timeout firing. */
 		if (timeo < rto)
 			timeo = rto;
 		if (recycle_ok) {
 			tw->tw_timeout = rto;
 		} else {
 			tw->tw_timeout = TCP_TIMEWAIT_LEN;
 			if (state == TCP_TIME_WAIT)
 				timeo = TCP_TIMEWAIT_LEN;
 		}
 		inet_twsk_schedule(tw, &tcp_death_row, timeo,
 				   TCP_TIMEWAIT_LEN);
 		inet_twsk_put(tw);
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
 		 * socket up.  We've got bigger problems than
 		 * non-graceful socket closings.
 		 */
 		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
 	}
 	tcp_update_metrics(sk);
 	tcp_done(sk);
 }
 /*
  * Destructor for a TCP timewait socket.  With CONFIG_TCP_MD5SIG it frees
  * the MD5 signature pool when the bucket recorded a non-zero key length;
  * without that option it does nothing.
  */
 void tcp_twsk_destructor(struct sock *sk)
 {
 #ifdef CONFIG_TCP_MD5SIG
 	if (tcp_twsk(sk)->tw_md5_keylen != 0)
 		tcp_free_md5sig_pool();
 #endif
 }
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
 /* Set the child's ECN flags from the ecn_ok bit of the request:
  * TCP_ECN_OK when it is set, 0 otherwise.
  */
 static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
 					 struct request_sock *req)
 {
 	if (inet_rsk(req)->ecn_ok)
 		tp->ecn_flags = TCP_ECN_OK;
 	else
 		tp->ecn_flags = 0;
 }
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
  * Actually, we could save lots of memory writes here. tp of listening
  * socket contains all necessary default parameters.
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
 	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
 	if (newsk != NULL) {
 		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct tcp_request_sock *treq = tcp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct tcp_sock *newtp;
 		/* Now setup tcp_sock */
 		newtp = tcp_sk(newsk);
 		newtp->pred_flags = 0;
 		newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
 		newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
 		newtp->snd_up = treq->snt_isn + 1;
 		tcp_prequeue_init(newtp);
 		tcp_init_wl(newtp, treq->rcv_isn);
 		newtp->srtt = 0;
 		newtp->mdev = TCP_TIMEOUT_INIT;
 		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
 		newtp->packets_out = 0;
 		newtp->retrans_out = 0;
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control
 		 * algorithms that we must have the following bandaid to talk
 		 * efficiently to them.  -DaveM
 		 */
 		newtp->snd_cwnd = 2;
 		newtp->snd_cwnd_cnt = 0;
 		newtp->bytes_acked = 0;
 		newtp->frto_counter = 0;
 		newtp->frto_highmark = 0;
 		newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
 		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->write_seq = treq->snt_isn + 1;
 		newtp->pushed_seq = newtp->write_seq;
 		newtp->rx_opt.saw_tstamp = 0;
 		newtp->rx_opt.dsack = 0;
 		newtp->rx_opt.num_sacks = 0;
 		newtp->urg_data = 0;
 		if (sock_flag(newsk, SOCK_KEEPOPEN))
 			inet_csk_reset_keepalive_timer(newsk,
 						       keepalive_time_when(newtp));
 		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
 		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
 			if (sysctl_tcp_fack)
 				tcp_enable_fack(newtp);
 		}
 		newtp->window_clamp = req->window_clamp;
 		newtp->rcv_ssthresh = req->rcv_wnd;
 		newtp->rcv_wnd = req->rcv_wnd;
 		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
 		if (newtp->rx_opt.wscale_ok) {
 			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
 			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
 		} else {
 			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
 			newtp->window_clamp = min(newtp->window_clamp, 65535U);
 		}
 		newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
 				  newtp->rx_opt.snd_wscale);
 		newtp->max_window = newtp->snd_wnd;
 		if (newtp->rx_opt.tstamp_ok) {
 			newtp->rx_opt.ts_recent = req->ts_recent;
 			newtp->rx_opt.ts_recent_stamp = get_seconds();
 			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 		} else {
 			newtp->rx_opt.ts_recent_stamp = 0;
 			newtp->tcp_header_len = sizeof(struct tcphdr);
 		}
 #ifdef CONFIG_TCP_MD5SIG
 		newtp->md5sig_info = NULL;	/*XXX*/
 		if (newtp->af_specific->md5_lookup(sk, newsk))
 			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
 		if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
 			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
 	return newsk;
 }
 /*
  *	Process an incoming packet for SYN_RECV sockets represented
  *	as a request_sock.
  */
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req,
 			   struct request_sock **prev)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
 	int paws_reject = 0;
 	struct tcp_options_received tmp_opt;
 	struct sock *child;
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
 		tcp_parse_options(skb, &tmp_opt, 0);
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
 			/* We do not store true stamp, but it is not required,
 			 * it can be estimated (approximately)
 			 * from another data.
 			 */
 			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
 		}
 	}
 	/* Check for pure retransmitted SYN. */
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
 	    flg == TCP_FLAG_SYN &&
 	    !paws_reject) {
 		/*
 		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
 		 * this case on figure 6 and figure 8, but formal
 		 * protocol description says NOTHING.
 		 * To be more exact, it says that we should send ACK,
 		 * because this segment (at least, if it has no data)
 		 * is out of window.
 		 *
 		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
 		 *  describe SYN-RECV state. All the description
 		 *  is wrong, we cannot believe to it and should
 		 *  rely only on common sense and implementation
 		 *  experience.
 		 *
 		 * Enforce "SYN-ACK" according to figure 8, figure 6
 		 * of RFC793, fixed by RFC1122.
 		 */
 		req->rsk_ops->rtx_syn_ack(sk, req);
 		return NULL;
 	}
 	/* Further reproduces section "SEGMENT ARRIVES"
 	   for state SYN-RECEIVED of RFC793.
 	   It is broken, however, it does not work only
 	   when SYNs are crossed.
 	   You would think that SYN crossing is impossible here, since
 	   we should have a SYN_SENT socket (from connect()) on our end,
 	   but this is not true if the crossed SYNs were sent to both
 	   ends by a malicious third party.  We must defend against this,
 	   and to do that we first verify the ACK (as per RFC793, page
 	   36) and reset if it is invalid.  Is this a true full defense?
 	   To convince ourselves, let us consider a way in which the ACK
 	   test can still pass in this 'malicious crossed SYNs' case.
 	   Malicious sender sends identical SYNs (and thus identical sequence
 	   numbers) to both A and B:
 		A: gets SYN, seq=7
 		B: gets SYN, seq=7
 	   By our good fortune, both A and B select the same initial
 	   send sequence number of seven :-)
 		A: sends SYN|ACK, seq=7, ack_seq=8
 		B: sends SYN|ACK, seq=7, ack_seq=8
 	   So we are now A eating this SYN|ACK, ACK test passes.  So
 	   does sequence test, SYN is truncated, and thus we consider
 	   it a bare ACK.
 	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
 	   bare ACK.  Otherwise, we create an established connection.  Both
 	   ends (listening sockets) accept the new incoming connection and try
 	   to talk to each other. 8-)
 	   Note: This case is both harmless, and rare.  Possibility is about the
 	   same as us discovering intelligent life on another plant tomorrow.
 	   But generally, we should (RFC lies!) to accept ACK
 	   from SYNACK both here and in tcp_rcv_state_process().
 	   tcp_rcv_state_process() does not, hence, we do not too.
 	   Note that the case is absolutely generic:
 	   we cannot optimize anything here without
 	   violating protocol. All the checks must be made
 	   before attempt to create socket.
 	 */
 	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
 	 *                  and the incoming segment acknowledges something not yet
 	 *                  sent (the segment carries an unacceptable ACK) ...
 	 *                  a reset is sent."
 	 *
 	 * Invalid ACK: reset will be sent by listening socket
 	 */
 	if ((flg & TCP_FLAG_ACK) &&
 	    (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
 		return sk;
 	/* Also, it would be not so bad idea to check rcv_tsecr, which
 	 * is essentially ACK extension and too early or too late values
 	 * should cause reset in unsynchronized states.
 	 */
 	/* RFC793: "first check sequence number". */
 	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
 					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
 		/* Out of window: send ACK and drop. */
 		if (!(flg & TCP_FLAG_RST))
 			req->rsk_ops->send_ack(sk, skb, req);
 		if (paws_reject)
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
 		return NULL;
 	}
 	/* In sequence, PAWS is OK. */
 	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
 		req->ts_recent = tmp_opt.rcv_tsval;
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
 		/* Truncate SYN, it is out of window starting
 		   at tcp_rsk(req)->rcv_isn + 1. */
 		flg &= ~TCP_FLAG_SYN;
 	}
 	/* RFC793: "second check the RST bit" and
 	 *	   "fourth, check the SYN bit"
 	 */
 	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 		goto embryonic_reset;
 	}
 	/* ACK sequence verified above, just make sure ACK is
 	 * set.  If ACK not set, just silently drop the packet.
 	 */
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
 	if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+		inet_csk(sk)->icsk_accept_queue.rskq_defer_accept--;
 		inet_rsk(req)->acked = 1;
 		return NULL;
 	}
 	/* OK, ACK is valid, create big socket and
 	 * feed this segment to it. It will repeat all
 	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
 	 * ESTABLISHED STATE. If it will be dropped after
 	 * socket is created, wait for troubles.
 	 */
 	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
 	if (child == NULL)
 		goto listen_overflow;
 	inet_csk_reqsk_queue_unlink(sk, req, prev);
 	inet_csk_reqsk_queue_removed(sk, req);
 	inet_csk_reqsk_queue_add(sk, req, child);
 	return child;
 listen_overflow:
 	if (!sysctl_tcp_abort_on_overflow) {
 		inet_rsk(req)->acked = 1;
 		return NULL;
 	}
 embryonic_reset:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
 	if (!(flg & TCP_FLAG_RST))
 		req->rsk_ops->send_reset(sk, skb);
 	inet_csk_reqsk_queue_drop(sk, req, prev);
 	return NULL;
 }
 /*
  * Hand the segment to the freshly created child socket.
  *
  * When no user context owns the child, the segment is run through
  * tcp_rcv_state_process() right away and, if that moved the child out
  * of SYN_RECV, the parent's sk_data_ready callback is invoked.  When
  * the child is owned by a user, the segment is put on the child's
  * backlog instead.
  *
  * In both cases the child is unlocked (bh_unlock_sock) and one
  * reference on it is dropped (sock_put) before returning.
  *
  * Returns the result of tcp_rcv_state_process(), or 0 when the
  * segment was backlogged.
  */
 int tcp_child_process(struct sock *parent, struct sock *child,
 		      struct sk_buff *skb)
 {
 	const int old_state = child->sk_state;
 	int rc = 0;
 	if (sock_owned_by_user(child)) {
 		/* Alas, this can happen again: the lookup is done in the
 		 * main socket hash table, so the lock on the listening
 		 * socket no longer protects us.
 		 */
 		sk_add_backlog(child, skb);
 		goto release;
 	}
 	rc = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
 	/* Wakeup parent, send SIGIO */
 	if (old_state == TCP_SYN_RECV && child->sk_state != old_state)
 		parent->sk_data_ready(parent, 0);
 release:
 	bh_unlock_sock(child);
 	sock_put(child);
 	return rc;
 }
 /* Symbols exported for use outside this file (e.g. by loadable modules). */
 EXPORT_SYMBOL(tcp_check_req);
 EXPORT_SYMBOL(tcp_child_process);
 EXPORT_SYMBOL(tcp_create_openreq_child);
 EXPORT_SYMBOL(tcp_timewait_state_process);