insnemu.S 15 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592


/*
 *  Copyright (C) 2003-2013 Altera Corporation
 *  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


#include <linux/linkage.h>
#include <asm/entry.h>

.set noat
.set nobreak

/*
* Explicitly allow the use of r1 (at, the assembler temporary register)
* within this code. This register is normally reserved for the use of
* the assembler.
*/

/*
 * instruction_trap - emulate an unimplemented multiply/divide instruction.
 *
 * On entry sp points at a register frame that is read through the PT_*
 * offsets (presumably the pt_regs frame written by the exception entry
 * code; confirm against the caller).  The routine reloads the interrupted
 * context from that frame, pops it, emulates the instruction located at
 * -4(ea) on a private 128-byte frame and finally returns to the
 * interrupted code with eret.
 *
 * r16-r23 are not reloaded from the PT_* frame.  They are spilled to the
 * private frame before they are used as scratch and reloaded afterwards.
 */
ENTRY(instruction_trap)
	ldw	r1, PT_R1(sp)		/* Restore registers */
	ldw	r2, PT_R2(sp)
	ldw	r3, PT_R3(sp)
	ldw	r4, PT_R4(sp)
	ldw	r5, PT_R5(sp)
	ldw	r6, PT_R6(sp)
	ldw	r7, PT_R7(sp)
	ldw	r8, PT_R8(sp)
	ldw	r9, PT_R9(sp)
	ldw	r10, PT_R10(sp)
	ldw	r11, PT_R11(sp)
	ldw	r12, PT_R12(sp)
	ldw	r13, PT_R13(sp)
	ldw	r14, PT_R14(sp)
	ldw	r15, PT_R15(sp)
	ldw	ra, PT_RA(sp)
	ldw	fp, PT_FP(sp)
	ldw	gp, PT_GP(sp)
	ldw	et, PT_ESTATUS(sp)
	wrctl	estatus, et
	ldw	ea, PT_EA(sp)
	ldw	et, PT_SP(sp)		/* backup sp in et */

	addi	sp, sp, PT_REGS_SIZE	/* pop the PT_* frame */

	/* INSTRUCTION EMULATION
	*  ---------------------
	*
	* Nios II processors generate exceptions for unimplemented instructions.
	* The routines below emulate these instructions.  Depending on the
	* processor core, the only instructions that might need to be emulated
	* are div, divu, mul, muli, mulxss, mulxsu, and mulxuu.
	*
	* The emulations match the instructions, except for the following
	* limitations:
	*
	* 1) The emulation routines do not emulate the use of the exception
	*    temporary register (et) as a source operand because the exception
	*    handler already has modified it.
	*
	* 2) The routines do not emulate the use of the stack pointer (sp) or
	*    the exception return address register (ea) as a destination because
	*    modifying these registers crashes the exception handler or the
	*    interrupted routine.
	*
	* 3) The frame slots of r25 (bt) and r30 (ba) do not hold those
	*    registers: slot 100 carries the saved estatus and slot 120 is
	*    never written.  Using bt or ba as an operand is therefore not
	*    emulated; in particular a bt destination would overwrite the
	*    saved estatus value.
	*
	* Detailed Design
	* ---------------
	*
	* The emulation routines expect the contents of integer registers r0-r31
	* to be on the stack at addresses sp, 4(sp), 8(sp), ... 124(sp), i.e.
	* register rN lives at 4*N(sp).  The routines retrieve source operands
	* from the stack and modify the destination register's value on the
	* stack prior to the end of the exception handler.  Then all registers
	* except the destination register are restored to their previous values.
	*
	* The instruction that causes the exception is found at address -4(ea).
	* The instruction's OP and OPX fields identify the operation to be
	* performed.
	*
	* One instruction, muli, is an I-type instruction that is identified by
	* an OP field of 0x24.
	*
	* muli   AAAAA,BBBBB,IIIIIIIIIIIIIIII,-0x24-
	*           27    22                6      0    <-- LSB of field
	*
	* The remaining emulated instructions are R-type and have an OP field
	* of 0x3a.  Their OPX fields identify them.
	*
	* R-type AAAAA,BBBBB,CCCCC,XXXXXX,NNNNN,-0x3a-
	*           27    22    17     11     6      0  <-- LSB of field
	*
	*
	* Opcode Encoding.  muli is identified by its OP value.  Then OPX & 0x02
	* is used to differentiate between the division opcodes and the
	* remaining multiplication opcodes.
	*
	* Instruction   OP      OPX    OPX & 0x02
	* -----------   ----    ----   ----------
	* muli          0x24
	* divu          0x3a    0x24         0
	* div           0x3a    0x25         0
	* mul           0x3a    0x27      != 0
	* mulxuu        0x3a    0x07      != 0
	* mulxsu        0x3a    0x17      != 0
	* mulxss        0x3a    0x1f      != 0
	*/


	/*
	* Save everything on the stack to make it easy for the emulation
	* routines to retrieve the source register operands.
	*/

	addi sp, sp, -128
	stw zero, 0(sp)	/* Save zero on stack to avoid special case for r0. */
	stw r1, 4(sp)
	stw r2,  8(sp)
	stw r3, 12(sp)
	stw r4, 16(sp)
	stw r5, 20(sp)
	stw r6, 24(sp)
	stw r7, 28(sp)
	stw r8, 32(sp)
	stw r9, 36(sp)
	stw r10, 40(sp)
	stw r11, 44(sp)
	stw r12, 48(sp)
	stw r13, 52(sp)
	stw r14, 56(sp)
	stw r15, 60(sp)
	stw r16, 64(sp)
	stw r17, 68(sp)
	stw r18, 72(sp)
	stw r19, 76(sp)
	stw r20, 80(sp)
	stw r21, 84(sp)
	stw r22, 88(sp)
	stw r23, 92(sp)
		/* Don't bother to save et.  It's already been changed. */
	rdctl r5, estatus	/* r5 was stored above, so it is free as scratch */
	stw r5,  100(sp)	/* estatus is kept in the r25 (bt) slot */

	stw gp, 104(sp)
	stw et, 108(sp)	/* et contains previous sp value. */
	stw fp, 112(sp)
	stw ea, 116(sp)
			/* Slot 120 (r30) is intentionally left unused. */
	stw ra, 124(sp)	/* ra is r31, so its slot is 4*31 = 124 */


	/*
	* Split the instruction into its fields.  We need 4*A, 4*B, and 4*C as
	* offsets to the stack pointer for access to the stored register values.
	*/
	ldw r2,-4(ea)	/* r2 = AAAAA,BBBBB,IIIIIIIIIIIIIIII,PPPPPP */
	roli r3, r2, 7	/* r3 = BBB,IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BB */
	roli r4, r3, 3	/* r4 = IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB */
	roli r5, r4, 2	/* r5 = IIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB,II */
	srai r4, r4, 16	/* r4 = (sign-extended) IMM16 */
	roli r6, r5, 5	/* r6 = XXXX,NNNNN,PPPPPP,AAAAA,BBBBB,CCCCC,XX */
	andi r2, r2, 0x3f	/* r2 = 00000000000000000000000000,PPPPPP */
	andi r3, r3, 0x7c	/* r3 = 0000000000000000000000000,AAAAA,00 */
	andi r5, r5, 0x7c	/* r5 = 0000000000000000000000000,BBBBB,00 */
	andi r6, r6, 0x7c	/* r6 = 0000000000000000000000000,CCCCC,00 */

	/* Now
	* r2 = OP
	* r3 = 4*A
	* r4 = IMM16 (sign extended)
	* r5 = 4*B
	* r6 = 4*C
	*/

	/*
	* Get the operands.
	*
	* It is necessary to check for muli because it uses an I-type
	* instruction format, while the other instructions have an R-type
	* format.
	*
	*  Prepare for either multiplication or division loop.
	*  They both loop 32 times.
	*/
	movi r14, 32

	add  r3, r3, sp		/* r3 = address of A-operand. */
	ldw  r3, 0(r3)		/* r3 = A-operand. */
	movi r7, 0x24		/* muli opcode (I-type instruction format) */
	beq r2, r7, mul_immed /* muli doesn't use the B register as a source */

	add  r5, r5, sp		/* r5 = address of B-operand. */
	ldw  r5, 0(r5)		/* r5 = B-operand. */
				/* r4 = SSSSSSSSSSSSSSSS,-----IMM16------ */
				/* IMM16 not needed, align OPX portion */
				/* r4 = SSSSSSSSSSSSSSSS,CCCCC,-OPX--,00000 */
	srli r4, r4, 5		/* r4 = 00000,SSSSSSSSSSSSSSSS,CCCCC,-OPX-- */
	andi r4, r4, 0x3f	/* r4 = 00000000000000000000000000,-OPX-- */

	/* Now
	* r2 = OP
	* r3 = src1
	* r5 = src2
	* r4 = OPX (no longer can be muli)
	* r6 = 4*C
	*/


	/*
	*  Multiply or Divide?
	*/
	andi r7, r4, 0x02	/* For R-type multiply instructions,
				   OPX & 0x02 != 0 */
	bne r7, zero, multiply


	/* DIVISION
	*
	* Divide an unsigned dividend by an unsigned divisor using
	* a shift-and-subtract algorithm.  The example below shows
	* 43 div 7 = 6 for 8-bit integers.  This classic algorithm uses a
	* single register to store both the dividend and the quotient,
	* allowing both values to be shifted with a single instruction.
	*
	*                               remainder dividend:quotient
	*                               --------- -----------------
	*   initialize                   00000000     00101011:
	*   shift                        00000000     0101011:_
	*   remainder >= divisor? no     00000000     0101011:0
	*   shift                        00000000     101011:0_
	*   remainder >= divisor? no     00000000     101011:00
	*   shift                        00000001     01011:00_
	*   remainder >= divisor? no     00000001     01011:000
	*   shift                        00000010     1011:000_
	*   remainder >= divisor? no     00000010     1011:0000
	*   shift                        00000101     011:0000_
	*   remainder >= divisor? no     00000101     011:00000
	*   shift                        00001010     11:00000_
	*   remainder >= divisor? yes    00001010     11:000001
	*       remainder -= divisor   - 00000111
	*                              ----------
	*                                00000011     11:000001
	*   shift                        00000111     1:000001_
	*   remainder >= divisor? yes    00000111     1:0000011
	*       remainder -= divisor   - 00000111
	*                              ----------
	*                                00000000     1:0000011
	*   shift                        00000001     :0000011_
	*   remainder >= divisor? no     00000001     :00000110
	*
	* The quotient is 00000110.
	*/

divide:
	/*
	*  Prepare for division by assuming the result
	*  is unsigned, and storing its "sign" as 0.
	*/
	movi r17, 0


	/* Which division opcode? */
	xori r7, r4, 0x25		/* OPX of div */
	bne r7, zero, unsigned_division


	/*
	*  OPX is div.  Determine and store the sign of the quotient.
	*  Then take the absolute value of both operands.
	*/
	xor r17, r3, r5		/* MSB contains sign of quotient */
	bge r3,zero,dividend_is_nonnegative
	sub r3, zero, r3	/* -r3 */
dividend_is_nonnegative:
	bge r5, zero, divisor_is_nonnegative
	sub r5, zero, r5	/* -r5 */
divisor_is_nonnegative:


unsigned_division:
	/* Initialize the unsigned-division loop. */
	movi r13, 0	/* remainder = 0 */

	/* Now
	* r3 = dividend : quotient
	* r4 = 0x25 for div, 0x24 for divu
	* r5 = divisor
	* r13 = remainder
	* r14 = loop counter (already initialized to 32)
	* r17 = MSB contains sign of quotient
	*/


	/*
	*   for (count = 32; count > 0; --count)
	*   {
	*/
divide_loop:

	/*
	*       Division:
	*
	*       (remainder:dividend:quotient) <<= 1;
	*/
	slli r13, r13, 1
	cmplt r7, r3, zero	/* r7 = MSB of r3 */
	or r13, r13, r7
	slli r3, r3, 1


	/*
	*       if (remainder >= divisor)
	*       {
	*           set LSB of quotient
	*           remainder -= divisor;
	*       }
	*/
	bltu r13, r5, div_skip
	ori r3, r3, 1
	sub r13, r13, r5
div_skip:

	/*
	*   }
	*/
	subi r14, r14, 1
	bne r14, zero, divide_loop


	/* Now
	* r3 = quotient
	* r4 = 0x25 for div, 0x24 for divu
	* r6 = 4*C
	* r17 = MSB contains sign of quotient
	*/


	/*
	*  Conditionally negate signed quotient.  If quotient is unsigned,
	*  the sign already is initialized to 0.
	*/
	bge r17, zero, quotient_is_nonnegative
	sub r3, zero, r3		/* -r3 */
quotient_is_nonnegative:


	/*
	*  Final quotient is in r3.
	*/
	add r6, r6, sp
	stw r3, 0(r6)	/* write quotient to stack */
	br restore_registers


	/* MULTIPLICATION
	*
	* A "product" is the number that one gets by summing a "multiplicand"
	* several times.  The "multiplier" specifies the number of copies of the
	* multiplicand that are summed.
	*
	* Actual multiplication algorithms don't use repeated addition, however.
	* Shift-and-add algorithms get the same answer as repeated addition, and
	* they are faster.  To compute the lower half of a product (pppp below)
	* one shifts the product left before adding in each of the partial
	* products (a * mmmm) through (d * mmmm).
	*
	* To compute the upper half of a product (PPPP below), one adds in the
	* partial products (d * mmmm) through (a * mmmm), each time following
	* the add by a right shift of the product.
	*
	*     mmmm
	*   * abcd
	*   ------
	*     ####  = d * mmmm
	*    ####   = c * mmmm
	*   ####    = b * mmmm
	*  ####     = a * mmmm
	* --------
	* PPPPpppp
	*
	* The example above shows 4 partial products.  Computing actual Nios II
	* products requires 32 partials.
	*
	* It is possible to compute the result of mulxsu from the result of
	* mulxuu because the only difference between the results of these two
	* opcodes is the value of the partial product associated with the sign
	* bit of rA.
	*
	*   mulxsu = mulxuu - (rA < 0) ? rB : 0;
	*
	* It is possible to compute the result of mulxss from the result of
	* mulxsu because the only difference between the results of these two
	* opcodes is the value of the partial product associated with the sign
	* bit of rB.
	*
	*   mulxss = mulxsu - (rB < 0) ? rA : 0;
	*
	*/

mul_immed:
	/* Opcode is muli.  Change it into mul for remainder of algorithm. */
	mov r6, r5		/* Field B is dest register, not field C. */
	mov r5, r4		/* Field IMM16 is src2, not field B. */
	movi r4, 0x27		/* OPX of mul is 0x27 */

multiply:
	/* Initialize the multiplication loop. */
	movi r9, 0	/* mul_product    = 0 */
	movi r10, 0	/* mulxuu_product = 0 */
	mov r11, r5	/* save original multiplier for mulxsu and mulxss */
	mov r12, r5	/* mulxuu_multiplier (will be shifted) */
	movi r16, 1	/* used to create "rori B,A,1" from "ror B,A,r16" */

	/* Now
	* r3 = multiplicand
	* r5 = mul_multiplier
	* r6 = 4 * dest_register (used later as offset to sp)
	* r7 = temp
	* r9 = mul_product
	* r10 = mulxuu_product
	* r11 = original multiplier
	* r12 = mulxuu_multiplier
	* r14 = loop counter (already initialized)
	* r16 = 1
	*/


	/*
	*   for (count = 32; count > 0; --count)
	*   {
	*/
multiply_loop:

	/*
	*       mul_product <<= 1;
	*       lsb = multiplier & 1;
	*/
	slli r9, r9, 1
	andi r7, r12, 1

	/*
	*       if (lsb == 1)
	*       {
	*           mulxuu_product += multiplicand;
	*       }
	*
	*       When the add is skipped r7 is already 0, so no carry is
	*       ORed into the product below.
	*/
	beq r7, zero, mulx_skip
	add r10, r10, r3
	cmpltu r7, r10, r3 /* Save the carry from the MSB of mulxuu_product. */
	ror r7, r7, r16	/* r7 = 0x80000000 on carry, or else 0x00000000 */
mulx_skip:

	/*
	*       if (MSB of mul_multiplier == 1)
	*       {
	*           mul_product += multiplicand;
	*       }
	*/
	bge r5, zero, mul_skip
	add r9, r9, r3
mul_skip:

	/*
	*       mulxuu_product >>= 1;           logical shift
	*       mul_multiplier <<= 1;           done with MSB
	*       mulx_multiplier >>= 1;          done with LSB
	*/
	srli r10, r10, 1
	or r10, r10, r7		/* OR in the saved carry bit. */
	slli r5, r5, 1
	srli r12, r12, 1


	/*
	*   }
	*/
	subi r14, r14, 1
	bne r14, zero, multiply_loop


	/*
	*  Multiply emulation loop done.
	*/

	/* Now
	* r3 = multiplicand
	* r4 = OPX
	* r6 = 4 * dest_register (used later as offset to sp)
	* r7 = temp
	* r9 = mul_product
	* r10 = mulxuu_product
	* r11 = original multiplier
	*/


	/* Calculate address for result from 4 * dest_register */
	add r6, r6, sp


	/*
	* Select/compute the result based on OPX.
	*/


	/* OPX == mul?  Then store. */
	xori r7, r4, 0x27
	beq r7, zero, store_product

	/* It's one of the mulx.. opcodes.  Move over the result. */
	mov r9, r10

	/* OPX == mulxuu?  Then store. */
	xori r7, r4, 0x07
	beq r7, zero, store_product

	/* Compute mulxsu
	 *
	 * mulxsu = mulxuu - (rA < 0) ? rB : 0;
	 */
	bge r3, zero, mulxsu_skip
	sub r9, r9, r11
mulxsu_skip:

	/* OPX == mulxsu?  Then store. */
	xori r7, r4, 0x17
	beq r7, zero, store_product

	/* Compute mulxss
	 *
	 * mulxss = mulxsu - (rB < 0) ? rA : 0;
	 */
	bge r11,zero,mulxss_skip
	sub r9, r9, r3
mulxss_skip:
	/* At this point, assume that OPX is mulxss, so store */


store_product:
	stw r9, 0(r6)


restore_registers:
			/* No need to restore r0. */
	ldw r5, 100(sp)		/* saved estatus (kept in the r25 slot) */
	wrctl estatus, r5

	ldw r1, 4(sp)
	ldw r2, 8(sp)
	ldw r3, 12(sp)
	ldw r4, 16(sp)
	ldw r5, 20(sp)
	ldw r6, 24(sp)
	ldw r7, 28(sp)
	ldw r8, 32(sp)
	ldw r9, 36(sp)
	ldw r10, 40(sp)
	ldw r11, 44(sp)
	ldw r12, 48(sp)
	ldw r13, 52(sp)
	ldw r14, 56(sp)
	ldw r15, 60(sp)
	ldw r16, 64(sp)
	ldw r17, 68(sp)
	ldw r18, 72(sp)
	ldw r19, 76(sp)
	ldw r20, 80(sp)
	ldw r21, 84(sp)
	ldw r22, 88(sp)
	ldw r23, 92(sp)
			/* Does not need to restore et */
	ldw gp, 104(sp)

	ldw fp, 112(sp)
	ldw ea, 116(sp)
	ldw ra, 124(sp)	/* r31 slot; picks up the result when ra is the dest */
	ldw sp, 108(sp)	/* last restore sp */
	eret

.set at
.set break