Commit a2c95a72247990dee9a03b26b4dc9fc0182c97ed

Authored by Stefan Roese
1 parent fc6c4a67ae

PPC440 DDR setup: Set SDRAM0_CFG0[PMU]=0 for best performance

AMCC suggested to set the PMU bit to 0 for best performance on
the PPC440 DDR controller.
Please see doc/README.440-DDR-performance for details.
Patch by Stefan Roese, 28 Jul 2006

Showing 6 changed files with 103 additions and 7 deletions Side-by-side Diff

... ... @@ -2,6 +2,12 @@
2 2 Changes since U-Boot 1.1.4:
3 3 ======================================================================
4 4  
  5 +* PPC440 DDR setup: Set SDRAM0_CFG0[PMU]=0 for best performance
  6 + AMCC suggested to set the PMU bit to 0 for best performance on
  7 + the PPC440 DDR controller.
  8 + Please see doc/README.440-DDR-performance for details.
  9 + Patch by Stefan Roese, 28 Jul 2006
  10 +
5 11 * AMCC bamboo (440EP) U-Boot image reduced to 384kbyte
6 12 Please see doc/README.bamboo for details.
7 13 Patch by Stefan Roese, 27 Jul 2006
board/amcc/yellowstone/yellowstone.c
... ... @@ -313,13 +313,13 @@
313 313 mtsdram(mem_tr0, 0x410a4012); /* ?? */
314 314 mtsdram(mem_rtr, 0x04080000); /* ?? */
315 315 mtsdram(mem_cfg1, 0x00000000); /* Self-refresh exit, disable PM */
316   - mtsdram(mem_cfg0, 0x34000000); /* Disable EEC */
  316 + mtsdram(mem_cfg0, 0x30000000); /* Disable EEC */
317 317 udelay(400); /* Delay 200 usecs (min) */
318 318  
319 319 /*--------------------------------------------------------------------
320 320 * Enable the controller, then wait for DCEN to complete
321 321 *------------------------------------------------------------------*/
322   - mtsdram(mem_cfg0, 0x84000000); /* Enable */
  322 + mtsdram(mem_cfg0, 0x80000000); /* Enable */
323 323  
324 324 for (;;) {
325 325 mfsdram(mem_mcsts, reg);
board/amcc/yosemite/yosemite.c
... ... @@ -309,13 +309,13 @@
309 309 mtsdram(mem_tr0, 0x410a4012); /* ?? */
310 310 mtsdram(mem_rtr, 0x04080000); /* ?? */
311 311 mtsdram(mem_cfg1, 0x00000000); /* Self-refresh exit, disable PM */
312   - mtsdram(mem_cfg0, 0x34000000); /* Disable EEC */
  312 + mtsdram(mem_cfg0, 0x30000000); /* Disable EEC */
313 313 udelay(400); /* Delay 200 usecs (min) */
314 314  
315 315 /*--------------------------------------------------------------------
316 316 * Enable the controller, then wait for DCEN to complete
317 317 *------------------------------------------------------------------*/
318   - mtsdram(mem_cfg0, 0x84000000); /* Enable */
  318 + mtsdram(mem_cfg0, 0x80000000); /* Enable */
319 319  
320 320 for (;;) {
321 321 mfsdram(mem_mcsts, reg);
... ... @@ -379,7 +379,7 @@
379 379 /*
380 380 * Enable the controller, then wait for DCEN to complete
381 381 */
382   - mtsdram(mem_cfg0, 0x86000000); /* DCEN=1, PMUD=1, 64-bit */
  382 + mtsdram(mem_cfg0, 0x82000000); /* DCEN=1, PMUD=0, 64-bit */
383 383 udelay(10000);
384 384  
385 385 if (get_ram_size(0, mb0cf[i].size) == mb0cf[i].size) {
cpu/ppc4xx/spd_sdram.c
... ... @@ -1007,9 +1007,9 @@
1007 1007 }
1008 1008  
1009 1009 /*
1010   - * program Page Management Unit
  1010 + * program Page Management Unit (0 == enabled)
1011 1011 */
1012   - cfg0 |= SDRAM_CFG0_PMUD;
  1012 + cfg0 &= ~SDRAM_CFG0_PMUD;
1013 1013  
1014 1014 /*
1015 1015 * program Memory Controller Options 0
doc/README.440-DDR-performance
  1 +AMCC suggested to set the PMU bit to 0 for best performance on the
  2 +PPC440 DDR controller. The common PPC440 DDR setup files (sdram.c &
  3 +spd_sdram.c) are changed accordingly, so all PPC440 boards using
  4 +these setup routines will automatically receive this performance
  5 +increase.
  6 +
  7 +Please see below some benchmarks done by AMCC to demonstrate these
  8 +performance changes:
  9 +
  10 +
  11 +----------------------------------------
  12 +SDRAM0_CFG0[PMU] = 1 (U-boot default for Bamboo, Yosemite and Yellowstone)
  13 +----------------------------------------
  14 +Stream benchmark results
  15 +-------------------------------------------------------------
  16 +This system uses 8 bytes per DOUBLE PRECISION word.
  17 +-------------------------------------------------------------
  18 +Array size = 2000000, Offset = 0
  19 +Total memory required = 45.8 MB.
  20 +Each test is run 10 times, but only
  21 +the *best* time for each is used.
  22 +-------------------------------------------------------------
  23 +Your clock granularity/precision appears to be 1 microseconds.
  24 +Each test below will take on the order of 112345 microseconds.
  25 + (= 112345 clock ticks)
  26 +Increase the size of the arrays if this shows that you are not getting
  27 +at least 20 clock ticks per test.
  28 +-------------------------------------------------------------
  29 +WARNING -- The above is only a rough guideline.
  30 +For best results, please be sure you know the precision of your system
  31 +timer.
  32 +-------------------------------------------------------------
  33 +Function Rate (MB/s) RMS time Min time Max time
  34 +Copy: 256.7683 0.1248 0.1246 0.1250
  35 +Scale: 246.0157 0.1302 0.1301 0.1302
  36 +Add: 255.0316 0.1883 0.1882 0.1885
  37 +Triad: 253.1245 0.1897 0.1896 0.1899
  38 +
  39 +
  40 +TTCP Benchmark Results
  41 +ttcp-t: socket
  42 +ttcp-t: connect
  43 +ttcp-t: buflen=8192, nbuf=2048, align=16384/0, port=5000 tcp ->
  44 +localhost
  45 +ttcp-t: 16777216 bytes in 0.28 real seconds = 454.29 Mbit/sec +++
  46 +ttcp-t: 2048 I/O calls, msec/call = 0.14, calls/sec = 7268.57
  47 +ttcp-t: 0.0user 0.1sys 0:00real 60% 0i+0d 0maxrss 0+2pf 3+1506csw
  48 +
  49 +----------------------------------------
  50 +SDRAM0_CFG0[PMU] = 0 (Suggested modification)
  51 +Setting PMU = 0 provides a noticeable performance improvement:
  52 +* 2% to 5% improvement in memory performance.
  53 +* Improves the Mbit/sec for TTCP benchmark by almost 76%.
  54 +----------------------------------------
  55 +Stream benchmark results
  56 +-------------------------------------------------------------
  57 +This system uses 8 bytes per DOUBLE PRECISION word.
  58 +-------------------------------------------------------------
  59 +Array size = 2000000, Offset = 0
  60 +Total memory required = 45.8 MB.
  61 +Each test is run 10 times, but only
  62 +the *best* time for each is used.
  63 +-------------------------------------------------------------
  64 +Your clock granularity/precision appears to be 1 microseconds.
  65 +Each test below will take on the order of 120066 microseconds.
  66 + (= 120066 clock ticks)
  67 +Increase the size of the arrays if this shows that you are not getting
  68 +at least 20 clock ticks per test.
  69 +-------------------------------------------------------------
  70 +WARNING -- The above is only a rough guideline.
  71 +For best results, please be sure you know the precision of your system
  72 +timer.
  73 +-------------------------------------------------------------
  74 +Function Rate (MB/s) RMS time Min time Max time
  75 +Copy: 262.5167 0.1221 0.1219 0.1223
  76 +Scale: 258.4856 0.1238 0.1238 0.1240
  77 +Add: 262.5404 0.1829 0.1828 0.1831
  78 +Triad: 266.8594 0.1800 0.1799 0.1802
  79 +
  80 +TTCP Benchmark Results
  81 +ttcp-t: socket
  82 +ttcp-t: connect
  83 +ttcp-t: buflen=8192, nbuf=2048, align=16384/0, port=5000 tcp ->
  84 +localhost
  85 +ttcp-t: 16777216 bytes in 0.16 real seconds = 804.06 Mbit/sec +++
  86 +ttcp-t: 2048 I/O calls, msec/call = 0.08, calls/sec = 12864.89
  87 +ttcp-t: 0.0user 0.0sys 0:00real 46% 0i+0d 0maxrss 0+2pf 120+1csw
  88 +
  89 +
  90 +2006-07-28, Stefan Roese <sr@denx.de>