soc/cavium: import raw BDK sources

This imports common BDK sources that will be used in subsequent patches. The BDK is licensed under BSD and will be reduced in size and optimized to compile under coreboot. Change-Id: Icb32ee670d9fa9e5c10f9abb298cebf616fa67ad Signed-off-by: David Hendricks <dhendricks@fb.com> Reviewed-on: https://review.coreboot.org/25524 Tested-by: build bot (Jenkins) <no-reply@coreboot.org> Reviewed-by: David Hendricks <david.hendricks@gmail.com>
author: David Hendricks <dhendricks@fb.com> 2018-03-09 13:58:27 -0800
committer: Patrick Georgi <pgeorgi@google.com> 2018-04-06 06:48:11 +0000
commit: 2004b93aed993aa02bbc588b8d82c22418ac52ec (patch)
tree: cdd5e95a154e2e0139474288262835a7f5847665 /src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c
parent: 71cbd71eb5c0e8e13b25b5d5dd2f495e7d2967eb (diff)
download: coreboot-2004b93aed993aa02bbc588b8d82c22418ac52ec.tar.xz
1 files changed, 2012 insertions, 0 deletions
diff --git a/src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c b/src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c
new file mode 100644
index 0000000000..e0e9d4442c
--- /dev/null
+++ b/src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c
@@ -0,0 +1,2012 @@
+/***********************license start***********************************
+* Copyright (c) 2003-2017  Cavium Inc. (support@cavium.com). All rights
+* reserved.
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+*   * Redistributions of source code must retain the above copyright
+*     notice, this list of conditions and the following disclaimer.
+*
+*   * Redistributions in binary form must reproduce the above
+*     copyright notice, this list of conditions and the following
+*     disclaimer in the documentation and/or other materials provided
+*     with the distribution.
+*
+*   * Neither the name of Cavium Inc. nor the names of
+*     its contributors may be used to endorse or promote products
+*     derived from this software without specific prior written
+*     permission.
+*
+* This Software, including technical data, may be subject to U.S. export
+* control laws, including the U.S. Export Administration Act and its
+* associated regulations, and may be subject to export or import
+* regulations in other countries.
+*
+* TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+* AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR
+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT
+* TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY
+* REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT
+* DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES
+* OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR
+* PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT,
+* QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK
+* ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+***********************license end**************************************/
+#include <bdk.h>
+#include "dram-internal.h"
+
+// Verbosity-level print helpers.
+// If enhanced verbosity levels are defined (VB_PRT), map each ddr_printN onto
+// its own verbosity level; otherwise every level falls back to ddr_print.
+#if defined(VB_PRT)
+#define ddr_print2(format, ...) VB_PRT(VBL_FAE,  format, ##__VA_ARGS__)
+#define ddr_print3(format, ...) VB_PRT(VBL_TME,  format, ##__VA_ARGS__)
+#define ddr_print4(format, ...) VB_PRT(VBL_DEV,  format, ##__VA_ARGS__)
+#define ddr_print5(format, ...) VB_PRT(VBL_DEV3, format, ##__VA_ARGS__)
+#else
+#define ddr_print2 ddr_print
+#define ddr_print3 ddr_print // keep the fallback set in step with the VB_PRT branch
+#define ddr_print4 ddr_print
+#define ddr_print5 ddr_print
+#endif
+
+static  int64_t test_dram_byte_threads_done;
+static uint64_t test_dram_byte_threads_errs;
+static uint64_t test_dram_byte_lmc_errs[4];
+
+#if 0
+/*
+ * Suggested testing patterns.
+ */
+static const uint64_t test_pattern_2[] = {
+    0xFFFFFFFFFFFFFFFFULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0xFFFFFFFFFFFFFFFFULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0xFFFFFFFFFFFFFFFFULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0xFFFFFFFFFFFFFFFFULL,
+    0x5555555555555555ULL,
+    0xFFFFFFFFFFFFFFFFULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xFFFFFFFFFFFFFFFFULL,
+    0x5555555555555555ULL,
+};
+ /*
+ *  or possibly
+ */
+static const uint64_t test_pattern_3[] = {
+    0xFDFDFDFDFDFDFDFDULL,
+    0x8787878787878787ULL,
+    0xFEFEFEFEFEFEFEFEULL,
+    0xC3C3C3C3C3C3C3C3ULL,
+    0x7F7F7F7F7F7F7F7FULL,
+    0xE1E1E1E1E1E1E1E1ULL,
+    0xBFBFBFBFBFBFBFBFULL,
+    0xF0F0F0F0F0F0F0F0ULL,
+    0xDFDFDFDFDFDFDFDFULL,
+    0x7878787878787878ULL,
+    0xEFEFEFEFEFEFEFEFULL,
+    0x3C3C3C3C3C3C3C3CULL,
+    0xF7F7F7F7F7F7F7F7ULL,
+    0x1E1E1E1E1E1E1E1EULL,
+    0xFBFBFBFBFBFBFBFBULL,
+    0x0F0F0F0F0F0F0F0FULL,
+};
+
+static const uint64_t test_pattern_1[] = {
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+#if 0 // only need a cacheline size
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+    0xAAAAAAAAAAAAAAAAULL,
+    0x5555555555555555ULL,
+#endif
+};
+
+// setup default for test pattern array
+static const uint64_t *dram_tune_test_pattern = test_pattern_1;
+#endif
+
+// set this to 1 to shorten the testing to exit when all byte lanes have errors
+// having this at 0 forces the testing to take place over the entire range every iteration,
+// hopefully ensuring an even load on the memory subsystem 
+#define EXIT_WHEN_ALL_LANES_HAVE_ERRORS 0
+
+#define DEFAULT_TEST_BURSTS 5 // FIXME: this is what works so far...// FIXME: was 7
+int dram_tune_use_bursts = DEFAULT_TEST_BURSTS;
+
+// dram_tune_rank_offset is used to offset the second area used in test_dram_mem_xor.
+//
+// If only a single-rank DIMM, the offset will be 256MB from the start of the first area,
+//  which is more than enough for the restricted looping/address range actually tested...
+//
+// If a 2-rank DIMM, the offset will be the size of a rank's address space, so the effect
+//  will be to have the first and second areas in different ranks on the same DIMM.
+//
+// So, we default this to single-rank, and it will be overridden when 2-ranks are detected.
+//
+
+// FIXME: ASSUME that we have DIMMS no less than 4GB in size
+
+// offset to first area that avoids any boot stuff in low range (below 256MB)
+#define AREA_BASE_OFFSET (1ULL << 28) // bit 28 always ON
+
+// offset to duplicate area; may coincide with rank 1 base address for 2-rank 4GB DIMM
+#define AREA_DUPE_OFFSET (1ULL << 31) // bit 31 always ON
+
+// defaults to DUPE, but will be set elsewhere to offset to next RANK if multi-rank DIMM
+static uint64_t dram_tune_rank_offset = AREA_DUPE_OFFSET; // default
+
+// defaults to 0, but will be set elsewhere to the address offset to next DIMM if multi-slot
+static uint64_t dram_tune_dimm_offset = 0; // default
+
+
+// Per-speed-bin thresholds, indexed by speed bin (0..2; presumably the value
+// returned by get_speed_bin() - confirm against callers).
+// is_low_risk_offset() accepts an offset whose magnitude is <= speed_bin_offset[bin];
+// is_low_risk_winlen() accepts a window length that is >= speed_bin_winlen[bin].
+static int speed_bin_offset[3] = {25, 20, 15};
+static int speed_bin_winlen[3] = {70, 60, 60};
+
+/*
+ * Sort an LMC into one of three speed bins.
+ *
+ * The speed value is twice the result of libdram_get_freq_from_pll() after
+ * dividing it by 1,000,000 (reported as "MTS" in the debug message):
+ *   bin 0: speed <  1700
+ *   bin 1: speed in 1700..1999
+ *   bin 2: speed >= 2000
+ *
+ * @param node  node the LMC belongs to
+ * @param lmc   LMC number
+ * @return the speed bin, 0..2
+ */
+static int
+get_speed_bin(bdk_node_t node, int lmc)
+{
+    uint32_t data_rate = (libdram_get_freq_from_pll(node, lmc) / 1000000) * 2;
+    int bin;
+
+    // FIXME: is this reasonable speed "binning"?
+    if (data_rate < 1700)
+        bin = 0;
+    else if (data_rate < 2000)
+        bin = 1;
+    else
+        bin = 2;
+
+    debug_print("N%d.LMC%d: %s: returning bin %d for MTS %d\n",
+                node, lmc, __FUNCTION__, bin, data_rate);
+
+    return bin;
+}
+
+// Returns nonzero when the magnitude of 'offset' does not exceed the
+// speed_bin_offset[] limit of the given speed bin.
+static int is_low_risk_offset(int speed_bin, int offset)
+{
+    int limit = speed_bin_offset[speed_bin];
+
+    if (_abs(offset) > limit)
+        return 0;
+    return 1;
+}
+// Returns nonzero when 'winlen' is at least the speed_bin_winlen[] minimum
+// of the given speed bin.
+static int is_low_risk_winlen(int speed_bin, int winlen)
+{
+    int minimum = speed_bin_winlen[speed_bin];
+
+    if (winlen < minimum)
+        return 0;
+    return 1;
+}
+
+// Compile-time switches for dram_tuning_mem_xor():
+//   ENABLE_PREFETCH - issue BDK_PREFETCH on the test areas ahead of the accesses
+//   ENABLE_WBIL2    - issue BDK_CACHE_WBI_L2 / BDK_CACHE_INV_L2 on the lines being tested
+//   ENABLE_SBLKDTY  - clear L2C_CTL[dissblkdty] for the duration of the test, set it afterwards
+#define ENABLE_PREFETCH 0
+#define ENABLE_WBIL2    1
+#define ENABLE_SBLKDTY  0
+
+// Issues a "sys" instruction with the operands below, passing 'address' in a register.
+#define BDK_SYS_CVMCACHE_INV_L2 "#0,c11,c1,#1"          // L2 Cache Invalidate
+#define BDK_CACHE_INV_L2(address) { asm volatile ("sys " BDK_SYS_CVMCACHE_INV_L2 ", %0" : : "r" (address)); }
+
+/*
+ * Write/XOR/verify memory test over two areas addressed through one LMC.
+ *
+ * The incoming address p is OR'ed with (lmc << 7), mapped to the node with
+ * bdk_numa_get_address() and moved up by AREA_BASE_OFFSET.  The second area
+ * lies p2offset (0x10000000) bytes above the first.  The test then runs:
+ *   INIT   - write pattern1 * address into the same slot of both areas;
+ *   MODIFY - read both areas, XOR with a fresh random value, write back
+ *            (the burst count is currently hard-coded to 1);
+ *   TEST   - read both areas and compare with the predicted value.
+ *
+ * @param node      node to test
+ * @param lmc       LMC number, OR'ed into the address at bit 7
+ * @param p         base address of the area before LMC/node mapping
+ * @param bitmask   data bits to check; a byte lane whose bits are clear is not tested
+ * @param xor_data  if not NULL, xor_data[0] receives the accumulated failing
+ *                  data bits and xor_data[1] is written as 0
+ *
+ * @return bitmask of failing byte lanes (bit N set = error seen in byte lane N)
+ */
+int dram_tuning_mem_xor(bdk_node_t node, int lmc, uint64_t p, uint64_t bitmask, uint64_t *xor_data)
+{
+    uint64_t p1, p2, d1, d2;
+    uint64_t v, v1;
+    uint64_t p2offset = 0x10000000/* was: dram_tune_rank_offset; */; // FIXME?
+    uint64_t datamask;
+    uint64_t xor;
+    uint64_t i, j, k;
+    uint64_t ii;
+    int errors = 0;
+    //uint64_t index;
+    uint64_t pattern1 = bdk_rng_get_random64();
+    uint64_t pattern2 = 0;
+    uint64_t bad_bits[2] = {0,0};
+
+#if ENABLE_SBLKDTY
+    BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 0);
+#endif
+
+    // Byte lanes may be clear in the mask to indicate no testing on that lane.
+    datamask = bitmask;
+
+    // final address must include LMC and node
+    p |= (lmc<<7); /* Map address into proper interface */
+    p = bdk_numa_get_address(node, p); /* Map to node */
+
+    /* Add offset to both test regions to not clobber boot stuff
+     * when running from L2 for NAND boot.
+     */
+    p += AREA_BASE_OFFSET; // make sure base is out of the way of boot
+
+// Loop bounds and strides (in bytes) for the nested address loops below:
+// ii is the outermost stride, then k, then j; i steps through the 8-byte
+// slots of one 128-byte line.
+#define II_INC (1ULL << 29)
+#define II_MAX (1ULL << 31)
+#define K_INC  (1ULL << 14)
+#define K_MAX  (1ULL << 20)
+#define J_INC  (1ULL <<  9)
+#define J_MAX  (1ULL << 12)
+#define I_INC  (1ULL <<  3)
+#define I_MAX  (1ULL <<  7)
+
+    debug_print("N%d.LMC%d: dram_tuning_mem_xor: phys_addr=0x%lx\n", 
+              node, lmc, p);
+
+#if 0
+    int ix;
+    // add this loop to fill memory with the test pattern first
+    // loops are ordered so that only entire cachelines are written 
+    for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+	for (k = 0; k < K_MAX; k += K_INC) {
+	    for (j = 0; j < J_MAX; j += J_INC) {
+		p1 = p + ii + k + j;
+		p2 = p1 + p2offset;
+		for (i = 0, ix = 0; i < I_MAX; i += I_INC, ix++) {
+
+		    v = dram_tune_test_pattern[ix];
+		    v1 = v; // write the same thing to both areas
+
+		    __bdk_dram_write64(p1 + i, v);
+		    __bdk_dram_write64(p2 + i, v1);
+
+		}
+#if ENABLE_WBIL2
+		BDK_CACHE_WBI_L2(p1);
+		BDK_CACHE_WBI_L2(p2);
+#endif
+	    }
+	}
+    } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+#endif
+
+#if ENABLE_PREFETCH
+    BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
+    BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+
+    // INIT pass.
+    // loops are ordered so that only a single 64-bit slot is written to each cacheline at one time,
+    // then the cachelines are forced out; this should maximize read/write traffic
+    for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+	for (k = 0; k < K_MAX; k += K_INC) {
+	    for (i = 0; i < I_MAX; i += I_INC) {
+		for (j = 0; j < J_MAX; j += J_INC) {
+
+		    p1 = p + ii + k + j;
+		    p2 = p1 + p2offset;
+
+#if ENABLE_PREFETCH
+                    if (j < (J_MAX - J_INC)) {
+                        BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
+                        BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
+                    }
+#endif
+	    
+		    // the data written depends on the first-area address, so
+		    // the TEST pass can recompute the expected value
+		    v = pattern1 * (p1 + i);
+		    v1 = v; // write the same thing to both areas
+
+		    __bdk_dram_write64(p1 + i, v);
+		    __bdk_dram_write64(p2 + i, v1);
+
+#if ENABLE_WBIL2
+		    BDK_CACHE_WBI_L2(p1);
+		    BDK_CACHE_WBI_L2(p2);
+#endif
+		}
+	    }
+	}
+    } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+
+    BDK_DCACHE_INVALIDATE;
+
+    debug_print("N%d.LMC%d: dram_tuning_mem_xor: done INIT loop\n", 
+              node, lmc);
+
+    /* Make a series of passes over the memory areas. */
+
+    for (int burst = 0; burst < 1/* was: dram_tune_use_bursts*/; burst++)
+    {
+	uint64_t this_pattern = bdk_rng_get_random64();
+	// pattern2 accumulates every XOR applied so far; the TEST pass uses
+	// it to predict the memory contents
+	pattern2 ^= this_pattern;
+
+        /* XOR the data with a random value, applying the change to both
+         * memory areas.
+         */
+#if ENABLE_PREFETCH
+	BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
+	BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+
+	for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+	    for (k = 0; k < K_MAX; k += K_INC) {
+		for (i = 0; i < I_MAX; i += I_INC) { // FIXME: rearranged, did not make much difference?
+		    for (j = 0; j < J_MAX; j += J_INC) {
+
+			p1 = p + ii + k + j;
+			p2 = p1 + p2offset;
+
+#if ENABLE_PREFETCH
+                        if (j < (J_MAX - J_INC)) {
+                            BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
+                            BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
+                        }
+#endif
+	    
+			v  = __bdk_dram_read64(p1 + i) ^ this_pattern;
+			v1 = __bdk_dram_read64(p2 + i) ^ this_pattern;
+
+#if ENABLE_WBIL2
+			BDK_CACHE_INV_L2(p1);
+			BDK_CACHE_INV_L2(p2);
+#endif
+
+			__bdk_dram_write64(p1 + i, v);
+			__bdk_dram_write64(p2 + i, v1);
+
+#if ENABLE_WBIL2
+			BDK_CACHE_WBI_L2(p1);
+			BDK_CACHE_WBI_L2(p2);
+#endif
+		    }
+		}
+	    }
+	} /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+
+        BDK_DCACHE_INVALIDATE;
+
+        debug_print("N%d.LMC%d: dram_tuning_mem_xor: done MODIFY loop\n", 
+                  node, lmc);
+
+#if ENABLE_PREFETCH
+	BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
+	BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+
+        /* Look for differences in the areas. If there is a mismatch, reset
+         * both memory locations with the same pattern. Failing to do so
+         * means that on all subsequent passes the pair of locations remain
+         * out of sync giving spurious errors.
+         */
+	// NOTE(review): the loop below only records mismatches; it does not
+	// rewrite the mismatching locations as the comment above describes.
+	// FIXME: change the loop order so that an entire cache line is compared at one time
+	// FIXME: this is so that a read error that occurs *anywhere* on the cacheline will be caught,
+	// FIXME: rather than comparing only 1 cacheline slot at a time, where an error on a different
+	// FIXME: slot will be missed that time around
+	// Does the above make sense?
+
+	for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+	    for (k = 0; k < K_MAX; k += K_INC) {
+		for (j = 0; j < J_MAX; j += J_INC) {
+
+		    p1 = p + ii + k + j;
+		    p2 = p1 + p2offset;
+
+#if ENABLE_PREFETCH
+                    if (j < (J_MAX - J_INC)) {
+                        BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
+                        BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
+                    }
+#endif
+	    
+		    // process entire cachelines in the innermost loop
+		    for (i = 0; i < I_MAX; i += I_INC) {
+
+			v = ((p1 + i) * pattern1) ^ pattern2; // FIXME: this should predict what we find...???
+			d1 = __bdk_dram_read64(p1 + i);
+			d2 = __bdk_dram_read64(p2 + i);
+
+			xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes
+
+                        if (!xor)
+                            continue;
+
+                        // accumulate bad bits
+                        bad_bits[0] |= xor;
+                        //bad_bits[1] |= ~mpr_data1 & 0xffUL; // cannot do ECC here
+
+			// walk the byte lanes from lane 0 upwards until every
+			// error bit has been attributed to its lane
+			int bybit = 1;
+			uint64_t bymsk = 0xffULL; // start in byte lane 0
+			while (xor != 0) {
+			    debug_print("ERROR(%03d): [0x%016lX] [0x%016lX]  expected 0x%016lX d1 %016lX d2 %016lX\n",
+					burst, p1, p2, v, d1, d2);
+			    if (xor & bymsk) { // error(s) in this lane
+				errors |= bybit; // set the byte error bit
+				xor &= ~bymsk; // clear byte lane in error bits
+				datamask &= ~bymsk; // clear the byte lane in the mask
+#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
+				if (datamask == 0) { // nothing left to do
+				    return errors; // completely done when errors found in all byte lanes in datamask
+				}
+#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
+			    }
+			    bymsk <<= 8; // move mask into next byte lane
+			    bybit <<= 1; // move bit into next byte position
+			}
+		    }
+#if ENABLE_WBIL2
+		    BDK_CACHE_WBI_L2(p1);
+		    BDK_CACHE_WBI_L2(p2);
+#endif
+		}
+	    }
+	} /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+
+        debug_print("N%d.LMC%d: dram_tuning_mem_xor: done TEST loop\n", 
+                  node, lmc);
+
+    } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */
+
+    if (xor_data != NULL) { // send the bad bits back...
+        xor_data[0] = bad_bits[0];
+        xor_data[1] = bad_bits[1]; // let it be zeroed
+    }
+
+#if ENABLE_SBLKDTY
+    BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 1);
+#endif
+
+    return errors;
+}
+
+#undef II_INC
+#undef II_MAX
+
+// EXTRACT: the 'width'-bit field of v whose least significant bit is bit 'lsb'.
+// LMCNO:   XOR of the 'xbits'-wide address fields starting at bits 7, 20 and 12;
+//          dram_tuning_mem_xor2() uses the result as the LMC number of an address.
+#define EXTRACT(v, lsb, width) (((v) >> (lsb)) & ((1ull << (width)) - 1))
+#define LMCNO(address, xbits) (EXTRACT(address, 7, xbits) ^ EXTRACT(address, 20, xbits) ^ EXTRACT(address, 12, xbits))
+
+/*
+ * Write/XOR/verify memory test over two linear areas, attributing errors to
+ * LMCs by address.
+ *
+ * The first area starts at p + AREA_BASE_OFFSET and covers II_MAX (1 << 22)
+ * bytes in 8-byte steps; the second area lies dram_tune_rank_offset bytes
+ * above it.  Both areas are filled with the same address-derived data, then
+ * for each of dram_tune_use_bursts bursts every word is XOR'ed with a fresh
+ * random value and both areas are compared against the predicted contents.
+ * Failing byte lanes are also OR'ed atomically into
+ * test_dram_byte_lmc_errs[] for the LMC computed by LMCNO() from the
+ * first-area address.
+ *
+ * @param p        base address of the first area (before AREA_BASE_OFFSET is added)
+ * @param bitmask  data bits to check; a byte lane whose bits are clear is not tested
+ * @param xbits    width, in bits, of the address fields LMCNO() combines
+ *
+ * @return bitmask of failing byte lanes (bit N set = error seen in byte lane N)
+ */
+static int dram_tuning_mem_xor2(uint64_t p, uint64_t bitmask, int xbits)
+{
+    uint64_t p1, p2, d1, d2;
+    uint64_t v, vpred;
+    uint64_t p2offset = dram_tune_rank_offset; // FIXME?
+    uint64_t datamask;
+    uint64_t xor;
+    uint64_t ii;
+    uint64_t pattern1 = bdk_rng_get_random64();
+    uint64_t pattern2 = 0;
+    int errors = 0;
+    int errs_by_lmc[4] = { 0,0,0,0 };
+    int lmc;
+    uint64_t vbase, vincr;
+
+    // Byte lanes may be clear in the mask to indicate no testing on that lane.
+    datamask = bitmask;
+
+    /* Add offset to both test regions to not clobber boot stuff
+     * when running from L2 for NAND boot.
+     */
+    p += AREA_BASE_OFFSET; // make sure base is out of the way of boot
+
+    // move the multiplies outside the loop
+    // (data for address a is a * pattern1, so each 8-byte step adds 8 * pattern1)
+    vbase = p * pattern1;
+    vincr = 8 * pattern1;
+
+#define II_INC (1ULL <<  3)
+#define II_MAX (1ULL << 22) // stop where the core ID bits start
+
+    // walk the memory areas by 8-byte words
+    v = vbase;
+    for (ii = 0; ii < II_MAX; ii += II_INC) {
+
+	p1 = p + ii;
+	p2 = p1 + p2offset;
+
+	__bdk_dram_write64(p1, v);
+	__bdk_dram_write64(p2, v);
+
+        v += vincr;
+    }
+
+    __bdk_dram_flush_to_mem_range(p           , p            + II_MAX);
+    __bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + II_MAX);
+    BDK_DCACHE_INVALIDATE;
+
+    /* Make a series of passes over the memory areas. */
+
+    for (int burst = 0; burst < dram_tune_use_bursts; burst++)
+    {
+	uint64_t this_pattern = bdk_rng_get_random64();
+	// pattern2 accumulates every XOR applied so far; the compare loop
+	// uses it to predict the memory contents
+	pattern2 ^= this_pattern;
+
+        /* XOR the data with a random value, applying the change to both
+         * memory areas.
+         */
+#if 0
+	BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
+	BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+	for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+
+	    p1 = p + ii;
+	    p2 = p1 + p2offset;
+
+	    d1 = __bdk_dram_read64(p1) ^ this_pattern;
+	    d2 = __bdk_dram_read64(p2) ^ this_pattern;
+
+	    __bdk_dram_write64(p1, d1);
+	    __bdk_dram_write64(p2, d2);
+
+	}
+	__bdk_dram_flush_to_mem_range(p           , p            + II_MAX);
+	__bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + II_MAX);
+        BDK_DCACHE_INVALIDATE;
+
+        /* Look for differences in the areas. If there is a mismatch, reset
+         * both memory locations with the same pattern. Failing to do so
+         * means that on all subsequent passes the pair of locations remain
+         * out of sync giving spurious errors.
+         */
+	// NOTE(review): the loop below only records mismatches; it does not
+	// rewrite the mismatching locations as the comment above describes.
+#if 0
+	BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
+	BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+        vpred = vbase;
+	for (ii = 0; ii < II_MAX; ii += II_INC) {
+
+	    p1 = p + ii;
+	    p2 = p1 + p2offset;
+
+	    v = vpred ^ pattern2; // this should predict what we find...
+	    d1 = __bdk_dram_read64(p1);
+	    d2 = __bdk_dram_read64(p2);
+            vpred += vincr;
+
+	    xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes
+	    if (!xor) // no errors
+		continue;
+
+	    lmc = LMCNO(p1, xbits); // FIXME: LMC should be SAME for p1 and p2!!!
+	    if (lmc != (int)LMCNO(p2, xbits)) {
+		printf("ERROR: LMCs for addresses [0x%016lX] (%lld) and [0x%016lX] (%lld) differ!!!\n",
+		       p1, LMCNO(p1, xbits), p2, LMCNO(p2, xbits));
+	    }
+	    // walk the byte lanes from lane 0 upwards until every error bit
+	    // has been attributed to its lane
+	    int bybit = 1;
+	    uint64_t bymsk = 0xffULL; // start in byte lane 0
+	    while (xor != 0) {
+		debug_print("ERROR(%03d): [0x%016lX] [0x%016lX]  expected 0x%016lX d1 %016lX d2 %016lX\n",
+			    burst, p1, p2, v, d1, d2);
+		if (xor & bymsk) { // error(s) in this lane
+		    errs_by_lmc[lmc] |= bybit; // set the byte error bit in the LMCs errors
+		    errors |= bybit; // set the byte error bit
+		    xor &= ~bymsk; // clear byte lane in error bits
+		    //datamask &= ~bymsk; // clear the byte lane in the mask
+		}
+		bymsk <<= 8; // move mask into next byte lane
+		bybit <<= 1; // move bit into next byte position
+	    } /* while (xor != 0) */
+	} /* for (ii = 0; ii < II_MAX; ii += II_INC) */
+    } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */
+
+    // update the global LMC error states
+    for (lmc = 0; lmc < 4; lmc++) {
+	if (errs_by_lmc[lmc]) {
+	    bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_lmc_errs[lmc], errs_by_lmc[lmc]);
+	}
+    }
+
+    return errors;
+}
+
+#if 0
+static int dram_tuning_mem_rows(uint64_t p, uint64_t bitmask)
+{
+    uint64_t p1, p2, d1, d2;
+    uint64_t v, v1;
+    uint64_t p2offset = dram_tune_rank_offset; // FIXME?
+    uint64_t datamask;
+    uint64_t xor;
+    int i, j, k, ii;
+    int errors = 0;
+    int index;
+    uint64_t pattern1 = 0; // FIXME: maybe this could be from a table?
+    uint64_t pattern2;
+
+    // Byte lanes may be clear in the mask to indicate no testing on that lane.
+    datamask = bitmask;
+
+    /* Add offset to both test regions to not clobber boot stuff
+     * when running from L2 for NAND boot.
+     */
+    p += 0x10000000; // FIXME? was: 0x4000000; // make sure base is out of the way of cores for tuning
+
+    pattern2 = pattern1;
+    for (k = 0; k < (1 << 20); k += (1 << 14)) {
+	for (j = 0; j < (1 << 12); j += (1 << 9)) {
+	    for (i = 0; i < (1 << 7); i += 8) {
+		index = i + j + k;
+		p1 = p + index;
+		p2 = p1 + p2offset;
+
+		v = pattern2;
+		v1 = v; // write the same thing to same slot in both cachelines
+		pattern2 = ~pattern2; // flip bits for next slots
+
+		__bdk_dram_write64(p1, v);
+		__bdk_dram_write64(p2, v1);
+	    }
+#if 1
+	    BDK_CACHE_WBI_L2(p1);
+	    BDK_CACHE_WBI_L2(p2);
+#endif
+	}
+    }
+
+#if 0
+    __bdk_dram_flush_to_mem_range(p, p + (1ULL << 20)); // max_addr is start + where k stops...
+    __bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + (1ULL << 20)); // max_addr is start + where k stops...
+#endif
+    BDK_DCACHE_INVALIDATE;
+
+    /* Make a series of passes over the memory areas. */
+
+    for (int burst = 0; burst < dram_tune_use_bursts; burst++)
+    {
+        /* just read and flip the bits applying the change to both
+         * memory areas.
+         */
+	for (k = 0; k < (1 << 20); k += (1 << 14)) {
+	    for (j = 0; j < (1 << 12); j += (1 << 9)) {
+		for (i = 0; i < (1 << 7); i += 8) {
+		    index = i + j + k;
+		    p1 = p + index;
+		    p2 = p1 + p2offset;
+
+		    v  = ~__bdk_dram_read64(p1);
+		    v1 = ~__bdk_dram_read64(p2);
+
+		    __bdk_dram_write64(p1, v);
+		    __bdk_dram_write64(p2, v1);
+		}
+#if 1
+		BDK_CACHE_WBI_L2(p1);
+		BDK_CACHE_WBI_L2(p2);
+#endif
+	    }
+	}
+
+#if 0
+	__bdk_dram_flush_to_mem_range(p, p + (1ULL << 20)); // max_addr is start + where k stops...
+	__bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + (1ULL << 20)); // max_addr is start + where k stops...
+#endif
+        BDK_DCACHE_INVALIDATE;
+
+        /* Look for differences in the areas. If there is a mismatch, reset
+         * both memory locations with the same pattern. Failing to do so
+         * means that on all subsequent passes the pair of locations remain
+         * out of sync giving spurious errors.
+         */
+
+	// FIXME: change the loop order so that an entire cache line is compared at one time
+	// FIXME: this is so that a read error that occurs *anywhere* on the cacheline will be caught,
+	// FIXME: rather than comparing only 1 cacheline slot at a time, where an error on a different
+	// FIXME: slot will be missed that time around
+	// Does the above make sense?
+
+	pattern2 = ~pattern1; // slots have been flipped by the above loop
+
+	for (k = 0; k < (1 << 20); k += (1 << 14)) {
+	    for (j = 0; j < (1 << 12); j += (1 << 9)) {
+		for (i = 0; i < (1 << 7); i += 8) {
+		    index = i + j + k;
+		    p1 = p + index;
+		    p2 = p1 + p2offset;
+
+		    v = pattern2; // FIXME: this should predict what we find...???
+		    d1 = __bdk_dram_read64(p1);
+		    d2 = __bdk_dram_read64(p2);
+		    pattern2 = ~pattern2; // flip for next slot
+
+		    xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes
+
+		    int bybit = 1;
+		    uint64_t bymsk = 0xffULL; // start in byte lane 0
+		    while (xor != 0) {
+			debug_print("ERROR(%03d): [0x%016lX] [0x%016lX]  expected 0x%016lX d1 %016lX d2 %016lX\n",
+				  burst, p1, p2, v, d1, d2);
+			if (xor & bymsk) { // error(s) in this lane
+			    errors |= bybit; // set the byte error bit
+			    xor &= ~bymsk; // clear byte lane in error bits
+			    datamask &= ~bymsk; // clear the byte lane in the mask
+#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
+			    if (datamask == 0) { // nothing left to do
+				return errors; // completely done when errors found in all byte lanes in datamask
+			    }
+#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
+			}
+			bymsk <<= 8; // move mask into next byte lane
+			bybit <<= 1; // move bit into next byte position
+		    }
+		}
+	    }
+	}
+	pattern1 = ~pattern1; // flip the starting pattern for the next burst
+
+    } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */
+    return errors;
+}
+#endif
+
+// cores to use
+#define DEFAULT_USE_CORES 44   // FIXME: was (1 << CORE_BITS)
+int dram_tune_use_cores = DEFAULT_USE_CORES; // max cores to use, override available
+int dram_tune_max_cores; // max cores available on a node
+#define CORE_SHIFT 22          // FIXME: offset into rank_address passed to test_dram_byte
+
+typedef void (*__dram_tuning_thread_t)(int arg, void *arg1);
+
+// Arguments shared by all tuning threads started by run_dram_tuning_threads().
+typedef struct
+{
+    bdk_node_t node;    // node whose memory is tested
+    int64_t num_lmcs;   // LMC count; dram_tuning_thread() reads a negative value as (desired LMC - 10)
+    uint64_t byte_mask; // handed to the test routine as its bitmask of data bits to check
+} test_dram_byte_info_t;
+
+/*
+ * Per-core worker for the LMC-based tuning test (dram_tuning_mem_xor).
+ *
+ * Chooses an LMC for the core, builds a per-core base address, runs the
+ * test, ORs the failing byte lanes into test_dram_byte_threads_errs and
+ * test_dram_byte_lmc_errs[lmc], and finally increments
+ * test_dram_byte_threads_done so the launcher can stop waiting.
+ *
+ * @param arg   core number this thread runs as
+ * @param arg1  pointer to the shared test_dram_byte_info_t
+ */
+static void dram_tuning_thread(int arg, void *arg1)
+{
+    test_dram_byte_info_t *test_info = arg1;
+    int core = arg;
+    uint64_t errs;
+    bdk_node_t node = test_info->node;
+    int num_lmcs, lmc;
+    const int max_lmcs = (int)(sizeof(test_dram_byte_lmc_errs) / sizeof(test_dram_byte_lmc_errs[0]));
+
+    // FIXME: this code should allow running all the cores on a single LMC...
+    // if incoming num_lmcs > 0, then use as normal; if < 0 remap to a single LMC
+    if (test_info->num_lmcs > 0) {
+        num_lmcs = test_info->num_lmcs;
+        // map core numbers into hopefully equal groups per LMC
+        lmc = core % num_lmcs;
+    } else if (test_info->num_lmcs < 0) {
+        num_lmcs = 1;
+        // incoming num_lmcs is (desired LMC - 10)
+        lmc = 10 + test_info->num_lmcs;
+    } else {
+        // num_lmcs == 0 would make "core % num_lmcs" a division by zero;
+        // fall back to a single LMC
+        num_lmcs = 1;
+        lmc = 0;
+    }
+
+    // lmc is shifted into the test address and indexes test_dram_byte_lmc_errs[];
+    // refuse values outside the table, but still report completion so the
+    // launcher does not wait forever
+    if (lmc < 0 || lmc >= max_lmcs) {
+        bdk_error("Core %d: invalid LMC %d for test_dram_byte\n", core, lmc);
+        bdk_atomic_add64_nosync(&test_dram_byte_threads_done, 1);
+        return;
+    }
+
+    uint64_t base_address = 0/* was: (lmc << 7); now done by callee */;
+    uint64_t bytemask = test_info->byte_mask;
+
+    /* Figure out our work memory range.
+     *
+     * Note: base_address above just provides the physical offset which determines
+     * specific LMC portions of the address space and does not have the node bits set.
+     */
+    //was: base_address  = bdk_numa_get_address(node, base_address); // map to node // now done by callee
+    // widen before shifting so the shift is done in 64 bits
+    base_address |= ((uint64_t)core << CORE_SHIFT); // FIXME: also put full core into address
+    if (dram_tune_dimm_offset) { // if multi-slot in some way, choose a DIMM for the core
+        base_address |= (core & (1 << (num_lmcs >> 1))) ? dram_tune_dimm_offset : 0;
+    }
+
+    debug_print("Node %d, core %d, Testing area 1 at 0x%011lx, area 2 at 0x%011lx\n",
+                node, core, base_address + AREA_BASE_OFFSET,
+                base_address + AREA_BASE_OFFSET + dram_tune_rank_offset);
+
+    errs = dram_tuning_mem_xor(node, lmc, base_address, bytemask, NULL);
+    //errs = dram_tuning_mem_rows(base_address, bytemask);
+
+    /* Report that we're done */
+    debug_print("Core %d on LMC %d node %d done with test_dram_byte with 0x%lx errs\n",
+                core, lmc, node, errs);
+
+    if (errs) {
+        bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_threads_errs, errs);
+        bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_lmc_errs[lmc], errs);
+    }
+
+    bdk_atomic_add64_nosync(&test_dram_byte_threads_done, 1);
+
+    return;
+}
+
+/*
+ * Per-core worker for the address-based tuning test (dram_tuning_mem_xor2).
+ *
+ * Builds a per-core base address on the given node, runs the test, ORs the
+ * failing byte lanes into test_dram_byte_threads_errs and finally increments
+ * test_dram_byte_threads_done so the launcher can stop waiting.  Per-LMC
+ * error state is updated by dram_tuning_mem_xor2() itself.
+ *
+ * @param arg   core number this thread runs as
+ * @param arg1  pointer to the shared test_dram_byte_info_t
+ */
+static void dram_tuning_thread2(int arg, void *arg1)
+{
+    test_dram_byte_info_t *test_info = arg1;
+    int core = arg;
+    uint64_t errs;
+    bdk_node_t node = test_info->node;
+    int num_lmcs = test_info->num_lmcs;
+
+    uint64_t base_address = 0;
+    uint64_t bytemask = test_info->byte_mask;
+
+    /* Figure out our work memory range.
+     *
+     * Note: base_address above just provides the physical offset which determines
+     * specific portions of the address space and does not have the node bits set.
+     */
+    base_address  = bdk_numa_get_address(node, base_address); // map to node
+    // widen before shifting so the shift is done in 64 bits
+    base_address |= ((uint64_t)core << CORE_SHIFT); // FIXME: also put full core into address
+    if (dram_tune_dimm_offset) { // if multi-slot in some way, choose a DIMM for the core
+        base_address |= (core & 1) ? dram_tune_dimm_offset : 0;
+    }
+
+    debug_print("Node %d, core %d, Testing area 1 at 0x%011lx, area 2 at 0x%011lx\n",
+                node, core, base_address + AREA_BASE_OFFSET,
+                base_address + AREA_BASE_OFFSET + dram_tune_rank_offset);
+
+    errs = dram_tuning_mem_xor2(base_address, bytemask, (num_lmcs >> 1)); // 4->2, 2->1, 1->0
+    //errs = dram_tuning_mem_rows(base_address, bytemask);
+
+    /* Report that we're done */
+    // This thread is not bound to one LMC, so no LMC number is reported here
+    // (no "lmc" variable exists in this function).
+    debug_print("Core %d node %d done with test_dram_byte with 0x%lx errs\n",
+                core, node, errs);
+
+    if (errs) {
+        bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_threads_errs, errs);
+        // FIXME: this will have been done already in the called test routine
+        //bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_lmc_errs[lmc], errs);
+    }
+
+    bdk_atomic_add64_nosync(&test_dram_byte_threads_done, 1);
+
+    return;
+}
+
+static int dram_tune_use_xor2 = 1; // FIXME: do NOT default to original mem_xor (LMC-based) code
+
+/*
+ * Run one tuning-test worker per core and wait for all of them.
+ *
+ * Resets the shared done/error counters, starts dram_tune_use_cores threads
+ * on the node (dram_tuning_thread2 when dram_tune_use_xor2 is set, otherwise
+ * dram_tuning_thread), then yields until every successfully created thread
+ * has reported completion.  A progress line is printed each time
+ * TIMEOUT_SECS elapse while still waiting.
+ *
+ * @param node      node to run the threads on
+ * @param num_lmcs  LMC count handed to the workers
+ * @param bytemask  data bits the workers should check
+ *
+ * @return the accumulated byte-lane error bits across all workers
+ */
+static int
+run_dram_tuning_threads(bdk_node_t node, int num_lmcs, uint64_t bytemask)
+{
+    test_dram_byte_info_t shared_args;
+    __dram_tuning_thread_t worker;
+    int started = 0;
+
+    if (dram_tune_use_xor2)
+        worker = dram_tuning_thread2;
+    else
+        worker = dram_tuning_thread;
+
+    shared_args.node = node;
+    shared_args.num_lmcs = num_lmcs;
+    shared_args.byte_mask = bytemask;
+
+    // reset the shared completion and error state
+    bdk_atomic_set64(&test_dram_byte_threads_done, 0);
+    bdk_atomic_set64((int64_t *)&test_dram_byte_threads_errs, 0);
+    for (int idx = 0; idx < 4; idx++)
+        bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[idx], 0);
+
+    /* Start threads for cores on the node */
+    if (bdk_numa_exists(node)) {
+        debug_print("Starting %d threads for test_dram_byte\n", dram_tune_use_cores);
+        for (int core = 0; core < dram_tune_use_cores; core++) {
+            if (bdk_thread_create(node, 0, worker, core, (void *)&shared_args, 0) == 0) {
+                started++;
+                continue;
+            }
+            bdk_error("Failed to create thread %d for test_dram_byte\n", core);
+        }
+    }
+
+#define TIMEOUT_SECS 5  // FIXME: long enough so a pass for a given setting will not print
+    /* Wait for threads to finish, with progress */
+    int finished;
+    uint64_t now;
+    uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME?
+    uint64_t deadline = bdk_clock_get_count(BDK_CLOCK_TIME) + period;
+    for (;;) {
+        bdk_thread_yield();
+        finished = bdk_atomic_get64(&test_dram_byte_threads_done);
+        now = bdk_clock_get_count(BDK_CLOCK_TIME);
+        if (now >= deadline) {
+            printf("Waiting for %d cores\n", started - finished);
+            deadline = now + period;
+        }
+        if (finished >= started)
+            break;
+    }
+
+    // NOTE: this is the summary of errors across all LMCs
+    return (int)bdk_atomic_get64((int64_t *)&test_dram_byte_threads_errs);
+}
+
+/* These variables count the number of ECC errors. They should only be accessed atomically */
+extern int64_t __bdk_dram_ecc_single_bit_errors[];
+extern int64_t __bdk_dram_ecc_double_bit_errors[];
+
+#if 0
+// make the tuning test callable as a standalone
+/*
+ * NOTE: this whole function is compiled out by the surrounding "#if 0".
+ *
+ * Runs the multi-core DRAM tuning memory test once on 'node' without
+ * changing any DLL offsets, prints per-LMC utilization and ECC counters,
+ * and returns the sum of compare errors and single/double bit ECC errors
+ * (0 is also returned when fewer than 2 cores are running).
+ *
+ * Core count, burst count and the mem_xor variant can be overridden through
+ * the ddr_tune_use_cores / ddr_tune_use_bursts / ddr_tune_use_xor2
+ * environment variables. dram_tune_use_bursts is restored before returning;
+ * the other globals modified here are left changed.
+ */
+int
+bdk_run_dram_tuning_test(int node)
+{
+    int num_lmcs = __bdk_dram_get_num_lmc(node);
+    const char *s;
+    int lmc, byte;
+    int errors;
+    uint64_t start_dram_dclk[4], start_dram_ops[4];
+    int save_use_bursts;
+
+    // check for the cores on this node, abort if not more than 1 // FIXME?
+    dram_tune_max_cores = bdk_get_num_running_cores(node);
+    if (dram_tune_max_cores < 2) {
+	//bdk_init_cores(node, 0);
+	printf("N%d: ERROR: not enough cores to run the DRAM tuning test.\n", node);
+	return 0;
+    }
+
+    // but use only a certain number of cores, at most what is available
+    if ((s = getenv("ddr_tune_use_cores")) != NULL) {
+	dram_tune_use_cores = strtoul(s, NULL, 0);
+	if (dram_tune_use_cores <= 0) // allow 0 or negative to mean all
+	    dram_tune_use_cores = dram_tune_max_cores;
+    }
+    if (dram_tune_use_cores > dram_tune_max_cores)
+	dram_tune_use_cores = dram_tune_max_cores;
+
+    // save the original bursts, so we can replace it with a better number for just testing
+    save_use_bursts = dram_tune_use_bursts;
+    dram_tune_use_bursts = 1500; // FIXME: hard code bursts for the test here...
+
+    // allow override of the test repeats (bursts) per thread create
+    if ((s = getenv("ddr_tune_use_bursts")) != NULL) {
+        dram_tune_use_bursts = strtoul(s, NULL, 10);
+    }
+
+    // allow override of the test mem_xor algorithm
+    if ((s = getenv("ddr_tune_use_xor2")) != NULL) {
+        dram_tune_use_xor2 = !!strtoul(s, NULL, 10);
+    }
+
+    // FIXME? consult LMC0 only
+    BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(0));
+    if (lmcx_config.s.rank_ena) { // replace the default offset when there is more than 1 rank...
+	dram_tune_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
+	ddr_print("N%d: run_dram_tuning_test: changing rank offset to 0x%lx\n", node, dram_tune_rank_offset);
+    }
+    if (lmcx_config.s.init_status & 0x0c) { // bit 2 or 3 set indicates 2 DIMMs
+	dram_tune_dimm_offset = 1ull << (28 + lmcx_config.s.pbank_lsb + (num_lmcs/2));
+	ddr_print("N%d: run_dram_tuning_test: changing dimm offset to 0x%lx\n", node, dram_tune_dimm_offset);
+    }
+    int ddr_interface_64b = !lmcx_config.s.mode32b;
+
+    // construct the bytemask
+    // (lanes 0-7 in 64-bit mode, lanes 0-3 in 32-bit mode; one mask byte per lane)
+    int bytes_todo = (ddr_interface_64b) ? 0xff : 0x0f; // FIXME: hack?
+    uint64_t bytemask = 0;
+    for (byte = 0; byte < 8; ++byte) {
+	uint64_t bitmask;
+	if (bytes_todo & (1 << byte)) {
+	    // NOTE(review): the 0x0f case needs !64b and byte == 4, but in 32-bit
+	    // mode bytes_todo excludes byte 4, so this always yields 0xff
+	    bitmask = ((!ddr_interface_64b) && (byte == 4)) ? 0x0f: 0xff;
+	    bytemask |= bitmask << (8*byte); // set the bytes bits in the bytemask 
+	}
+    } /* for (byte = 0; byte < 8; ++byte) */
+
+    // print current working values
+    ddr_print("N%d: run_dram_tuning_test: max %d cores, use %d cores, use %d bursts.\n",
+	      node, dram_tune_max_cores, dram_tune_use_cores, dram_tune_use_bursts);
+
+    // do the setup on active LMCs
+    for (lmc = 0; lmc < num_lmcs; lmc++) {
+	// record start cycle CSRs here for utilization measure
+	start_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
+	start_dram_ops[lmc]  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
+#if 0
+	bdk_atomic_set64(&__bdk_dram_ecc_single_bit_errors[lmc], 0);
+	bdk_atomic_set64(&__bdk_dram_ecc_double_bit_errors[lmc], 0);
+#else
+	__bdk_dram_ecc_single_bit_errors[lmc] = 0;
+	__bdk_dram_ecc_double_bit_errors[lmc] = 0;
+#endif
+    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+    bdk_watchdog_poke();
+
+    // run the test(s)
+    // only 1 call should be enough, let the bursts, etc, control the load...  
+    errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
+
+    /* Check ECC error counters after the test */
+    int64_t ecc_single = 0;
+    int64_t ecc_double = 0;
+    // NOTE(review): only entries [0, num_lmcs) of these two arrays are filled
+    // in below, yet all four entries are printed when an ECC error is seen
+    int64_t ecc_single_errs[4];
+    int64_t ecc_double_errs[4];
+
+    // finally, print the utilizations all together, and sum the ECC errors
+    for (lmc = 0; lmc < num_lmcs; lmc++) {
+	uint64_t dclk_diff = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc)) - start_dram_dclk[lmc];
+	uint64_t ops_diff  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc)) - start_dram_ops[lmc];
+	// NOTE(review): divides by dclk_diff without checking it for zero
+	uint64_t percent_x10 = ops_diff * 1000 / dclk_diff;
+	printf("N%d.LMC%d: ops %lu, cycles %lu, used %lu.%lu%%\n",
+		  node, lmc, ops_diff, dclk_diff, percent_x10 / 10, percent_x10 % 10);
+
+        ecc_single += (ecc_single_errs[lmc] = bdk_atomic_get64(&__bdk_dram_ecc_single_bit_errors[lmc]));
+        ecc_double += (ecc_double_errs[lmc] = bdk_atomic_get64(&__bdk_dram_ecc_double_bit_errors[lmc]));
+    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+    /* Always print any ECC errors */
+    if (ecc_single || ecc_double) {
+        printf("Test \"%s\": ECC errors, %ld/%ld/%ld/%ld corrected, %ld/%ld/%ld/%ld uncorrected\n",
+	       "DRAM Tuning Test",
+	       ecc_single_errs[0], ecc_single_errs[1], ecc_single_errs[2], ecc_single_errs[3],
+	       ecc_double_errs[0], ecc_double_errs[1], ecc_double_errs[2], ecc_double_errs[3]);
+    }
+    if (errors || ecc_double || ecc_single) {
+	printf("Test \"%s\": FAIL: %ld single, %ld double, %d compare errors\n",
+	       "DRAM Tuning Test", ecc_single, ecc_double, errors);
+    }
+
+    // restore bursts
+    dram_tune_use_bursts = save_use_bursts;
+
+    return (errors + ecc_double + ecc_single);
+}
+#endif /* 0 */
+
+#define DEFAULT_SAMPLE_GRAN 3 // sample for errors every N offset values
+// inclusive bounds of the DLL byte offset sweep done by auto_set_dll_offset()
+#define MIN_BYTE_OFFSET -63
+#define MAX_BYTE_OFFSET +63
+// step between sampled offsets in auto_set_dll_offset(); it is used there as
+// a modulo divisor and loop increment, and perform_dll_offset_tuning() may
+// overwrite it from the "ddr_tune_use_gran" environment variable
+int dram_tune_use_gran = DEFAULT_SAMPLE_GRAN;
+
+/*
+ * Sweep the DLL byte offset over [MIN_BYTE_OFFSET, MAX_BYTE_OFFSET] on all
+ * LMCs at once, run the multi-core memory test at each sampled offset, and
+ * record for every LMC/byte lane the longest run of consecutive passing
+ * offsets (its start and length).
+ *
+ * Afterwards, for each LMC and byte lane, the middle of the best window is
+ * printed and, in Read mode, loaded as the lane's DLL offset; in any other
+ * mode 0 is loaded instead. The ECC lane (8) is held at offset 0 during the
+ * sweep and is reported as passing over the full range.
+ *
+ * node              - node to operate on
+ * dll_offset_mode   - 2 is treated as "Read", any other value as "Write";
+ *                     passed through to load_dll_offset()
+ * num_lmcs          - number of LMCs to sweep; local arrays hold at most 4
+ * ddr_interface_64b - non-zero: test byte lanes 0-7, zero: lanes 0-3 only
+ * do_tune           - non-zero: tuning run (compact output);
+ *                     zero: margining run (full tables and summary printed)
+ *
+ * Returns, when do_tune is non-zero, the error value of one final test run
+ * made with the loaded offsets; otherwise 1 if any lane was flagged as
+ * needing review and 0 if none was.
+ *
+ * A non-positive dram_tune_use_gran is replaced by DEFAULT_SAMPLE_GRAN here,
+ * since the value is used as a modulo divisor and as the loop increment.
+ */
+static int
+auto_set_dll_offset(bdk_node_t node, int dll_offset_mode,
+                    int num_lmcs, int ddr_interface_64b,
+                    int do_tune)
+{
+    int byte_offset;
+    int byte;
+    int byte_delay_start[4][9];       // start of the current passing run
+    int byte_delay_count[4][9];       // length of the current passing run
+    uint64_t byte_delay_windows[4][9]; // pass(1)/fail(0) history, newest in bit 0
+    int byte_delay_best_start[4][9];  // start of the longest passing run
+    int byte_delay_best_count[4][9];  // length of the longest passing run
+    uint64_t ops_sum[4], dclk_sum[4];
+    uint64_t start_dram_dclk[4], stop_dram_dclk[4];
+    uint64_t start_dram_ops[4], stop_dram_ops[4];
+    int errors, tot_errors;
+    int lmc;
+    int mode_is_read = (dll_offset_mode == 2);
+    const char *mode_str = mode_is_read ? "Read" : "Write";
+    const char *mode_blk = mode_is_read ? " " : ""; // pads "Read" to the width of "Write"
+    int start_offset, end_offset, incr_offset;
+
+    int speed_bin = get_speed_bin(node, 0); // FIXME: just get from LMC0?
+    int needs_review_count = 0;
+
+    // a zero step would divide by zero below and a negative one would never
+    // reach end_offset, so fall back to the default in both cases
+    incr_offset = dram_tune_use_gran;
+    if (incr_offset <= 0) {
+        bdk_error("N%d: Invalid sample granularity %d, using default %d\n",
+                  node, incr_offset, DEFAULT_SAMPLE_GRAN);
+        incr_offset = DEFAULT_SAMPLE_GRAN;
+    }
+    if (incr_offset != DEFAULT_SAMPLE_GRAN) {
+        ddr_print2("N%d: Changing sample granularity from %d to %d\n",
+                   node, DEFAULT_SAMPLE_GRAN, incr_offset);
+    }
+    // ensure sample is taken at 0
+    start_offset = MIN_BYTE_OFFSET - (MIN_BYTE_OFFSET % incr_offset);
+    end_offset   = MAX_BYTE_OFFSET - (MAX_BYTE_OFFSET % incr_offset);
+
+    memset(ops_sum, 0, sizeof(ops_sum));
+    memset(dclk_sum, 0, sizeof(dclk_sum));
+    memset(byte_delay_start, 0, sizeof(byte_delay_start));
+    memset(byte_delay_count, 0, sizeof(byte_delay_count));
+    memset(byte_delay_windows, 0, sizeof(byte_delay_windows));
+    memset(byte_delay_best_start, 0, sizeof(byte_delay_best_start));
+    memset(byte_delay_best_count, 0, sizeof(byte_delay_best_count));
+
+    // FIXME? consult LMC0 only
+    BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(0));
+    if (lmcx_config.s.rank_ena) { // replace the default offset when there is more than 1 rank...
+        dram_tune_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
+        ddr_print2("N%d: Tuning multiple ranks per DIMM (rank offset 0x%lx).\n", node, dram_tune_rank_offset);
+    }
+    if (lmcx_config.s.init_status & 0x0c) { // bit 2 or 3 set indicates 2 DIMMs
+        dram_tune_dimm_offset = 1ull << (28 + lmcx_config.s.pbank_lsb + (num_lmcs/2));
+        ddr_print2("N%d: Tuning multiple DIMMs per channel (DIMM offset 0x%lx)\n", node, dram_tune_dimm_offset);
+    }
+
+    // construct the bytemask: one 0xff byte per data lane under test
+    int bytes_todo = (ddr_interface_64b) ? 0xff : 0x0f;
+    uint64_t bytemask = 0;
+    for (byte = 0; byte < 8; ++byte) {
+        if (bytes_todo & (1 << byte)) {
+            bytemask |= 0xfful << (8*byte); // set the bytes bits in the bytemask
+        }
+    } /* for (byte = 0; byte < 8; ++byte) */
+
+    // now loop through selected legal values for the DLL byte offset...
+
+    for (byte_offset = start_offset; byte_offset <= end_offset; byte_offset += incr_offset) {
+
+        // do the setup on active LMCs
+        for (lmc = 0; lmc < num_lmcs; lmc++) {
+            change_dll_offset_enable(node, lmc, 0);
+
+            // set all byte lanes at once
+            load_dll_offset(node, lmc, dll_offset_mode, byte_offset, 10 /* All bytes at once */);
+            // but then clear the ECC byte lane so it should be neutral for the test...
+            load_dll_offset(node, lmc, dll_offset_mode, 0, 8);
+
+            change_dll_offset_enable(node, lmc, 1);
+
+            // record start cycle CSRs here for utilization measure
+            start_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
+            start_dram_ops[lmc]  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
+        } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+        bdk_watchdog_poke();
+
+        // run the test(s)
+        // only 1 call should be enough, let the bursts, etc, control the load...
+        tot_errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
+
+        for (lmc = 0; lmc < num_lmcs; lmc++) {
+            // record stop cycle CSRs here for utilization measure
+            stop_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
+            stop_dram_ops[lmc]  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
+
+            // accumulate...
+            ops_sum[lmc]  += stop_dram_ops[lmc]  - start_dram_ops[lmc];
+            dclk_sum[lmc] += stop_dram_dclk[lmc] - start_dram_dclk[lmc];
+
+            // per-LMC error bits left behind by the test threads
+            errors = test_dram_byte_lmc_errs[lmc];
+
+            // check errors by byte, but not ECC
+            for (byte = 0; byte < 8; ++byte) {
+                if (!(bytes_todo & (1 << byte))) // is this byte lane to be done
+                    continue; // no
+
+                byte_delay_windows[lmc][byte] <<= 1; // always put in a zero
+                if (errors & (1 << byte)) { // yes, an error in this byte lane
+                    byte_delay_count[lmc][byte] = 0; // stop now always
+                } else { // no error in this byte lane
+                    if (byte_delay_count[lmc][byte] == 0) { // first success, set run start
+                        byte_delay_start[lmc][byte] = byte_offset;
+                    }
+                    byte_delay_count[lmc][byte] += incr_offset; // bump run length
+
+                    if (byte_delay_count[lmc][byte] > byte_delay_best_count[lmc][byte]) {
+                        byte_delay_best_count[lmc][byte] = byte_delay_count[lmc][byte];
+                        byte_delay_best_start[lmc][byte] = byte_delay_start[lmc][byte];
+                    }
+                    byte_delay_windows[lmc][byte] |= 1ULL; // for pass, put in a 1
+                }
+            } /* for (byte = 0; byte < 8; ++byte) */
+
+            // only print when there are errors and verbose...
+            if (errors) {
+                debug_print("DLL %s Offset Test %3d: errors 0x%x\n",
+                            mode_str, byte_offset, errors);
+            }
+        } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+    } /* for (byte_offset = start_offset; byte_offset <= end_offset; ...) */
+
+    // done with testing, load up and/or print out the offsets we found...
+
+    // only when margining...
+    if (!do_tune) {
+        printf("  \n");
+        printf("-------------------------------------\n");
+#if 0
+        uint32_t mts_speed = (libdram_get_freq_from_pll(node, 0) * 2) / 1000000; // FIXME: sample LMC0
+        printf("N%d: Starting %s Timing Margining for %d MT/s.\n", node, mode_str, mts_speed);
+#else
+        printf("N%d: Starting %s Timing Margining.\n", node, mode_str);
+#endif
+        printf("  \n");
+    } /* if (!do_tune) */
+
+    for (lmc = 0; lmc < num_lmcs; lmc++) {
+        // FIXME FIXME
+        // FIXME: this just makes ECC always show 0
+        // (a window spanning the whole sweep range is centered on offset 0)
+        byte_delay_best_start[lmc][8] = start_offset;
+        byte_delay_best_count[lmc][8] = end_offset - start_offset + incr_offset;
+
+        // disable offsets while we load...
+        change_dll_offset_enable(node, lmc, 0);
+
+        // only when margining...
+        if (!do_tune) {
+            // print the heading
+            printf("  \n");
+            printf("N%d.LMC%d: %s Timing Margin     %s : ", node, lmc, mode_str, mode_blk);
+            printf("     ECC/8 ");
+            for (byte = 7; byte >= 0; byte--) {
+                printf("    Byte %d ", byte);
+            }
+            printf("\n");
+        } /* if (!do_tune) */
+
+        // print and load the offset values
+        // only when margining...
+        if (!do_tune) {
+            printf("N%d.LMC%d: DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
+        } else {
+            ddr_print("N%d.LMC%d: SW DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
+        }
+        for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+
+            int count = byte_delay_best_count[lmc][byte];
+            if (count == 0)
+                count = incr_offset; // should make non-tested ECC byte come out 0
+
+            // middle of the best window
+            byte_offset = byte_delay_best_start[lmc][byte] +
+                ((count - incr_offset) / 2); // adj by incr
+
+            if (!do_tune) { // do counting and special flag if margining
+                // flagged only when BOTH the window length and the offset are not low risk
+                int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
+                                       !is_low_risk_offset(speed_bin, byte_offset);
+
+                printf("%10d%c", byte_offset, (will_need_review) ? '<' :' ');
+
+                if (will_need_review)
+                    needs_review_count++;
+            } else { // if just tuning, make the printout less lengthy
+                ddr_print("%5d ", byte_offset);
+            }
+
+            // FIXME? should we be able to override this?
+            if (mode_is_read) // for READ offsets, always store what we found
+                load_dll_offset(node, lmc, dll_offset_mode, byte_offset, byte);
+            else // for WRITE offsets, always store 0
+                load_dll_offset(node, lmc, dll_offset_mode, 0, byte);
+
+        }
+        if (!do_tune) {
+            printf("\n");
+        } else {
+            ddr_print("\n");
+        }
+
+        // re-enable the offsets now that we are done loading
+        change_dll_offset_enable(node, lmc, 1);
+
+        // only when margining...
+        if (!do_tune) {
+            // print the window sizes
+            printf("N%d.LMC%d: DLL %s Window Length %s : ", node, lmc, mode_str, mode_blk);
+            for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+                int count = byte_delay_best_count[lmc][byte];
+                if (count == 0)
+                    count = incr_offset; // should make non-tested ECC byte come out 0
+
+                // do this again since the "needs review" test is an AND...
+                byte_offset = byte_delay_best_start[lmc][byte] +
+                    ((count - incr_offset) / 2); // adj by incr
+
+                int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
+                                       !is_low_risk_offset(speed_bin, byte_offset);
+
+                printf("%10d%c", count - incr_offset, (will_need_review) ? '<' :' ');
+            }
+            printf("\n");
+
+            // print the window extents
+            printf("N%d.LMC%d: DLL %s Window Bounds %s : ", node, lmc, mode_str, mode_blk);
+            for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+                int start = byte_delay_best_start[lmc][byte];
+                int count = byte_delay_best_count[lmc][byte];
+                if (count == 0)
+                    count = incr_offset; // should make non-tested ECC byte come out 0
+                printf(" %3d to%3d ", start,
+                       start + count - incr_offset);
+            }
+            printf("\n");
+#if 0
+            // FIXME: should have a way to force these out...
+            // print the windows bit arrays
+            printf("N%d.LMC%d: DLL %s Window Bitmap%s : ", node, lmc, mode_str, mode_blk);
+            for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+                printf("%010lx ", byte_delay_windows[lmc][byte]);
+            }
+            printf("\n");
+#endif
+        } /* if (!do_tune) */
+    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+    // only when margining...
+    if (!do_tune) {
+        // print the Summary line(s) here
+        printf("  \n");
+        printf("N%d: %s Timing Margining Summary : %s ", node, mode_str,
+               (needs_review_count > 0) ? "Needs Review" : "Low Risk");
+        if (needs_review_count > 0)
+            printf("(%d)", needs_review_count);
+        printf("\n");
+
+        // FIXME??? want to print here: "N0: %s Offsets have been applied already"
+
+        printf("-------------------------------------\n");
+        printf("  \n");
+    } /* if (!do_tune) */
+
+    // FIXME: we probably want this only when doing verbose...
+    // finally, print the utilizations all together
+    for (lmc = 0; lmc < num_lmcs; lmc++) {
+        // report 0% rather than dividing by zero if no cycles were counted
+        uint64_t percent_x10 = dclk_sum[lmc] ? (ops_sum[lmc] * 1000 / dclk_sum[lmc]) : 0;
+        ddr_print2("N%d.LMC%d: ops %lu, cycles %lu, used %lu.%lu%%\n",
+                   node, lmc, ops_sum[lmc], dclk_sum[lmc], percent_x10 / 10, percent_x10 % 10);
+    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+    // FIXME: only when verbose, or only when there are errors?
+    // run the test one last time
+    // print whether there are errors or not, but only when verbose...
+    bdk_watchdog_poke();
+    debug_print("N%d: %s: Start running test one last time\n", node, __FUNCTION__);
+    tot_errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
+    debug_print("N%d: %s: Finished running test one last time\n", node, __FUNCTION__);
+    if (tot_errors)
+        ddr_print2("%s Timing Final Test: errors 0x%x\n", mode_str, tot_errors);
+
+    return (do_tune) ? tot_errors : !!(needs_review_count > 0);
+}
+
+#define USE_L2_WAYS_LIMIT 0 // non-zero to enable L2 ways limiting
+
+/*
+ * Automatically adjust the DLL offset for the data bytes
+ *
+ * Brings up any cores of 'node' that are not yet running, applies the
+ * ddr_tune_use_* environment overrides, disables ECC on every LMC, runs
+ * auto_set_dll_offset() "loops" times, then restores ECC and puts the cores
+ * it started back into reset.
+ *
+ * node            - node to tune
+ * dll_offset_mode - passed to auto_set_dll_offset(); the value 2 (read)
+ *                   additionally enables the per-byte "ddr%d_tune_byte%d"
+ *                   overrides applied after tuning
+ * do_tune         - passed to auto_set_dll_offset()
+ *
+ * Returns the result of the last auto_set_dll_offset() call, or 0 if the
+ * loop count was overridden to a value below 1.
+ *
+ * A "ddr_tune_use_gran" value that is not a positive number is rejected and
+ * the current granularity is kept: the value is later used as a modulo
+ * divisor and loop increment by the offset sweep.
+ */
+int perform_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int do_tune)
+{
+    int ddr_interface_64b;
+    int save_ecc_ena[4];
+    bdk_lmcx_config_t lmc_config;
+    int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
+    const char *s;
+#if USE_L2_WAYS_LIMIT
+    int ways, ways_print = 0;
+#endif
+#if 0
+    int dram_tune_use_rodt = -1, save_rodt[4];
+    bdk_lmcx_comp_ctl2_t comp_ctl2;
+#endif
+    int loops = 1, loop;
+    uint64_t orig_coremask;
+    int errs = 0;
+
+    // enable any non-running cores on this node
+    orig_coremask = bdk_get_running_coremask(node);
+    ddr_print4("N%d: %s: Starting cores (mask was 0x%lx)\n",
+               node, __FUNCTION__, orig_coremask);
+    bdk_init_cores(node, ~0ULL & ~orig_coremask);
+    dram_tune_max_cores = bdk_get_num_running_cores(node);
+
+    // but use only a certain number of cores, at most what is available
+    if ((s = getenv("ddr_tune_use_cores")) != NULL) {
+        dram_tune_use_cores = strtoul(s, NULL, 0);
+        if (dram_tune_use_cores <= 0) // allow 0 or negative to mean all
+            dram_tune_use_cores = dram_tune_max_cores;
+    }
+    if (dram_tune_use_cores > dram_tune_max_cores)
+        dram_tune_use_cores = dram_tune_max_cores;
+
+    // see if we want to do the tuning more than once per LMC...
+    if ((s = getenv("ddr_tune_use_loops"))) {
+        loops = strtoul(s, NULL, 0);
+    }
+
+    // see if we want to change the granularity of the byte_offset sampling
+    if ((s = getenv("ddr_tune_use_gran"))) {
+        int gran = strtoul(s, NULL, 0);
+        // only a positive step is usable by the sweep (zero would divide by
+        // zero, a negative step would never terminate)
+        if (gran > 0)
+            dram_tune_use_gran = gran;
+        else
+            bdk_error("N%d: ignoring invalid ddr_tune_use_gran \"%s\"\n", node, s);
+    }
+
+    // allow override of the test repeats (bursts) per thread create
+    if ((s = getenv("ddr_tune_use_bursts")) != NULL) {
+        dram_tune_use_bursts = strtoul(s, NULL, 10);
+    }
+
+#if 0
+    // allow override of Read ODT setting just during the tuning run(s)
+    if ((s = getenv("ddr_tune_use_rodt")) != NULL) {
+        int temp = strtoul(s, NULL, 10);
+        // validity check
+        if (temp >= 0 && temp <= 7)
+            dram_tune_use_rodt = temp;
+    }
+#endif
+
+#if 0
+    // allow override of the test pattern
+    // FIXME: a bit simplistic...
+    if ((s = getenv("ddr_tune_use_pattern")) != NULL) {
+        int patno = strtoul(s, NULL, 10);
+        if (patno == 2)
+            dram_tune_test_pattern = test_pattern_2;
+        else if (patno == 3)
+            dram_tune_test_pattern = test_pattern_3;
+        else // all other values use default
+            dram_tune_test_pattern = test_pattern_1;
+    }
+#endif
+
+    // allow override of the test mem_xor algorithm
+    if ((s = getenv("ddr_tune_use_xor2")) != NULL) {
+        dram_tune_use_xor2 = !!strtoul(s, NULL, 10);
+    }
+
+    // print current working values
+    ddr_print2("N%d: Tuning will use %d cores of max %d cores, and use %d repeats.\n",
+               node, dram_tune_use_cores, dram_tune_max_cores,
+               dram_tune_use_bursts);
+
+#if USE_L2_WAYS_LIMIT
+    // see if L2 ways are limited
+    if ((s = lookup_env_parameter("limit_l2_ways")) != NULL) {
+        ways = strtoul(s, NULL, 10);
+        ways_print = 1;
+    } else {
+        ways = bdk_l2c_get_num_assoc(node);
+    }
+#endif
+
+#if 0
+    // if RODT is to be overridden during tuning, note change
+    if (dram_tune_use_rodt >= 0) {
+        ddr_print("N%d: using RODT %d for tuning.\n",
+                  node, dram_tune_use_rodt);
+    }
+#endif
+
+    // FIXME? get flag from LMC0 only
+    lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));
+    ddr_interface_64b = !lmc_config.s.mode32b;
+
+    // do setup for each active LMC
+    debug_print("N%d: %s: starting LMCs setup.\n", node, __FUNCTION__);
+    for (lmc = 0; lmc < num_lmcs; lmc++) {
+
+#if 0
+        // if RODT change, save old and set new here...
+        if (dram_tune_use_rodt >= 0) {
+            comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+            save_rodt[lmc] = comp_ctl2.s.rodt_ctl;
+            comp_ctl2.s.rodt_ctl = dram_tune_use_rodt;
+            DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
+            BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+        }
+#endif
+        /* Disable ECC for DRAM tests, remembering the old setting */
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+        save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
+        lmc_config.s.ecc_ena = 0;
+        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); // read back after the write
+
+    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+#if USE_L2_WAYS_LIMIT
+    /* Disable l2 sets for DRAM testing */
+    limit_l2_ways(node, 0, ways_print);
+#endif
+
+    // testing is done on all LMCs simultaneously
+    // FIXME: for now, loop here to show what happens multiple times
+    for (loop = 0; loop < loops; loop++) {
+        /* Perform DLL offset tuning */
+        errs = auto_set_dll_offset(node, dll_offset_mode, num_lmcs, ddr_interface_64b, do_tune);
+    }
+
+#if USE_L2_WAYS_LIMIT
+    /* Restore the l2 set configuration */
+    limit_l2_ways(node, ways, ways_print);
+#endif
+
+    // perform cleanup on all active LMCs
+    debug_print("N%d: %s: starting LMCs cleanup.\n", node, __FUNCTION__);
+    for (lmc = 0; lmc < num_lmcs; lmc++) {
+
+        /* Restore ECC for DRAM tests */
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+        lmc_config.s.ecc_ena = save_ecc_ena[lmc];
+        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc)); // read back after the write
+#if 0
+        // if RODT change, restore old here...
+        if (dram_tune_use_rodt >= 0) {
+            comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+            comp_ctl2.s.rodt_ctl = save_rodt[lmc];
+            DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
+            BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+        }
+#endif
+        // finally, see if there are any read offset overrides after tuning
+        // FIXME: provide a way to do write offsets also??
+        if (dll_offset_mode == 2) {
+            for (int by = 0; by < 9; by++) {
+                if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
+                    int dllro = strtoul(s, NULL, 10);
+                    change_dll_offset_enable(node, lmc, 0);
+                    load_dll_offset(node, lmc, /* read */2, dllro, by);
+                    change_dll_offset_enable(node, lmc, 1);
+                }
+            }
+        }
+    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+    // finish up...
+
+#if 0
+    // if RODT was overridden during tuning, note restore
+    if (dram_tune_use_rodt >= 0) {
+        ddr_print("N%d: restoring RODT %d after tuning.\n",
+                  node, save_rodt[0]); // FIXME? use LMC0
+    }
+#endif
+
+    // put any cores on this node, that were not running at the start, back into reset
+    uint64_t reset_coremask = bdk_get_running_coremask(node) & ~orig_coremask;
+    if (reset_coremask) {
+        ddr_print4("N%d: %s: Stopping cores 0x%lx\n", node, __FUNCTION__,
+                   reset_coremask);
+        bdk_reset_cores(node, reset_coremask);
+    } else {
+        ddr_print4("N%d: %s: leaving cores set to 0x%lx\n", node, __FUNCTION__,
+                   orig_coremask);
+    }
+
+    return errs;
+
+} /* perform_dll_offset_tuning */
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+/////    HW-assist byte DLL offset tuning   //////
+
+#if 1
+// setup defaults for byte test pattern array
+// Each pattern is three 64-bit words; setup_hw_pattern() writes word 0, 1
+// and 2 to the LMC GENERAL_PURPOSE0, GENERAL_PURPOSE1 and GENERAL_PURPOSE2
+// registers respectively.
+// take these first two from the HRM section 6.9.13
+static const uint64_t byte_pattern_0[] = {
+    0xFFAAFFFFFF55FFFFULL, // GP0
+    0x55555555AAAAAAAAULL, // GP1
+    0xAA55AAAAULL,         // GP2
+};
+static const uint64_t byte_pattern_1[] = {
+    0xFBF7EFDFBF7FFEFDULL, // GP0
+    0x0F1E3C78F0E1C387ULL, // GP1
+    0xF0E1BF7FULL,         // GP2
+};
+// this is from Andrew via LFSR with PRBS=0xFFFFAAAA
+static const uint64_t byte_pattern_2[] = {
+    0xEE55AADDEE55AADDULL, // GP0
+    0x55AADDEE55AADDEEULL, // GP1
+    0x55EEULL,             // GP2
+};
+// this is from Mike via LFSR with PRBS=0x4A519909
+static const uint64_t byte_pattern_3[] = {
+    0x0088CCEE0088CCEEULL, // GP0
+    0xBB552211BB552211ULL, // GP1
+    0xBB00ULL,             // GP2
+};
+
+// table of all patterns, walked by index from 0 to NUM_BYTE_PATTERNS-1
+static const uint64_t *byte_patterns[] = {
+    byte_pattern_0, byte_pattern_1, byte_pattern_2, byte_pattern_3 // FIXME: use all we have
+};
+// number of entries in byte_patterns[]
+#define NUM_BYTE_PATTERNS ((int)(sizeof(byte_patterns)/sizeof(uint64_t *)))
+
+#define DEFAULT_BYTE_BURSTS 32 // FIXME: this is what the longest test usually has
+int dram_tune_byte_bursts = DEFAULT_BYTE_BURSTS;
+#endif
+
+/*
+ * Load a three-word data pattern into the GENERAL_PURPOSE0..2 registers of
+ * one LMC.
+ *
+ * node      - node the LMC belongs to
+ * lmc       - LMC index
+ * pattern_p - pattern to load; elements [0], [1] and [2] are read
+ */
+static void
+setup_hw_pattern(bdk_node_t node, int lmc, const uint64_t *pattern_p)
+{
+    /*
+      3) Setup GENERAL_PURPOSE[0-2] registers with the data pattern of choice.
+      a. GENERAL_PURPOSE0[DATA<63:0>] – sets the initial lower (rising edge) 64 bits of data.
+      b. GENERAL_PURPOSE1[DATA<63:0>] – sets the initial upper (falling edge) 64 bits of data.
+      c. GENERAL_PURPOSE2[DATA<15:0>] – sets the initial lower (rising edge <7:0>) and upper
+      (falling edge <15:8>) ECC data.
+    */
+    const uint64_t gp0_data = pattern_p[0];
+    const uint64_t gp1_data = pattern_p[1];
+    const uint64_t gp2_data = pattern_p[2];
+
+    DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE0(lmc), gp0_data);
+    DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE1(lmc), gp1_data);
+    DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE2(lmc), gp2_data);
+}
+
+#define DEFAULT_PRBS 0xFFFFAAAAUL /* FIXME: maybe try 0x4A519909UL */
+
+/*
+ * Program the CHAR_CTL register of one LMC for PRBS generation.
+ *
+ * node - node the LMC belongs to
+ * lmc  - LMC index
+ * data - currently unused
+ *
+ * The PRBS value comes from the "ddr_lfsr_prbs" environment variable when
+ * it is set, otherwise DEFAULT_PRBS is used.
+ */
+static void
+setup_lfsr_pattern(bdk_node_t node, int lmc, uint64_t data)
+{
+    const char *env_prbs = getenv("ddr_lfsr_prbs");
+    // FIXME: from data arg?
+    uint32_t prbs_value = env_prbs ? strtoul(env_prbs, NULL, 0) : DEFAULT_PRBS;
+
+    /*
+      2) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 1
+         here data comes from the LFSR generating a PRBS pattern
+         CHAR_CTL.EN = 0
+         CHAR_CTL.SEL = 0; // for PRBS
+         CHAR_CTL.DR = 1;
+         CHAR_CTL.PRBS = setup for whatever type of PRBS to send
+         CHAR_CTL.SKEW_ON = 1;
+    */
+    BDK_CSR_INIT(lfsr_ctl, node, BDK_LMCX_CHAR_CTL(lmc));
+    lfsr_ctl.s.en      = 0;
+    lfsr_ctl.s.sel     = 0;
+    lfsr_ctl.s.dr      = 1;
+    lfsr_ctl.s.prbs    = prbs_value;
+    lfsr_ctl.s.skew_on = 1;
+    DRAM_CSR_WRITE(node, BDK_LMCX_CHAR_CTL(lmc), lfsr_ctl.u);
+}
+
+/*
+ * Decide which HW-assist test mode to actually use for a requested mode.
+ *
+ * node, lmc - not used by the selection itself
+ * mode      - requested mode (DBTRAIN_TEST, DBTRAIN_DBI or DBTRAIN_LFSR)
+ *
+ * DBTRAIN_TEST is upgraded to DBTRAIN_LFSR on anything that is not a CN88XX,
+ * unless the "ddr_allow_lfsr" environment variable evaluates to 0.
+ * DBTRAIN_LFSR is downgraded to DBTRAIN_TEST on CN88XX, with an error print.
+ * DBTRAIN_DBI is returned unchanged. Any other value is reported as unknown
+ * and returned unchanged.
+ */
+int
+choose_best_hw_patterns(bdk_node_t node, int lmc, int mode)
+{
+    int chosen = mode;
+
+    if (mode == DBTRAIN_TEST) {
+        // always choose LFSR if chip supports it
+        if (!CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
+            const char *env_allow = getenv("ddr_allow_lfsr"); // override?
+            int allow_lfsr = env_allow ? !!strtoul(env_allow, NULL, 0) : 1;
+
+            if (allow_lfsr)
+                chosen = DBTRAIN_LFSR;
+        }
+    } else if (mode == DBTRAIN_LFSR) {
+        // forced already, but not legal on CN88XX
+        if (CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
+            ddr_print("ERROR: illegal HW assist mode %d\n", mode);
+            chosen = DBTRAIN_TEST;
+        }
+    } else if (mode != DBTRAIN_DBI) {
+        // DBTRAIN_DBI passes through untouched (possibly can allow LFSR use?)
+        ddr_print("ERROR: unknown HW assist mode %d\n", mode);
+    }
+
+    if (chosen != mode)
+        VB_PRT(VBL_DEV2, "choose_best_hw_patterns: changing mode %d to %d\n", mode, chosen);
+
+    return chosen;
+}
+
+/*
+ * Run the HW-assisted byte test at one address using the best available
+ * pattern source for the requested mode.
+ *
+ * node, lmc - LMC to test
+ * phys_addr - address handed to test_dram_byte_hw()
+ * mode      - requested mode; first filtered through choose_best_hw_patterns()
+ * xor_data  - handed to test_dram_byte_hw() unchanged
+ *
+ * In LFSR mode a single test is run and its result returned. Otherwise the
+ * test is run once per entry of byte_patterns[] and the results are OR-ed
+ * together.
+ */
+int
+run_best_hw_patterns(bdk_node_t node, int lmc, uint64_t phys_addr,
+                     int mode, uint64_t *xor_data)
+{
+    int combined = 0;
+    int idx;
+
+    // FIXME? always choose LFSR if chip supports it???
+    mode = choose_best_hw_patterns(node, lmc, mode);
+
+    if (mode == DBTRAIN_LFSR) {
+        int lfsr_errors;
+
+        setup_lfsr_pattern(node, lmc, 0);
+        lfsr_errors = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);
+        VB_PRT(VBL_DEV2, "%s: LFSR at A:0x%012lx errors 0x%x\n",
+               __FUNCTION__, phys_addr, lfsr_errors);
+        return lfsr_errors;
+    }
+
+    for (idx = 0; idx < NUM_BYTE_PATTERNS; idx++) {
+        int pass_errors;
+
+        setup_hw_pattern(node, lmc, byte_patterns[idx]);
+        pass_errors = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);
+
+        VB_PRT(VBL_DEV2, "%s: PATTERN %d at A:0x%012lx errors 0x%x\n",
+               __FUNCTION__, idx, phys_addr, pass_errors);
+
+        combined |= pass_errors;
+    }
+
+    return combined;
+}
+
+/*
+ * Sweep the DLL byte offset on one LMC with the hardware-assisted DRAM test
+ * and load the offset found for each selected bytelane.
+ *
+ * For every pattern pass, offsets from -63 up to (not including) 64 are tried
+ * in steps of BYTE_OFFSET_INCR.  For each active rank and each bytelane the
+ * longest run of consecutive offsets without an error bit is remembered.
+ * The windows of all active ranks are then intersected (largest start,
+ * smallest end) and the midpoint of the intersection becomes that pattern's
+ * offset.  The per-pattern offsets are summed, divided by NUM_BYTE_PATTERNS
+ * with divide_nint(), and the result is loaded with load_dll_offset().
+ *
+ * node            - node being tuned
+ * dll_offset_mode - handed to load_dll_offset(); 2 is logged as "Read",
+ *                   any other value as "Write"
+ * lmc             - memory controller to tune
+ * bytelane        - 0..8 to tune one bytelane, 0x0A to tune all nine;
+ *                   any other value is rejected with a message
+ */
+static void
+hw_assist_test_dll_offset(bdk_node_t node, int dll_offset_mode,
+                          int lmc, int bytelane)
+{
+    int byte_offset, new_best_offset[9];
+    int rank_delay_start[4][9];
+    int rank_delay_count[4][9];
+    int rank_delay_best_start[4][9];
+    int rank_delay_best_count[4][9];
+    int errors[4], off_errors, tot_errors;
+    int num_lmcs = __bdk_dram_get_num_lmc(node);
+    int rank_mask, rankx, active_ranks;
+    int pattern;
+    const uint64_t *pattern_p;
+    int byte;
+    const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write";
+    int pat_best_offset[9];
+    uint64_t phys_addr;
+    int pat_beg, pat_end;
+    int rank_beg, rank_end;
+    int byte_lo, byte_hi;
+    uint64_t hw_rank_offset;
+    // FIXME? always choose LFSR if chip supports it???
+    int mode = choose_best_hw_patterns(node, lmc, DBTRAIN_TEST);
+
+    if (bytelane == 0x0A) { // all bytelanes
+        byte_lo = 0;
+        byte_hi = 8;
+    } else if (bytelane >= 0 && bytelane <= 8) { // just 1
+        byte_lo = byte_hi = bytelane;
+    } else {
+        // the per-bytelane arrays below only have 9 entries, so any other
+        // selector would index out of bounds
+        ddr_print("N%d.LMC%d: HW DLL %s Offset: invalid bytelane %d, skipping\n",
+                  node, lmc, mode_str, bytelane);
+        return;
+    }
+
+    BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(lmc));
+    rank_mask = lmcx_config.s.init_status;
+    // this should be correct for 1 or 2 ranks, 1 or 2 DIMMs
+    hw_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
+
+    debug_print("N%d: %s: starting LMC%d with rank offset 0x%lx\n",
+                node, __FUNCTION__, lmc, hw_rank_offset);
+
+    // start of pattern loop
+    // we do the set of tests for each pattern supplied...
+    // NOTE: when mode is not DBTRAIN_TEST the same setup_lfsr_pattern() call
+    // is made on every pass, so the sweep is simply repeated.
+
+    memset(new_best_offset, 0, sizeof(new_best_offset));
+    for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {
+
+        memset(pat_best_offset, 0, sizeof(pat_best_offset));
+
+        if (mode == DBTRAIN_TEST) {
+            pattern_p = byte_patterns[pattern];
+            setup_hw_pattern(node, lmc, pattern_p);
+        } else {
+            setup_lfsr_pattern(node, lmc, 0);
+        }
+
+        // now loop through all legal values for the DLL byte offset...
+
+#define BYTE_OFFSET_INCR 3 // FIXME: make this tunable?
+
+        tot_errors = 0;
+
+        memset(rank_delay_count, 0, sizeof(rank_delay_count));
+        memset(rank_delay_start, 0, sizeof(rank_delay_start));
+        memset(rank_delay_best_count, 0, sizeof(rank_delay_best_count));
+        memset(rank_delay_best_start, 0, sizeof(rank_delay_best_start));
+
+        for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) {
+
+            // do the setup on the active LMC
+            // set the bytelanes DLL offsets (offsets are disabled while loading)
+            change_dll_offset_enable(node, lmc, 0);
+            load_dll_offset(node, lmc, dll_offset_mode, byte_offset, bytelane); // FIXME? bytelane?
+            change_dll_offset_enable(node, lmc, 1);
+
+            bdk_watchdog_poke();
+
+            // run the test on each rank
+            // only 1 call per rank should be enough, let the bursts, loops, etc, control the load...
+
+            off_errors = 0; // errors for this byte_offset, all ranks
+
+            active_ranks = 0;
+
+            for (rankx = 0; rankx < 4; rankx++) {
+                if (!(rank_mask & (1 << rankx)))
+                    continue;
+
+                // active ranks are laid out back to back, hw_rank_offset apart
+                phys_addr = hw_rank_offset * active_ranks;
+                // FIXME: now done by test_dram_byte_hw()
+                //phys_addr |= (lmc << 7);
+                //phys_addr = bdk_numa_get_address(node, phys_addr); // map to node
+
+                active_ranks++;
+
+                // NOTE: return is now a bitmask of the erroring bytelanes..
+                errors[rankx] = test_dram_byte_hw(node, lmc, phys_addr, mode, NULL);
+
+                for (byte = byte_lo; byte <= byte_hi; byte++) { // do bytelane(s)
+
+                    // check errors
+                    if (errors[rankx] & (1 << byte)) { // yes, an error in the byte lane in this rank
+                        off_errors |= (1 << byte);
+
+                        ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: Address 0x%012lx errors 0x%x\n",
+                                   node, lmc, rankx, byte, mode_str,
+                                   byte_offset, phys_addr, errors[rankx]);
+
+                        if (rank_delay_count[rankx][byte] > 0) { // had started run
+                            ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: stopping a run here\n",
+                                       node, lmc, rankx, byte, mode_str, byte_offset);
+                            rank_delay_count[rankx][byte] = 0;   // stop now
+                        }
+                        // FIXME: else had not started run - nothing else to do?
+                    } else { // no error in the byte lane
+                        if (rank_delay_count[rankx][byte] == 0) { // first success, set run start
+                            ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: starting a run here\n",
+                                       node, lmc, rankx, byte, mode_str, byte_offset);
+                            rank_delay_start[rankx][byte] = byte_offset;
+                        }
+                        rank_delay_count[rankx][byte] += BYTE_OFFSET_INCR; // bump run length
+
+                        // is this now the biggest window?
+                        if (rank_delay_count[rankx][byte] > rank_delay_best_count[rankx][byte]) {
+                            rank_delay_best_count[rankx][byte] = rank_delay_count[rankx][byte];
+                            rank_delay_best_start[rankx][byte] = rank_delay_start[rankx][byte];
+                            debug_print("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: updating best to %d/%d\n",
+                                        node, lmc, rankx, byte, mode_str, byte_offset,
+                                        rank_delay_best_start[rankx][byte], rank_delay_best_count[rankx][byte]);
+                        }
+                    }
+                } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
+            } /* for (rankx = 0; rankx < 4; rankx++) */
+
+            tot_errors |= off_errors;
+
+        } /* for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) */
+
+        // now choose the best byte_offsets for this pattern according to the best windows of the tested ranks
+        // calculate offset as the midpoint of the intersection of the rank windows
+        for (byte = byte_lo; byte <= byte_hi; byte++) {
+
+            // sentinels wider than any real window, so the first rank always replaces them
+            pat_beg = -999;
+            pat_end = 999;
+
+            for (rankx = 0; rankx < 4; rankx++) {
+                if (!(rank_mask & (1 << rankx)))
+                    continue;
+
+                rank_beg = rank_delay_best_start[rankx][byte];
+                pat_beg = max(pat_beg, rank_beg);
+                // count was bumped by the increment per passing offset, so the
+                // last passing offset is one increment short of start + count
+                rank_end = rank_beg + rank_delay_best_count[rankx][byte] - BYTE_OFFSET_INCR;
+                pat_end = min(pat_end, rank_end);
+
+                ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test:  Rank Window %3d:%3d\n",
+                           node, lmc, rankx, byte, mode_str, rank_beg, rank_end);
+
+            } /* for (rankx = 0; rankx < 4; rankx++) */
+
+            pat_best_offset[byte] = (pat_end + pat_beg) / 2;
+            ddr_print4("N%d.LMC%d: Bytelane %d DLL %s Offset Test:  Pattern %d Average %3d\n",
+                       node, lmc, byte, mode_str, pattern, pat_best_offset[byte]);
+
+#if 0
+            // FIXME: next print the window counts
+            sprintf(sbuffer, "N%d.LMC%d Pattern %d: DLL %s Offset Count ",
+                    node, lmc, pattern, mode_str);
+            printf("%-45s : ", sbuffer);
+            printf(" %3d", byte_delay_best_count);
+            printf("\n");
+#endif
+
+            new_best_offset[byte] += pat_best_offset[byte]; // sum the pattern averages
+        } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
+    } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */
+    // end of pattern loop
+
+    ddr_print("N%d.LMC%d: HW DLL %s Offset Amount   : ",
+              node, lmc, mode_str);
+
+    for (byte = byte_hi; byte >= byte_lo; --byte) { // print in descending byte index order
+        new_best_offset[byte] = divide_nint(new_best_offset[byte], NUM_BYTE_PATTERNS); // create the new average NINT
+
+        // print the best offsets from all patterns
+
+        if (bytelane == 0x0A) // print just the offset of all the bytes
+            ddr_print("%5d ", new_best_offset[byte]);
+        else
+            ddr_print("(byte %d) %5d ", byte, new_best_offset[byte]);
+
+
+#if 1
+        // done with testing, load up the best offsets we found...
+        change_dll_offset_enable(node, lmc, 0); // disable offsets while we load...
+        load_dll_offset(node, lmc, dll_offset_mode, new_best_offset[byte], byte);
+        change_dll_offset_enable(node, lmc, 1); // re-enable the offsets now that we are done loading
+#endif
+    } /* for (byte = byte_hi; byte >= byte_lo; --byte) */
+
+    ddr_print("\n");
+
+#if 0
+    // run the test one last time
+    // print whether there are errors or not, but only when verbose...
+    tot_errors = run_test_dram_byte_threads(node, num_lmcs, bytemask);
+    printf("N%d.LMC%d: Bytelane %d DLL %s Offset Final Test: errors 0x%x\n",
+           node, lmc, bytelane, mode_str, tot_errors);
+#endif
+}
+
+/*
+ * Automatically adjust the DLL offset for the selected bytelane using hardware-assist
+ *
+ * For each LMC of the node in turn: ECC is forced on in LMCX_CONFIG, the
+ * hardware-assisted read-offset tuning is run, the previous ECC setting is
+ * restored, and finally any "ddr%d_tune_byte%d" environment overrides are
+ * loaded as read offsets.
+ *
+ * Environment variables consulted:
+ *   ddr_tune_ecc_loops    - how many times to repeat the tuning per LMC (default 1)
+ *   ddr_tune_byte_bursts  - replaces dram_tune_byte_bursts
+ *   ddr<lmc>_tune_byte<n> - read DLL offset forced for bytelane n after tuning
+ *
+ * node            - node to tune
+ * dll_offset_mode - not consulted here; tuning and overrides always use 2 (read)
+ * bytelane        - bytelane selector handed to hw_assist_test_dll_offset()
+ *
+ * Always returns 0.
+ */
+int perform_HW_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int bytelane)
+{
+    bdk_lmcx_config_t lmc_config;
+    int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
+    const char *s;
+    int loops = 1, loop;
+
+    // see if we want to do the tuning more than once per LMC...
+    if ((s = getenv("ddr_tune_ecc_loops"))) {
+        loops = strtoul(s, NULL, 0);
+    }
+
+    // allow override of the test repeats (bursts)
+    if ((s = getenv("ddr_tune_byte_bursts")) != NULL) {
+        dram_tune_byte_bursts = strtoul(s, NULL, 10);
+    }
+
+    // print current working values
+    ddr_print2("N%d: H/W Tuning for bytelane %d will use %d loops, %d bursts, and %d patterns.\n",
+               node, bytelane, loops, dram_tune_byte_bursts,
+               NUM_BYTE_PATTERNS);
+
+    // FIXME? get flag from LMC0 only
+    // NOTE(review): this value is overwritten by the per-LMC read below before
+    // it is ever examined; kept so the register access sequence is unchanged.
+    lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));
+
+    // do once for each active LMC
+
+    for (lmc = 0; lmc < num_lmcs; lmc++) {
+
+        ddr_print4("N%d: H/W Tuning: starting LMC%d bytelane %d tune.\n", node, lmc, bytelane);
+
+        /* Enable ECC for the HW tests */
+        // NOTE: we do enable ECC, but the HW tests used will not generate "visible" errors
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+        // the saved value is only needed within this iteration, so a scalar
+        // avoids any assumption about how many LMCs there can be
+        int saved_ecc_ena = lmc_config.s.ecc_ena;
+        lmc_config.s.ecc_ena = 1;
+        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+
+        // testing is done on a single LMC at a time
+        // FIXME: for now, loop here to show what happens multiple times
+        for (loop = 0; loop < loops; loop++) {
+            /* Perform DLL offset tuning */
+            //auto_set_dll_offset(node,  1 /* 1=write */, lmc, bytelane);
+            hw_assist_test_dll_offset(node,  2 /* 2=read */, lmc, bytelane);
+        }
+
+        // perform cleanup on active LMC
+        ddr_print4("N%d: H/W Tuning: finishing LMC%d bytelane %d tune.\n", node, lmc, bytelane);
+
+        /* Restore ECC for DRAM tests */
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+        lmc_config.s.ecc_ena = saved_ecc_ena;
+        DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+        lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+
+        // finally, see if there are any read offset overrides after tuning
+        for (int by = 0; by < 9; by++) {
+            if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
+                // offsets are signed, so parse with the signed conversion
+                int dllro = (int)strtol(s, NULL, 10);
+                change_dll_offset_enable(node, lmc, 0);
+                load_dll_offset(node, lmc, 2 /* 2=read */, dllro, by);
+                change_dll_offset_enable(node, lmc, 1);
+            }
+        }
+
+    } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+    // finish up...
+
+    return 0;
+
+} /* perform_HW_dll_offset_tuning */
author	David Hendricks <dhendricks@fb.com>	2018-03-09 13:58:27 -0800
committer	Patrick Georgi <pgeorgi@google.com>	2018-04-06 06:48:11 +0000
commit	2004b93aed993aa02bbc588b8d82c22418ac52ec (patch)
tree	cdd5e95a154e2e0139474288262835a7f5847665 /src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c
parent	71cbd71eb5c0e8e13b25b5d5dd2f495e7d2967eb (diff)
download	coreboot-2004b93aed993aa02bbc588b8d82c22418ac52ec.tar.xz