
/*--------------------------------------------------------------------*/
/*--- A skin for extracting dynamic data traces.         rx_main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Redux, a Valgrind skin for tracing program
   execution.

   Copyright (C) 2003 Nicholas Nethercote
      njn25@cam.ac.uk

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "rx_include.h"
#include "redux.h"

//#include "vg_profile.c"

/* Record describing one block: a link to another Rx_Chunk, the block's
   address and the size that was asked for. */
typedef
   struct _Rx_Chunk {
      struct _Rx_Chunk* next;       /* link to another chunk            */
      Addr          data;           /* ptr to actual block              */
      UInt          size;           /* size requested                   */
   }
   Rx_Chunk;


/* Forward declarations.  set_mem_shadow4() is called by the memory-event
   handlers further down (eg. rx_new_mem()) before its definition appears.
   Both take their arguments in registers (regparm). */
static __attribute__((regparm(2)))
void set_mem_shadow4 ( Addr a, ShW* shadow4 );

static __attribute__((regparm(1)))
ShW* get_mem_shadow4 ( Addr a );


/*--------------------------------------------------------------------*/
/* Table of per-syscall information.  Sparsely filled array;
   index == syscallno, with room for MAX_NUM_SYSCALLS entries. */
SyscallInfo RX_(syscall_info) [MAX_NUM_SYSCALLS];

/*--------------------------------------------------------------------*/
/*--- Obscuring functions                                          ---*/
/*--------------------------------------------------------------------*/

// Flags consulted by build_Cond(): it records nothing unless entered_main
// is set and exited_main is still clear.  Both start out False.
static Bool entered_main = False;
static Bool exited_main  = False;

// Round 'a' down to a multiple of 4 by clearing its two lowest bits.
static __inline__ UInt align4(Addr a)
{
   const UInt word_mask = 0xfffffffc;
   UInt aligned = a & word_mask;
   return aligned;
}

/*--------------------------------------------------------------------*/
/*--- Allocator                                                    ---*/
/*--------------------------------------------------------------------*/

#define SUPERBLOCK_SIZE  (1 << 20)         // 1 MB

// Bump allocator.  Very simplistic currently:
// - no memory deallocated, so don't need to track previously mmap'd blocks.
// - assuming small blocks, so wasted space at end of superblock negligible
// - a single request must fit inside one superblock (asserted below)
//
// n_bytes is rounded up to a multiple of 4, so the heap pointer always
// advances in multiples of 4.  When the current superblock cannot hold the
// request a new one is obtained from VG_(get_memory_from_mmap)() and zeroed.
//
// Returns a pointer to the start of the new block.
//
// Nb: this must return zeroed blocks
static void* rx_malloc(UInt n_bytes)
{
   static Addr hp     = 0;    // current heap pointer
   static Addr hp_lim = 0;    // maximum usable byte in current block
   Addr block;

   // make aligned if not already
   n_bytes = (n_bytes + 3) & ~(UInt)3;

   // A request bigger than a superblock can never be satisfied;  without
   // this check the block handed back would run off the end of the mapping.
   sk_assert(n_bytes <= SUPERBLOCK_SIZE);

   if (hp + n_bytes > hp_lim) {
      hp = (Addr)VG_(get_memory_from_mmap)(SUPERBLOCK_SIZE, "rx_malloc");

      // To guarantee that allocation always goes upwards through address
      // space, each superblock must come after the previous one.
      sk_assert(hp_lim < hp);

      hp_lim = hp + SUPERBLOCK_SIZE - 1;

      VG_(printf)("new superblock: %p--%p\n", hp, hp_lim);

      // zero -- this is a *must*;  should happen anyway but be paranoid.
      VG_(memset)((void *)hp, 0, SUPERBLOCK_SIZE);
   }

   // Hand out the block starting at the current heap pointer, then bump it.
   block = hp;
   hp   += n_bytes;

   return (void*)block;
}

/*--------------------------------------------------------------------*/
/*--- Command line options                                         ---*/
/*--------------------------------------------------------------------*/

/* user options -- set by SK_(process_cmd_line_option)() */
Int      RX_(clo_print_depth)    = 20;      /* --print-depth=<n>; negatives become 0 */
Bool     RX_(clo_ignore_esp_ebp) = True;    /* --ignore-esp-ebp=no|yes               */
Bool     RX_(clo_jcc)            = False;   /* --jcc=no|yes                          */
Bool     RX_(clo_skip)           = False;   /* --skip=no|yes                         */
UInt     RX_(clo_inline)         = 2;       /* --inline=: 0 none, 1 some, 2 all      */
Bool     RX_(clo_rewrite)        = True;    /* --rewrite=no|yes                      */
Bool     RX_(clo_a4)             = False;   /* --a4=no|yes                           */
RxFormat RX_(clo_format)         = RxDot;   /* --format=dot|aisee                    */

/* debug options */
Bool     RX_(clo_instr)          = False;   /* --instr=no|yes                        */
Bool     RX_(clo_actions)        = False;   /* --actions=no|yes                      */
Int      RX_(clo_sanity)         = 0;       /* bumped once per -s / --sanity         */
Bool     RX_(clo_bb_entry)       = False;   /* --bb-entry=no|yes                     */
Bool     RX_(clo_write_splits)   = True;    /* --write-splits=no|yes                 */

/* Handle one command line argument.  Options known to this skin update the
   matching RX_(clo_*) variable and yield True;  anything else is passed on
   to VG_(replacement_malloc_process_cmd_line_option)() and its result is
   returned. */
Bool SK_(process_cmd_line_option) ( Char* arg )
{
   /* ---- user options ---- */

   if (VG_CLO_STREQN(14, arg, "--print-depth=")) {
      /* negative depths are clamped to zero */
      Int depth = (Int)VG_(atoll)(&arg[14]);
      RX_(clo_print_depth) = (depth < 0 ? 0 : depth);
   }

   else if (VG_CLO_STREQ(arg, "--ignore-esp-ebp=yes")) {
      RX_(clo_ignore_esp_ebp) = True;
   }
   else if (VG_CLO_STREQ(arg, "--ignore-esp-ebp=no")) {
      RX_(clo_ignore_esp_ebp) = False;
   }

   else if (VG_CLO_STREQ(arg, "--jcc=yes")) {
      RX_(clo_jcc) = True;
   }
   else if (VG_CLO_STREQ(arg, "--jcc=no")) {
      RX_(clo_jcc) = False;
   }

   // XXX: should just obliterate _dl_resolve_symbol, _dl_debug_initialize,
   // _dl_debug_state, (_dl_fini?)

   else if (VG_CLO_STREQ(arg, "--skip=yes")) {
      RX_(clo_skip) = True;
   }
   else if (VG_CLO_STREQ(arg, "--skip=no")) {
      RX_(clo_skip) = False;
   }

   else if (VG_CLO_STREQ(arg, "--inline=all")) {
      RX_(clo_inline) = 2;
   }
   else if (VG_CLO_STREQ(arg, "--inline=some")) {
      RX_(clo_inline) = 1;
   }
   else if (VG_CLO_STREQ(arg, "--inline=none")) {
      RX_(clo_inline) = 0;
   }

   else if (VG_CLO_STREQ(arg, "--rewrite=yes")) {
      RX_(clo_rewrite) = True;
   }
   else if (VG_CLO_STREQ(arg, "--rewrite=no")) {
      RX_(clo_rewrite) = False;
   }

   else if (VG_CLO_STREQ(arg, "--a4=yes")) {
      RX_(clo_a4) = True;
   }
   else if (VG_CLO_STREQ(arg, "--a4=no")) {
      RX_(clo_a4) = False;
   }

   else if (VG_CLO_STREQ(arg, "--format=dot")) {
      RX_(clo_format) = RxDot;
   }
   else if (VG_CLO_STREQ(arg, "--format=aisee")) {
      RX_(clo_format) = RxAiSee;
   }

   /* ---- debug options ---- */

   else if (VG_CLO_STREQ(arg, "--instr=yes")) {
      RX_(clo_instr) = True;
   }
   else if (VG_CLO_STREQ(arg, "--instr=no")) {
      RX_(clo_instr) = False;
   }

   else if (VG_CLO_STREQ(arg, "--actions=yes")) {
      RX_(clo_actions) = True;
   }
   else if (VG_CLO_STREQ(arg, "--actions=no")) {
      RX_(clo_actions) = False;
   }

   /* each occurrence raises the sanity level by one */
   else if (VG_CLO_STREQ(arg, "-s") || VG_CLO_STREQ(arg, "--sanity")) {
      RX_(clo_sanity) += 1;
   }

   else if (VG_CLO_STREQ(arg, "--bb-entry=yes")) {
      RX_(clo_bb_entry) = True;
   }
   else if (VG_CLO_STREQ(arg, "--bb-entry=no")) {
      RX_(clo_bb_entry) = False;
   }

   else if (VG_CLO_STREQ(arg, "--write-splits=yes")) {
      RX_(clo_write_splits) = True;
   }
   else if (VG_CLO_STREQ(arg, "--write-splits=no")) {
      RX_(clo_write_splits) = False;
   }

   else {
      /* not one of ours */
      return VG_(replacement_malloc_process_cmd_line_option)(arg);
   }

   return True;
}

/* Print the help text for this skin's user options, followed by the
   options of the replacement malloc.  The bracketed values are the
   defaults and must be kept in step with the RX_(clo_*) initialisers
   (--print-depth defaults to 20). */
void SK_(print_usage) ( void )
{
   VG_(printf)(
"    --print-depth=<n>         print nodes with depth <n> [20]\n"
"    --ignore-esp-ebp=no|yes   ignore %%esp/%%ebp computations? [yes]\n"
"    --jcc=no|yes              record conditional jumps [no]\n"
"    --skip=no|yes             skip instrumentation of BBs before main() [no]\n"
"    --inline=none|some|all    amount of constant inlining [all]\n"
"    --a4=no|yes               fit graph onto A4 page (dot output only) [no]\n"
"    --rewrite=no|yes          rewrite graph to be prettier [yes]\n"
"    --format=dot|aisee        output format [dot]\n"
   );
   VG_(replacement_malloc_print_usage)();
}  

/* Print the help text for this skin's debugging options, followed by the
   debugging options of the replacement malloc.  Every yes/no option is
   shown with both of the values the option parser accepts. */
void SK_(print_debug_usage) ( void )
{
   VG_(printf)(
"    --instr=no|yes            show instrumentation reasoning [no]\n"
"    --actions=no|yes          show shadow actions [no]\n"
"    -s --sanity               extra sanity checks (1: +regs, 2: +mem) [no]\n"
"    --bb-entry=no|yes         show basic block entry during execution [no]\n"
"    --write-splits=no|yes     write splits in-place? [yes]\n"
   );
   VG_(replacement_malloc_print_debug_usage)();
}

/*--------------------------------------------------------------------*/
/*--- Building shadow nodes                                        ---*/
/*--------------------------------------------------------------------*/

// Initial value of rx_instr_addr_runtime (which only exists when
// RX_SLICING is defined).
#define UNSET_INSTR_ADDR   0xdeadbeef

// Global variable -- fake argument -- to build_foo() functions, for passing
// the actual word for which we are building the shadow word.
static UInt rx_val;

#ifdef RX_SLICING
// Similar global variable to build_foo() functions, for passing
// the x86 instr addr.  build() copies it into each node's instr_addr field.
static UInt rx_instr_addr_runtime = UNSET_INSTR_ADDR;

// Similar globvar for call_builder(), because I am very lazy and can't be
// bothered threading it as an arg.
static UInt rx_instr_addr_compiletime;
#endif


// More fake args for builders that require more than 2 args
// (build_Lea2() reads its scale and displacement nodes from these).
static UInt rx_arg3;
static UInt rx_arg4;

// Running totals maintained by build(): number of nodes built and the
// number of bytes requested for them.
static UInt n_nodes_built = 0;
static UInt n_bytes_built = 0;



// True if 'val' is below 65536 or at/above 0xffff8000.
static __inline__ Bool val_within_size2_limits(UInt val)
{
   if (val < 65536)
      return True;
   if (val >= 0xffff8000)
      return True;
   return False;
}

// True if 'val' is below 256 or at/above 0xffffff80.
static __inline__ Bool val_within_size1_limits(UInt val)
{
   if (val < 256)
      return True;
   if (val >= 0xffffff80)
      return True;
   return False;
}
   
// Allocate (with rx_malloc()) and fill in a new shadow node.
//
//   kind   node kind;  must be < ShLast
//   extra  kind-specific extra datum;  must be < 256
//   size   one of Sz0, Sz1, Sz2, Sz4
//   val    stored in the node's 'val' field
//   argc   number of operands;  must be in the range 0..255
//   argv   the operands (may be NULL when argc is 0);  each must be a
//          4-byte aligned, ie. non-lazy, pointer
//
// Returns the new node.  Panics if an operand is misaligned or if the
// finished node fails RX_(sane_ShW)().
static ShW*
build(ShKind kind, UInt extra, UInt size, UInt val, Int argc, ShW* argv[])
{
   Int  i;
   UInt node_size;
   ShW* sh;

   // Node is checked once built, but these ones are truncated, so we must
   // check they are sensible before putting into the node.  argc is signed,
   // so the lower bound must be checked too:  a negative count would
   // otherwise get past the "< 256" test and corrupt the size computation
   // and the copy loop below.
   sk_assert(kind  < ShLast);
   sk_assert(extra < 256);
   sk_assert(0 <= argc && argc < 256);

   sk_assert(Sz0  == size || Sz1 == size || Sz2 == size || Sz4 == size);

   // One extra word per operand on top of the fixed part of the node.
   node_size = sizeof(ShW) + argc*sizeof(UInt);

   n_nodes_built++;
   n_bytes_built += node_size;
   
   sh            = rx_malloc(node_size);
   sh->kind      = kind;
   sh->extra     = extra;
   sh->argc      = argc;
   sh->size      = size;
   sh->n_parents = PZero;
   sh->rewritten = False;
   sh->graphed   = False;
   sh->val       = val;
#ifdef RX_SLICING
   sh->instr_addr= rx_instr_addr_runtime;
#endif

   for (i = 0; i < argc; i++) {
      // check not lazy and 4byte aligned (ie. last two bits are zero)
      if (! IS_ALIGNED4_ADDR( (UInt)argv[i]) ) {
         VG_(printf)("kind = %d, argv[%d] = %p\n", kind, i, argv[i]);
         VG_(skin_panic)("bad argument in build()");
      }
      sh->argv[i] = argv[i];
   }

   if (RX_(clo_actions)) {
      VG_(printf)("=  build ");
      RX_(pp_ShW)(sh);
      VG_(printf)(" (%p)\n", val);
   }

   if (!RX_(sane_ShW)(sh, /*building*/True)) {
      RX_(up_ShW)(sh);
      RX_(pp_ShW)(sh);
      VG_(skin_panic)("building insane ShW"); 
   }
   return sh;
}

// Build a 4-byte ShSyscall node:  the syscall number goes in the 'extra'
// field, the result 'res' is the node's value, and argv[0..argc-1] are
// its operands.
static ShW*
build_syscall(UInt syscallno, UInt res, Int argc, ShW* argv[])
{
   ShW* node = build(ShSyscall, /*extra*/syscallno, Sz4, /*val*/res,
                     argc, argv);
   return node;
}

// Build a 4-byte ShSysMem node with value 'val' whose single operand is
// the node 'syscall'.
static ShW* 
build_ShSysMem(UInt extra, UInt val, ShW* syscall)
{
   ShW* operands[1];
   operands[0] = syscall;
   return build(ShSysMem, extra, Sz4, val, /*argc*/1, operands);
}

// Build an operand-less ShConst node;  'origin' is stored in the node's
// 'extra' field.
static __inline__ ShW*
build_const(ConstOrigin origin, UInt size, UInt val) 
{
   ShW* konst = build(ShConst, /*extra*/origin, size, val, 0, NULL);
   return konst;
}

// Generates build_code_const<size>(val):  builds a ConstCode constant node
// of size Sz<size> holding 'val'.  The single argument is passed in a
// register (regparm(1)).  Instantiated below for sizes 1, 2 and 4.
#define build_code_const(size)                   \
static __attribute__((regparm(1)))               \
ShW* build_code_const##size(UInt val)            \
{                                                \
   return build_const(ConstCode, Sz##size, val); \
}
build_code_const(1);
build_code_const(2);
build_code_const(4);

// Build a 4-byte ShWiden node over 'a';  the value comes from the rx_val
// fake argument.
static __attribute__((regparm(1)))
ShW* build_Widen(ShW* a)
{
   ShW* operands[1];
   operands[0] = a;
   return build(ShWiden, /*extra*/0, Sz4, rx_val, /*argc*/1, operands);
}

// Generates build_<name>(a):  builds a one-operand Sh<name> node that has
// the same size as its operand 'a' and takes its value from the rx_val
// fake argument.
#define build_unop_function(name)                        \
static __attribute__((regparm(1)))                       \
ShW* build_##name(ShW* a)                                \
{                                                        \
   return build(Sh##name, 0, a->size, rx_val, 1, & a);   \
}
build_unop_function(Inc);
build_unop_function(Dec);

build_unop_function(Not);
build_unop_function(Neg);

build_unop_function(Exit);


// Generates build_<name>(a, b):  builds a two-operand Sh<name> node with
// the size of 'a' and the value held in rx_val.
//
// If --ignore-esp-ebp=yes, ignore 
//    %esp/%ebp +/- constant 
//    %esp/%ebp +/- &(X,Y)       (seen in date, stack-alignment thing)
// ie. when one operand is the SpEspEbp special and the *other* operand is
// an ShConst or ShAnd node, no node is built and the SpEspEbp special
// itself is returned.  The test is symmetric in a and b.
#define build_addsub_function(name)                                        \
static __attribute__((regparm(2)))                                         \
ShW* build_##name(ShW* a, ShW* b)                                          \
{                                                                          \
   ShW* argv[] = { a, b };                                                 \
   if ( ( (a == RX_(specials)[SpEspEbp] &&                                 \
           (ShConst == b->kind || ShAnd == b->kind)) ||                    \
          (b == RX_(specials)[SpEspEbp] &&                                 \
           (ShConst == a->kind || ShAnd == a->kind)) )  &&                 \
        RX_(clo_ignore_esp_ebp) )                                          \
      return RX_(specials)[SpEspEbp];                                      \
   else                                                                    \
      return build(Sh##name, 0, a->size, rx_val, 2, argv);                 \
}
build_addsub_function(Add);
build_addsub_function(Sub);

// Generates build_<name>(a, b):  builds a two-operand Sh<name> node with
// operands (a, b), the size of 'a', and the value held in rx_val.
#define build_binop_function(name)                          \
static __attribute__((regparm(2)))                          \
ShW* build_##name(ShW* a, ShW* b)                           \
{                                                           \
   ShW* argv[] = { a, b };                                  \
   return build(Sh##name, 0, a->size, rx_val, 2, argv);     \
}
build_binop_function(Mul);
build_binop_function(Mul64High);
build_binop_function(Mul64);

build_binop_function(And);
build_binop_function(Or);
// Xor is not generated here;  it has a hand-written builder (build_Xor).
//build_binop_function(Xor);

#if 1
// Builder for xor.  When both operands are the very same node the result
// is represented as a ConstZero constant (of a's size, holding rx_val)
// rather than as an ShXor node.
static __attribute__((regparm(2)))
ShW* build_Xor(ShW* a, ShW* b)
{
   ShW* operands[2];

   if (/* XXX:simplifying && */ a != b) {
      operands[0] = a;
      operands[1] = b;
      return build(ShXor, 0, a->size, rx_val, 2, operands);
   }

   // xor(X,X), sub(X,X) ==> 0
   return build_const(ConstZero, a->size, rx_val);
}
#endif

// Generates build_<name>(a, b) for the shift and rotate operations:  builds
// a two-operand Sh<name> node with the size of 'a' and the value held in
// rx_val.
// Shift/rotate factor (b) is always a 1-byte value
#define build_shrotop_function(name)                        \
static __attribute__((regparm(2)))                          \
ShW* build_##name(ShW* a, ShW* b)                           \
{                                                           \
   ShW* argv[] = { a, b };                                  \
   return build(Sh##name, 0, a->size, rx_val, 2, argv);     \
}
build_shrotop_function(Shl);
build_shrotop_function(Shr);
build_shrotop_function(Sar);
build_shrotop_function(Rol);
build_shrotop_function(Ror);

// Heuristic used by the LEA builders.  Always False when
// --ignore-esp-ebp=yes;  otherwise True when 'a' lies strictly between
// 0x10000000 and 0xf0000000.
static __inline__ Bool looks_like_a_pointer(Addr a)
{
   if (! RX_(clo_ignore_esp_ebp)) {
      return (a > 0x10000000 && a < 0xf0000000);
   }
   return False;
}

// Mostly LEA1/LEA2 are boring, but sometimes they are used like add.  If
// the result (rx_val) looks like a pointer, assume it's boring and don't
// store the args:  a ConstLea1 constant is built instead of an ShLea1Add
// node.
// (XXX: Might be able to do this better by looking at how the address is
// used in UCode -- eg. if it's only used as a LD/ST address it's boring)
static __attribute__((regparm(2)))
ShW* build_Lea1(ShW* base, ShW* displ)
{
   ShW* operands[2];

   sk_assert(base->size == displ->size);

   if (! looks_like_a_pointer(rx_val)) {
      operands[0] = base;
      operands[1] = displ;
      return build(ShLea1Add, 0, base->size, rx_val, 2, operands);
   }

   return build(ShConst, ConstLea1, base->size, rx_val, 0, NULL);
}

// As build_Lea1(), but for the four-operand form.  The scale and
// displacement nodes arrive through the rx_arg3/rx_arg4 fake arguments.
// Builds a ConstLea2 constant if rx_val looks like a pointer, otherwise an
// ShLea2Add node over (base, Index, scale, displ).
static __attribute__((regparm(2)))
ShW* build_Lea2(ShW* base, ShW* Index)
{
   ShW* scale = (ShW*)rx_arg3;
   ShW* displ = (ShW*)rx_arg4;
   ShW* operands[4];

   sk_assert(base->size  == displ->size 
          && scale->size == displ->size);

   if (! looks_like_a_pointer(rx_val)) {
      operands[0] = base;
      operands[1] = Index;
      operands[2] = scale;
      operands[3] = displ;
      return build(ShLea2Add, 0, base->size, rx_val, 4, operands);
   }

   return build(ShConst, ConstLea2, base->size, rx_val, 0, NULL);
}

// Global var for passing in extra args, yuk.  The three-operand builders
// below (Mod64, Div64, Shrd, Shld) read their third operand, a ShW*
// stored as a UInt, from here.
static UInt rx_third_arg = 0x314159ff;

// Generates build_<name>(a, b) for the three-operand operations;  the third
// operand 'c' is taken from rx_third_arg.  All three operands must have
// the same size.  If c is a Sar node with value 0 whose operands are
// (b, constant 31), a two-operand node over (b, a) is built instead of
// the three-operand node over (a, b, c).
#define build_trinop_function(name)                            \
static __attribute__((regparm(2)))                             \
ShW* build_##name(ShW* a, ShW* b)                              \
{                                                              \
   ShW* argv[3];                                               \
   ShW* c = (ShW*)rx_third_arg;                                \
   sk_assert(a->size == b->size && b->size == c->size);        \
   if (ShSar   == c->kind &&                                   \
       0       == c->val  &&                                   \
       ShConst == c->argv[1]->kind &&                          \
       31      == c->argv[1]->val  &&                          \
       b       == c->argv[0])                                  \
   {                                                           \
      /* div|mod(X, Y, sar(Y,31)==0L)   ==>   div|mod(Y, X) */ \
      argv[0] = b;                                             \
      argv[1] = a;                                             \
      return build(Sh##name, 0, a->size, rx_val, 2, argv);     \
   } else {                                                    \
      argv[0] = a;                                             \
      argv[1] = b;                                             \
      argv[2] = c;                                             \
      return build(Sh##name, 0, a->size, rx_val, 3, argv);     \
   }                                                           \
}
build_trinop_function(Mod64);
build_trinop_function(Div64);

// Build a three-operand ShShrd node over (a, b, c), where c comes from the
// rx_third_arg fake argument.  Nb: the node takes its size from b.
static __attribute__((regparm(2)))
ShW* build_Shrd(ShW* a, ShW* b)
{
   ShW* operands[3];

   operands[0] = a;
   operands[1] = b;
   operands[2] = (ShW*)rx_third_arg;

   return build(ShShrd, 0, b->size, rx_val, 3, operands);
}

// Build a three-operand ShShld node over (a, b, c), where c comes from the
// rx_third_arg fake argument.  Nb: the node takes its size from b.
static __attribute__((regparm(2)))
ShW* build_Shld(ShW* a, ShW* b)
{
   ShW* operands[3];

   operands[0] = a;
   operands[1] = b;
   operands[2] = (ShW*)rx_third_arg;

   return build(ShShld, 0, b->size, rx_val, 3, operands);
}

// Build a 1-byte ShCC2Val node for condition code 'condcode' with operand
// 'a'.  Only the low byte of rx_val is used as the value, and it must be
// 0 or 1.
static __attribute__((regparm(2)))
ShW* build_cc2val(UInt condcode, ShW* a)
{
   ShW* operands[1];
   UInt val;

   val = rx_val & 0x000000ff;  // only look at single byte
   sk_assert(val == 0 || val == 1);

   operands[0] = a;
   return build(ShCC2Val, condcode, Sz1, val, 1, operands);   // Size = 1 (always)
}

// Build a ConstSpecial constant holding rx_val.
static __attribute__((regparm(0)))
ShW* build_Special(void)
{
   ShW* special = build_const(ConstSpecial, Sz4, rx_val);   // Size = 4 (always)
   return special;
}

// Capacity of conds_made[].
#define MAX_CONDS_RECORDED    5000

// NOTE(review): not referenced in the part of the file reviewed here.
static ShW* last_condcode_op;

// The ShCond nodes built by build_Cond(), in the order they were built;
// conds_n is the number stored so far.
static UInt        conds_n = 0;
static ShW* conds_made[MAX_CONDS_RECORDED];

// Record a conditional:  build a zero-sized ShCond node for 'condcode'
// with operand 'a' and append it to conds_made[].  Does nothing unless
// main() has been entered and not yet exited.  Panics once
// MAX_CONDS_RECORDED nodes have been recorded.
static __attribute__((regparm(2))) 
void build_Cond(UInt condcode, ShW* a)
{
   if (! entered_main || exited_main)
      return;

   // Check for room *before* storing, so a full table is never written
   // past its end.
   if (conds_n >= MAX_CONDS_RECORDED)
      VG_(skin_panic)("too many conds, increase MAX_CONDS_RECORDED");

   conds_made[conds_n] = build(ShCond, condcode, Sz0, /*rx_val*/666, 1, & a);
   conds_n++;
}

/*--------------------------------------------------------------------*/
/*--- Shadow memory and registers                                  ---*/
/*--------------------------------------------------------------------*/

// The special nodes, indexed by their Sp* kind;  filled in by
// init_specials().
ShW*  RX_(specials)[SpLast];

// True if 'kind' lies in the range SpError .. SpLast-1 (inclusive).
Bool RX_(is_special_node)(ShKind kind)
{
   if (kind < SpError)
      return False;
   if (kind > SpLast-1)
      return False;
   return True;
}

// Build one operand-less, 4-byte node for each special kind and store it
// in RX_(specials)[kind].  Each node is given an easily recognised
// repeated-digit value.
static void init_specials(void)
{
// Local helper:  build the special node 'name' with value 'val'.
#define sp(name, val) \
   RX_(specials)[name] = build(name, 0, Sz4, val, 0, NULL);

   sp(SpError,      0xeeeeeeee);
   sp(SpStartup,    0x11111111);
   sp(SpHeap,       0x22222222);
   sp(SpZeroHeap,   0x33333333);
   sp(SpBrk,        0x44444444);
   sp(SpUnknownMem, 0x55555555);
   sp(SpUnknownReg, 0x66666666);
   sp(SpSyscallReg, 0x77777777);
   sp(SpSignalReg,  0x88888888);
   sp(SpPthOpReg,   0x99999999);
   sp(SpClReqReg,   0xaaaaaaaa);
   sp(SpRep,        0xbbbbbbbb);
   sp(SpEspEbp,     0xcccccccc);

#undef sp
}

static void rx_post_regs_write_init ( void )
{
   UInt i;
   for (i = R_EAX; i <= R_EDI; i++)
      VG_(set_shadow_archreg)( i, (UInt)RX_(specials)[SpUnknownReg] );
   VG_(set_shadow_eflags)( (UInt)RX_(specials)[SpUnknownReg] );
}

// Shadow register 'reg' of thread 'tid' with the SpSyscallReg special.
static void rx_post_reg_write_syscall    ( ThreadId tid, UInt reg )
{
   ShW* special = RX_(specials)[SpSyscallReg];
   VG_(set_thread_shadow_archreg)( tid, reg, (UInt)special );
}
   
// Shadow register 'reg' of thread 'tid' with the SpSignalReg special.
static void rx_post_reg_write_signal ( ThreadId tid, UInt reg )
{
   ShW* special = RX_(specials)[SpSignalReg];
   VG_(set_thread_shadow_archreg)( tid, reg, (UInt)special );
}
   
// Shadow register 'reg' of thread 'tid' with the SpPthOpReg special.
static void rx_post_reg_write_pthread ( ThreadId tid, UInt reg ) 
{
   ShW* special = RX_(specials)[SpPthOpReg];
   VG_(set_thread_shadow_archreg)( tid, reg, (UInt)special );
}
   
// Shadow register 'reg' of thread 'tid' with the SpClReqReg special.
static void rx_post_reg_write_clientreq ( ThreadId tid, UInt reg )
{
   ShW* special = RX_(specials)[SpClReqReg];
   VG_(set_thread_shadow_archreg)( tid, reg, (UInt)special );
}

   
// Number of shadow entries in one secondary map.
#define SEC_MAP_WORDS        16384

/* 1 (4 byte) word per (4 byte) word --> 64KB per 64KB page */
typedef
   struct {
      ShW* shadow[SEC_MAP_WORDS];
   }
   SecMap;

// Two-level shadow map.  primary_map is indexed by the top 16 bits of an
// address.  Every entry initially points at the shared distinguished map
// (see init_shadow_memory()) and is replaced by a private secondary map
// the first time ENSURE_MAPPABLE is applied to an address it covers.
static SecMap* primary_map[ 65536 ];
static SecMap  distinguished_secondary_map;

#define IS_DISTINGUISHED_SM(smap) \
   ((smap) == &distinguished_secondary_map)

// Make sure the primary map entry covering 'addr' points at a private
// secondary map, allocating one if it still points at the distinguished
// map.  Nb: the 'caller' argument is not used.
#define ENSURE_MAPPABLE(addr,caller)                        \
   do {                                                     \
      if (IS_DISTINGUISHED_SM(primary_map[(addr) >> 16])) { \
         primary_map[(addr) >> 16] = alloc_secondary_map(); \
      } \
   } while(0)

// Set up the shadow map:  fill the distinguished secondary map with the
// SpUnknownMem special and point every primary map entry at it.
static void init_shadow_memory ( void )
{
   Int word;
   Int page;

   // Use SpUnknownMem rather than SpError so that buggy programs that
   // read wildly inaccessible memory don't cause assertion failures.
   word = 0;
   while (word < SEC_MAP_WORDS) {
      distinguished_secondary_map.shadow[word] = RX_(specials)[SpUnknownMem];
      word++;
   }

   /* These entries gradually get overwritten as the used address
      space expands. */
   page = 0;
   while (page < 65536) {
      primary_map[page] = &distinguished_secondary_map;
      page++;
   }
}

// Obtain a fresh secondary map from VG_(get_memory_from_mmap)() and set
// every entry in it to the SpUnknownMem special.
static SecMap* alloc_secondary_map ( void )
{
   SecMap* fresh;
   UInt    word = 0;

   fresh = VG_(get_memory_from_mmap)( sizeof(SecMap), "alloc_secondary_map" );

   while (word < SEC_MAP_WORDS) {
      fresh->shadow[word] = RX_(specials)[SpUnknownMem];
      word++;
   }

   return fresh;
}

// A shadow pointer is "lazy" when its lowest bit is set.
static __inline__ 
Bool is_lazy(ShW* sh)
{
   UInt tag_bit = (UInt)sh & 0x00000001;
   return (0x1 == tag_bit);
}

// Tag a (not already lazy) shadow pointer as lazy by setting its lowest
// bit.
static __inline__
ShW* make_lazy(ShW* sh)
{
   UInt tagged;

   sk_assert(! is_lazy(sh));

   tagged = (UInt)sh | 0x00000001;    // set last bit
   return (ShW*)tagged;
}

/* Candidates for laziness:
  - static memory: good, because there is a lot of it and most is never used
    (esp. code)
  - ShSysmem: no, because it prevents memblock offsets from being stored in
    the .extra field
  - ShFnOuts: N/A, because they're added at rewrite time
  - ShOblitOut: yes, again because there are many and lots are never used
*/
// Turn the lazy shadow pointer 'sh', found for the (non-zero, 4-byte
// aligned) address 'a', into a real node.  The word currently stored at
// 'a' is read and becomes the value of the new node.  Only lazy
// SpStartup pointers are supported at present;  any other kind panics.
// Returns the new node;  the caller is responsible for storing it.
static ShW* delazify(Addr a, ShW* sh)
{
   UInt val;

   // check aligned and lazy (ie. last two bits are 01)
   sk_assert( ((UInt)sh & 0x00000003) == 0x1); 
   sk_assert( 0 != a );
   sk_assert( (a & 0x00000003) == 0x0);

   // zero last bit, removing lazy tag
   sh = (ShW*)( (UInt)sh & 0xfffffffe );  

   // fetch the word currently stored at 'a'
   val = * (UInt*)a;

   switch (sh->kind) {
   //             -lz-> SpStartup()
   // becomes     ----> ShConst(val)
   case SpStartup: 
      sh = build_const(ConstStartup, Sz4, val);
      break;

   //             -lz-> ShOblit()
   // becomes     ----> ShOblitOut(val) ----> ShOblit
//   case ShOblit:
//      sh = build(...)
//      break;

   //             -lz-> ShSyscall(X)
   // becomes     ----> ShSysMem(val) ----> ShSyscall(X)
   case ShSyscall:
      // Deliberately disabled:  the panic comes first, so the build below
      // only runs if VG_(skin_panic)() returns.
      VG_(skin_panic)("not doing ShSysMem lazily...");
      sh = build_ShSysMem( /*offset*/0, val, sh );
      break;
                   
   default:
      VG_(skin_panic)("lazy pointer to non-lazy node");
   }

   if (RX_(clo_actions)) {
      VG_(printf)("=  delazified %p : ", a);
      RX_(pp_ShW)(sh);
      VG_(printf)(" (%p)\n", val );
   }

   return sh;
}


/* Each word has one shadow word, so the distinguished secondary map has 16K 
   entries (one per word in the 64KB page).  Lookup is done as follows:

     bits 31..16:   primary map lookup
     bits 15.. 2:   secondary map lookup
     bits  1.. 0:   ignored due to word granularity
*/
// Store 'sh' (which may carry the lazy tag) as the shadow of the word
// containing address 'a', first giving that page a private secondary map
// if it does not have one yet.  The SpError special must never be stored.
static __attribute__((regparm(2)))
void set_mem_shadow ( Addr a, ShW* sh )
{
   SecMap* secmap;
   UInt    word_idx;
   ShW*    untagged;

   ENSURE_MAPPABLE(a, "set_mem_shadow");

   // allow for possibility of laziness
   untagged = (ShW*)((UInt)sh & 0xfffffffe);
   sk_assert(untagged->kind != SpError);

   /* Use bits 31..16 for primary, 15..2 for secondary lookup */
   secmap   = primary_map[a >> 16];
   word_idx = (a & 0xfffc) >> 2;

   secmap->shadow[word_idx] = sh;

   sk_assert(!IS_DISTINGUISHED_SM(secmap)); 
}

// Return the shadow node of the word containing address 'a'.  A lazy
// entry is first turned into a real node, which is written back to the
// shadow map.  A debug message is emitted if the address falls in the
// distinguished secondary map.
__attribute__((regparm (1)))
ShW* RX_(get_mem_shadow) ( Addr a )
{
   /* Use bits 31..16 for primary, 15..2 for secondary lookup */
   SecMap* secmap   = primary_map[a >> 16];
   UInt    word_idx = (a & 0xfffc) >> 2;
   ShW*    node;

   // Can happen with buggy programs that read unaccessible memory
   if (IS_DISTINGUISHED_SM(secmap)) {
      VG_(message)(Vg_DebugMsg, "accessed distinguished 2ndary map! %p\n", a);
   }

   node = secmap->shadow[word_idx];

   if ( is_lazy(node) ) {
      // materialise the node from the word-aligned address, then remember it
      node = delazify(a & 0xfffffffc, node);
      set_mem_shadow(a, node);
   }
   sk_assert(node->kind != SpError);

   return node;
}

// As set_mem_shadow(), but a no-op when --write-splits=no.
static __inline__
void maybe_set_mem_shadow ( Addr a, ShW* sh )
{
   if (! RX_(clo_write_splits))
      return;

   set_mem_shadow(a, sh);
}

// As VG_(set_shadow_archreg)(), but a no-op when --write-splits=no.
static __inline__
void maybe_set_shadow_archreg ( UInt archreg, ShW* sh )
{
   if (! RX_(clo_write_splits))
      return;

   VG_(set_shadow_archreg)(archreg, (UInt)sh);
}

// Shadow every word in [start_a, start_a+len) with 'sh'.  's' names the
// kind of memory in the --actions=yes trace.  start_a must be 4-byte
// aligned;  len must be too unless is_len_aligned is set.
static void rx_new_mem(Char *s, ShW* sh, Addr start_a, UInt len,
                       Bool is_len_aligned)
{
   Addr cur;
   Addr end = start_a + len;

   if (RX_(clo_actions)) {
      VG_(printf)("=  new %s mem: %x..%x, %d\n", s, start_a, start_a+len,len);
   }

   // For heap allocations, we know the length allocated is actually 'len'
   // rounded up to the nearest 4 bytes.  So this is safe.
   sk_assert(IS_ALIGNED4_ADDR(start_a) && 
             (is_len_aligned || IS_ALIGNED4_ADDR(len)));

   cur = start_a;
   while (cur < end) {
      set_mem_shadow4 ( cur, sh );
      cur += 4;
   }
}

// Handler for memory present at startup.  The rr/ww/xx permission flags
// are ignored.  In the branch that is compiled in, every word of the range
// is shadowed with a lazy pointer to the SpStartup special.
static void rx_new_mem_startup ( Addr start_a, UInt len,
                                 Bool rr, Bool ww, Bool xx )
{
// Disabled alternative:  eagerly build one ConstStartup constant per word.
#if 0 //#ifdef RX_SLICING
   Addr a;

   if (RX_(clo_actions))
      VG_(printf)("=  new startup mem: %x..%x, %d\n", start_a, start_a+len,len);

   // pray it's aligned!
   sk_assert(IS_ALIGNED4_ADDR(start_a) && IS_ALIGNED4_ADDR(len));
   for (a = start_a; a < start_a + len; a += 4) {

      // Build the node.  If the memory is from vgskin_redux.so the value of
      // *(UInt*)a could change within build (eg. `hp' within rx_malloc()).
      // To make sure sh->val is correct, reset it after build().
      ShW* sh;
      //VG_(printf)("pre:   %p\n", *(UInt*)a);
      sh = build(ShConst, ConstStartup, Sz4, 0, 0, NULL);
      sh->val = * (UInt*)a;
      //VG_(printf)("post:  %p\n", *(UInt*)a);
      set_mem_shadow4 ( a, sh );
   }

#else
   /* Ignore permissions */
   // NOTE(review): this message is printed unconditionally, and
   // rx_new_mem() prints the same line again under --actions=yes.  The
   // indentation suggests an "if (RX_(clo_actions))" guard was removed,
   // possibly for debugging -- confirm whether it should be restored.
      VG_(printf)("=  new startup mem: %x..%x, %d\n", start_a, start_a+len,len);
   rx_new_mem("startup", make_lazy( RX_(specials)[SpStartup] ), start_a, len, False);
#endif
}

// Shadow a new heap block with the SpZeroHeap special if it was
// initialised, or with SpHeap otherwise.  The length need not be a
// multiple of 4.
static void rx_new_mem_heap ( Addr a, UInt len, Bool is_inited )
{  
   ShW* special;

   if (is_inited)
      special = RX_(specials)[SpZeroHeap];
   else
      special = RX_(specials)[SpHeap];

   rx_new_mem("heap", special, a, len, True);
}

// Shadow memory obtained through brk with the SpBrk special.
static void rx_new_mem_brk ( Addr a, UInt len )
{
   ShW* special = RX_(specials)[SpBrk];
   rx_new_mem("brk",  special, a, len, False);
}

// Not implemented:  panics unconditionally.  None of the arguments is
// used.
static void rx_copy_mem_remap(Addr from, Addr to, UInt len)
{
   VG_(skin_panic)("rx_copy_mem_remap");
}

// Not implemented:  panics unconditionally.  None of the arguments is
// used.
static void rx_change_mem_mprotect(Addr a, UInt len, Bool rr, Bool ww, Bool xx)
{
   VG_(skin_panic)("rx_change_mem_mprotect");
}

// Reset the shadow of every word in [start_a, start_a+len) to the
// SpUnknownMem special.  Both arguments must be multiples of 4.
static void rx_die_mem_aligned(Addr start_a, UInt len)
{
   Addr cur;
   Addr end;

   sk_assert(IS_ALIGNED4_ADDR(start_a) && IS_ALIGNED4_ADDR(len));

   end = start_a + len;
   cur = start_a;
   while (cur < end) {
      set_mem_shadow4(cur, RX_(specials)[SpUnknownMem]);
      cur += 4;
   }
}

// Heap variant of rx_die_mem_aligned():  an unaligned length is first
// rounded up to the next multiple of 4.
static void rx_die_mem_heap(Addr start_a, UInt len)
{
   UInt rounded_len = len;

   // We know that heap sizes are really all aligned.
   sk_assert(IS_ALIGNED4_ADDR(start_a));

   if (!IS_ALIGNED4_ADDR(len))
      rounded_len = align4(len) + 4;

   rx_die_mem_aligned(start_a, rounded_len);
}

// General variant of rx_die_mem_aligned().  Unaligned ranges are not
// supported yet:  they are reported and cause a panic.
static void rx_die_mem(Addr start_a, UInt len)
{
   if (!IS_ALIGNED4_ADDR(start_a) || !IS_ALIGNED4_ADDR(len)) {
      VG_(printf)("*** start_a = %p, len = %p\n", start_a, len);
      VG_(skin_panic)("die_mem_stack (unaligned)");

      // do the ends, then call rx_die_mem_aligned for the middle
   } else {
      rx_die_mem_aligned(start_a, len);
   }
}

// Copy the shadows of the 'len' bytes starting at 'old' to the range
// starting at 'new', one word at a time.  All three arguments must be
// multiples of 4.
static void rx_copy_mem(Addr old, Addr new, UInt len )
{
   UInt offset = 0;

   sk_assert(IS_ALIGNED4_ADDR(old) && IS_ALIGNED4_ADDR(new) &&
             IS_ALIGNED4_ADDR(len));

   while (offset < len) {
      ShW* shadow = RX_(get_mem_shadow)(old+offset);
      set_mem_shadow(new+offset, shadow);
      offset += 4;
   }
}


/*--------------------------------------------------------------------*/
/*--- Shadow memory/register splitting, getting, setting           ---*/
/*--------------------------------------------------------------------*/

// Clamp an offset so that it fits in 8 bits:  anything from 254 upwards
// is returned as 255.
static UInt saturate(UInt offset)
{
   return (offset >= 254) ? 255 : offset;    // saturate at 8 bits
}

// For split4B'ing sp2W(X,_) or sp2W(_,X)
//
// Converts half Xn (0 or 1) of the ShSplit2W node 'shadow4' into two byte
// nodes, written to argv[2*Xn] and argv[2*Xn+1].  valA and valB are the
// values of those two bytes (lower byte first).
static __inline__
void split4B_ShSplit2W(ShW* shadow4, UInt Xn, ShW** argv, UInt valA, UInt valB) 
{
   UInt XA = 2*Xn;                  // output slot for the first byte
   UInt XB = XA + 1;                // output slot for the second byte
   ShW* X  = shadow4->argv[Xn];     // the half being split

   sk_assert(0 == Xn || 1 == Xn);     // 0th or 1st word 

   // sp2W(c.w,_  ) --> sp4(c0.B,c1.B,_,   _   )
   // sp2W(_,  c.w) --> sp4(_,   _,   c0.B,c1.B)
   if (ShConst == X->kind) {
      UInt const_origin = X->extra;
      argv[XA] = build_const( const_origin, Sz1, valA);
      argv[XB] = build_const( const_origin, Sz1, valB);

   // sp2W(W0(Y),_    ) --> sp4B(B0(Y),B1(Y),_,    _    )
   // sp2W(W1(Y),_    ) --> sp4B(B2(Y),B3(Y),_,    _    )
   // sp2W(_,    W0(Y)) --> sp4B(_,    _,    B0(Y),B1(Y))
   // sp2W(_,    W1(Y)) --> sp4B(_,    _,    B2(Y),B3(Y))
   } else if (ShReadW == X->kind) {
      ShW* Y  = X->argv[0];
      UInt Yn = X->extra;
      sk_assert(0 == Yn || 1 == Yn);     // 0th or 1st word 
      argv[XA] = build( ShReadB, 2*Yn,   Sz1, valA, 1, &Y);
      argv[XB] = build( ShReadB, 2*Yn+1, Sz1, valB, 1, &Y);

   // sp2W(sp2B(Y,Z),_        ) --> sp4B(Y,Z,_,_)
   // sp2W(_,        sp2B(Y,Z)) --> sp4B(_,_,Y,Z)
   } else if (ShSplit2B == X->kind) {
      // already split into bytes:  reuse the existing byte nodes
      argv[XA] = X->argv[0];
      argv[XB] = X->argv[1];

   // sp2W(X,_) --> sp4B(B0(X),B1(X),_,    _    )
   // sp2W(_,X) --> sp4B(_,    _,   ,B2(X),B3(X))
   } else {
      argv[XA] = build( ShReadB, 2*Xn,   Sz1, valA, 1, &X);
      argv[XB] = build( ShReadB, 2*Xn+1, Sz1, valB, 1, &X);
   }
}

// Build an ShSplit4B node describing the 4-byte node 'shadow4' as four
// separate byte nodes.  How the byte nodes are made depends on the kind of
// shadow4:  constants, ShSplit2W nodes, widened single bytes and ShSysMem
// nodes get special treatment;  anything else becomes four ShReadB nodes
// over shadow4.  shadow4 must be 4 bytes in size and must not already be
// an ShSplit4B.  Returns the new node, whose value is 'val'.
//
// Nb: at this point, shadow4->val is out of date, so we must use val
static ShW* split4B(ShW* shadow4, UInt val)
{
   ShW* argv[4] = { NULL, NULL, NULL, NULL };

   // the four bytes of val, lowest byte first
   UInt val0 = (val & 0x000000ff) >> 0;
   UInt val1 = (val & 0x0000ff00) >> 8;
   UInt val2 = (val & 0x00ff0000) >> 16;
   UInt val3 = (val & 0xff000000) >> 24;

   sk_assert(Sz4       == shadow4->size);
   sk_assert(ShSplit4B != shadow4->kind);

   if (RX_(clo_actions)) {
      VG_(printf)("=  split4B ");
      RX_(pp_ShW)(shadow4);
      VG_(printf)(" (%p), replacing with %p\n", shadow4->val, val);
   }

   // c.L --> sp4B(c0.B, c1.B, c2.B, c3.B)
   if (ShConst == shadow4->kind) {
      UInt const_origin = shadow4->extra;
      argv[0] = build_const( const_origin, Sz1, val0);
      argv[1] = build_const( const_origin, Sz1, val1);
      argv[2] = build_const( const_origin, Sz1, val2);
      argv[3] = build_const( const_origin, Sz1, val3);

   // sp2W(X,Y) --> sp4B(...)
   } else if (ShSplit2W == shadow4->kind) {
      split4B_ShSplit2W(shadow4, 0, argv, val0, val1);
      split4B_ShSplit2W(shadow4, 1, argv, val2, val3);

   // widen(X.B) --> sp4B(X.B, 0, 0, 0)   
   } else if (ShWiden == shadow4->kind && 1 == shadow4->argv[0]->size) {
      sk_assert(0 == (val & 0xffffff00));   
      argv[0] = shadow4->argv[0];
      // a single zero constant is shared by the three upper bytes
      argv[1] = build_const ( ConstWiden, Sz1, 0 );
      argv[2] = argv[1];
      argv[3] = argv[1];

   // Not bothering with:
   // widen(c.w) --> sp4B(c0.B, c1.B, 0.B,0.B)
   // widen(X.w) --> sp4B(B0(X),B1(X),0.B,0.B.)

   // (m)(X.L) --> sp4B((m)(X0.b), ..., (m)(X3.b))
   } else if (ShSysMem == shadow4->kind) {
      ShW** sub_argv = &shadow4->argv[0];
      UInt offset = shadow4->extra;
      // each byte records its own offset, saturated to fit in 'extra'
      argv[0] = build( ShSysMem, saturate(offset+0), Sz1, val0, 1,
                             sub_argv );
      argv[1] = build( ShSysMem, saturate(offset+1), Sz1, val1, 1,
                             sub_argv );
      argv[2] = build( ShSysMem, saturate(offset+2), Sz1, val2, 1,
                             sub_argv );
      argv[3] = build( ShSysMem, saturate(offset+3), Sz1, val3, 1,
                             sub_argv );
   } else {
      // X --> sp4B(B0(X),B1(X),B2(X),B3(X)), for any sub-fields not yet handled
      argv[0] = build( ShReadB, 0, Sz1, val0, 1, &shadow4 );
      argv[1] = build( ShReadB, 1, Sz1, val1, 1, &shadow4 );
      argv[2] = build( ShReadB, 2, Sz1, val2, 1, &shadow4 );
      argv[3] = build( ShReadB, 3, Sz1, val3, 1, &shadow4 );
   }
   return build( ShSplit4B, 0, Sz4, val, 4, argv );
}

// For split2W'ing sp4B(X1,X2,_,_) or sp4B(_,_,X1,X2).
//
// 'shadow4' is read as a 4-argument node;  'Xn' selects which pair of its
// byte shadows to combine (0: argv[0],argv[1];  1: argv[2],argv[3]).  The
// resulting 2-byte shadow, carrying the value 'val', is stored in argv[Xn]
// of the caller-supplied output array 'argv'.
static __inline__
void split2W_ShSplit4B(ShW* shadow4, UInt Xn, ShW** argv, UInt val) 
{
   ShW* X1;
   ShW* X2;

   // Validate Xn *before* it is used to index shadow4->argv[], so a bad
   // value is reported by the assertion rather than by a stray read.
   sk_assert(0 == Xn || 1 == Xn);     // 0th or 1st word 

   X1 = shadow4->argv[2*Xn];
   X2 = shadow4->argv[2*Xn+1];

   // sp4B(c0.B,c1.B,   _,   _) --> sp2W(c.W,_  )
   // sp4B(   _,   _,c0.B,c1.B) --> sp2W(  _,c.W)
   // (only when both constants record the same origin in .extra)
   if (ShConst == X1->kind && ShConst == X2->kind && 
       X1->extra == X2 ->extra) 
   {
      UInt const_origin = X1->extra;
      argv[Xn] = build_const( const_origin, Sz2, val );

   // sp4B(B0(Y),B1(Y),    _,    _) --> sp2W(W0(Y),    _)
   // sp4B(B2(Y),B3(Y),    _,    _) --> sp2W(W1(Y),    _)
   // sp4B(    _,    _,B0(Y),B1(Y)) --> sp2W(    _,W0(Y))
   // sp4B(    _,    _,B2(Y),B3(Y)) --> sp2W(    _,W1(Y))
   } else if (ShReadB == X1->kind && ShReadB == X2->kind && 
              X1->argv[0] == X2->argv[0] &&
              ( (0 == X1->extra && 1 == X2->extra) ||
                (2 == X1->extra && 3 == X2->extra)))
   {
      ShW* Y  = X1->argv[0];
      UInt Yn = X1->extra;
      sk_assert(0 == Yn || 2 == Yn);     // 0th or 2nd 
      // Byte index 0/2 of Y becomes word index 0/1 of Y.
      argv[Xn] = build( ShReadW, Yn/2, Sz2, val, 1, &Y);

   // sp4B(X1,X2, _, _) --> sp2W(sp2B(X1,X2),          _)
   // sp4B( _, _,X1,X2) --> sp2W(          _,sp2B(X1,X2))
   } else {
      ShW* sub_argv[] = { X1, X2 };
      argv[Xn] = build( ShSplit2B, 0, Sz2, val, 2, sub_argv);
   }
}

// Builds and returns an ShSplit2W node equivalent to the 4-byte shadow
// 'shadow4', whose value is now 'val'.  The two arguments of the new node
// shadow the low and high 16 bits of 'val' respectively.  Must not be
// called on a node that already is an ShSplit2W.
static ShW* split2W(ShW* shadow4, UInt val)
{
   ShW* halves[2];
   UInt lo16 = val & 0x0000ffff;
   UInt hi16 = (val >> 16) & 0x0000ffff;

   sk_assert(Sz4       == shadow4->size);
   sk_assert(ShSplit2W != shadow4->kind);

   if (RX_(clo_actions)) {
      VG_(printf)("=  split2W ");
      RX_(pp_ShW)(shadow4);
      VG_(printf)(" (%p), replacing with %p\n", shadow4->val, val);
   }

   if (shadow4->kind == ShConst) {
      // c.L --> sp2W(c0.W, c1.W), keeping the constant's origin
      UInt const_origin = shadow4->extra;
      halves[0] = build_const( const_origin, Sz2, lo16 );
      halves[1] = build_const( const_origin, Sz2, hi16 );

   } else if (shadow4->kind == ShWiden && shadow4->argv[0]->size == Sz2) {
      // widen.W(X) --> sp2W(X, 0)
      //
      // Not bothering with:
      //   widen(c.B) --> sp2W(c.W,0.W)
      //   widen(X.B) --> sp2W(W0(widen(X.B)), 0.W)
      sk_assert(0 == (val & 0xffff0000));   
      halves[0] = shadow4->argv[0];
      halves[1] = build_const( ConstWiden, Sz2, 0 );

   } else if (shadow4->kind == ShSplit4B) {
      // sp4B(W,X,Y,Z) --> sp2W(...), one byte pair at a time
      split2W_ShSplit4B(shadow4, 0, halves, lo16);
      split2W_ShSplit4B(shadow4, 1, halves, hi16);

   } else if (shadow4->kind == ShSysMem) {
      // (m)(X.L) --> sp2W((m)(X0.w), (m)(X1.w))
      //
      // XXX: want to get rid of this
      halves[0] = build( ShSysMem, 0, Sz2, lo16, 1, shadow4->argv );
      halves[1] = build( ShSysMem, 1, Sz2, hi16, 1, shadow4->argv );

   } else {
      // X --> sp2W(W0(X), W1(X))
      halves[0] = build( ShReadW, 0, Sz2, lo16, 1, &shadow4 );
      halves[1] = build( ShReadW, 1, Sz2, hi16, 1, &shadow4 );
   }

   return build( ShSplit2W, 0, Sz4, val, 2, halves );
}

// Builds and returns an ShSplit2B node equivalent to the 2-byte shadow
// 'shadow2', whose value is now 'val'.  Only constants and ShReadW nodes
// are handled;  any other kind ends in a skin panic.  Must not be called
// on a node that already is an ShSplit2B.
static ShW* split2B(ShW* shadow2, UInt val)
{
   ShW* bytes[2];
   UInt lo8 = val & 0x000000ff;
   UInt hi8 = (val >> 8) & 0x000000ff;

   sk_assert(Sz2       == shadow2->size);
   sk_assert(ShSplit2B != shadow2->kind);

   if (RX_(clo_actions)) {
      VG_(printf)("=  split2B ");
      RX_(pp_ShW)(shadow2);
      VG_(printf)(" (%p), replacing with %p\n", shadow2->val, val);
   }

   if (shadow2->kind == ShConst) {
      // c.w --> sp2B(c0.B,c1.B), keeping the constant's origin
      UInt const_origin = shadow2->extra;
      bytes[0] = build_const( const_origin, Sz1, lo8 );
      bytes[1] = build_const( const_origin, Sz1, hi8 );

   } else if (shadow2->kind == ShReadW) {
      // W0(L) --> sp2B( B0(L), B1(L) )
      // W1(L) --> sp2B( B2(L), B3(L) )
      UInt n = shadow2->extra;
      sk_assert(n == 0 || n == 1);

      bytes[0] = build( ShReadB, 2*n,     Sz1, lo8, 1, shadow2->argv );
      bytes[1] = build( ShReadB, 2*n + 1, Sz1, hi8, 1, shadow2->argv );

   } else {
      // X.w --> sp2B( B0(X), B1(X) ) is not implemented
      VG_(skin_panic)("Can't handle general split2B case yet");
   }

   return build( ShSplit2B, 0, Sz2, val, 2, bytes );
}

// Returns 'sh' unchanged if it is already an ShSplit4B node.  Otherwise
// splits it (using 'val' as its value), offers the split node to
// maybe_set_mem_shadow() for address 'a', and returns the split node.
static __inline__
ShW* split4B_if_not_already(ShW* sh, UInt val, Addr a)
{
   if (ShSplit4B == sh->kind) {
      return sh;
   }

   sh = split4B(sh, val);
   maybe_set_mem_shadow(a, sh);
   return sh;
}

/*--------------------------------------------------------------------*/
/*--- Sanity checking                                              ---*/
/*--------------------------------------------------------------------*/

/* Cheap sanity check.

   The intended check is that nobody has spuriously claimed that the
   first or last 16 pages (64 KB) of address space have become
   accessible.  Failure of that check would not per se indicate an
   internal consistency problem, but it is so likely to that we really
   want to know about it if so.

   The check itself is currently commented out below, so this function
   unconditionally returns True. */
Bool SK_(cheap_sanity_check) ( void )
{
//   if (IS_DISTINGUISHED_SM(primary_map[0])
//       /* kludge: kernel drops a page up at top of address range for
//          magic "optimized syscalls", so we can no longer check the
//          highest page */
//       /* && IS_DISTINGUISHED_SM(primary_map[65535]) */
//      )
      return True;
//   else
//      return False;
}

/* Expensive sanity check:  returns True iff every one of the
   SEC_MAP_WORDS entries of the distinguished secondary map still holds
   the SpUnknownMem special, i.e. nobody has changed that map. */
Bool SK_(expensive_sanity_check) ( void )
{
   Int word = 0;

   while (word < SEC_MAP_WORDS) {
      if (RX_(specials)[SpUnknownMem]
          != distinguished_secondary_map.shadow[word]) {
         return False;
      }
      word++;
   }

   return True;
}

/*--------------------------------------------------------------------*/

// True for the node kinds that the value-matching macros skip:  ShCond,
// and every kind in the half-open range [SpHeap, SpLast).
static __inline__ Bool ignored_kind(ShKind kind)
{
   if (kind == ShCond) {
      return True;
   }
   return (SpHeap <= kind && kind < SpLast);
}

// True if the 4-byte shadow 'shadow4' is consistent with the word stored
// at address 'a':  the shadow is lazy, or its kind is one that
// ignored_kind() accepts, or its .val equals the UInt read from 'a'.
// 'a' is dereferenced only when the first two tests are both false.
#define shadow4_val_matches_mem(a, shadow4) \
   (is_lazy(shadow4)                        \
    || ignored_kind((shadow4)->kind)        \
    || (shadow4)->val == * (UInt*)(a))

// True if the 4-byte shadow 'shadow4' is consistent with arch register
// 'arch':  the shadow must not be lazy, and either its kind is one that
// ignored_kind() accepts or its .val equals VG_(get_archreg)(arch).
// Note the asymmetry with the memory version above:  here a lazy shadow
// never matches.
#define shadow4_val_matches_arch(arch, shadow4)    \
   (!is_lazy(shadow4) &&                           \
     (ignored_kind((shadow4)->kind)                \
   || (shadow4)->val == VG_(get_archreg)((arch))))

// Useful for debugging, when assertions fail.
//
// Prints, between two separator lines, the real value 'val' and then the
// shadow 'sh' (pretty-printed) together with its recorded sh->val.  The
// print depth option is temporarily raised to 20 and restored before
// returning.
static void print_shadow_and_realval(ShW* sh, UInt val)
{
   UInt tmp = RX_(clo_print_depth);
   RX_(clo_print_depth) = 20;   // always give good depth for debugging
   VG_(printf)("=====================\n");
   VG_(printf)("val:         %p\n", val);
   // This format has no conversions, so it takes no arguments (a stray
   // 'val' used to be passed here).
   VG_(printf)("sh->val: ");
   RX_(pp_ShW)(sh);
   VG_(printf)(" (%p)\n", sh->val);
   VG_(printf)("=====================\n");
   RX_(clo_print_depth) = tmp;  // restore original depth
}

// Debugging aid:  for each of the 8 arch registers, prints its real value
// followed by its shadow (pretty-printed) and the shadow's recorded value.
static __attribute__ ((unused))
void dump_archregs(void) 
{
   int reg = 0;

   while (reg < 8) {
      ShW* shadow = (ShW*)VG_(get_shadow_archreg)(reg);

      VG_(printf)("reg %d: val=%8p, shad=", reg, VG_(get_archreg)(reg));
      RX_(pp_ShW)(shadow);
      VG_(printf)(" (%p)\n", shadow->val);
      reg++;
   }
}

// Checks each of the 8 arch registers against its shadow, and panics
// (after printing the register, the shadow and the real value) on the
// first mismatch.  'instr' is only used in the diagnostic message.
static __attribute__ ((regparm(1)))
void check_archregs(UInt instr)
{
   Int i;

   for (i = 0; i < 8; i++) { 
      ShW* arch_shadow = (ShW*)VG_(get_shadow_archreg)(i);

      // Historical note:  in the scheduler, when the thread state is
      // saved, the arch regs are filled with the junk value 0xDEADBEEF,
      // and a test excluding that value used to be made here.
      // [I don't think that was the right explanation, and the problem
      //  was something else that has now been fixed]
      if (shadow4_val_matches_arch(i, arch_shadow)) {
         continue;
      }

      VG_(printf)("\nbefore instr #%d, reg %s\n", instr, nameIReg(4,i));
      print_shadow_and_realval(arch_shadow, VG_(get_archreg)(i));
      VG_(skin_panic)("archreg sh->val doesn't match reality");
   }
}

// Walks every secondary map other than the distinguished one and checks
// each shadow word against the word in real memory at the corresponding
// address.  Panics (after printing the address, the shadow and the real
// value) on the first mismatch.  'bb' is only used in the diagnostic.
static __attribute__ ((regparm(1)))
void check_mem(UInt bb)
{
   Int i, j;

   for (i = 0; i < 65536; i++) {
      if (primary_map[i] == & distinguished_secondary_map) {
         continue;
      }
      for (j = 0; j < SEC_MAP_WORDS; j++) {
         // Do the address arithmetic in Addr, not Int:  65536*i overflows
         // a signed 32-bit int once i reaches 32768.
         Addr a  = (Addr)65536 * (Addr)i + (Addr)4 * (Addr)j;
         ShW* sh = primary_map[i]->shadow[j];

         if (! shadow4_val_matches_mem(a, sh)) {
            VG_(printf)("--\nbb: %d, a = %p\n", bb, a);
            print_shadow_and_realval(sh, * (UInt*)a);
            VG_(skin_panic)("mem sh->val doesn't match reality");
         }
      }
   }
}

// Sanity-checks a single shadow node.
//
// Depending on sh->kind, a fixed set of predicates over sh's fields
// (.extra, .argc, .size/.val, the sizes/kinds of its args, and the
// .rewritten/.graphed/.n_parents bookkeeping) is evaluated, and their
// conjunction is returned.  'building' is consulted only for SpError
// nodes (via XERROR), which are therefore sane only when 'building' is
// true.  An unrecognised kind prints the node and panics.
//
// All the helper macros below are local to this function and are
// #undef'd again at its end.
Bool RX_(sane_ShW)(ShW* sh, Bool building)
{
   // --- predicates on sh->extra ---
#  define EXTRA0     (sh->extra == 0)
#  define EXTRA1     (sh->extra <= 1)
#  define EXTRA3     (sh->extra <= 3)
#  define EXTRAk     (ConstError < sh->extra && sh->extra < ConstLast)
#  define EXTRAcc    (sh->extra <= CondAlways)
#  define EXTRAb     (sh->extra == True || sh->extra == False)
#  define EXTRAne0   (sh->extra != 0)
#  define EXTRAa     (AllocError < sh->extra && sh->extra < AllocLast)

   // --- predicates on sh->argc ---
#  define ARGC0      (sh->argc == 0)
#  define ARGC1      (sh->argc == 1)
#  define ARGC2      (sh->argc == 2)
#  define ARGC3      (sh->argc == 3)
#  define ARGC23     (ARGC2 || ARGC3)
#  define ARGC4      (sh->argc == 4)
#  define ARGCgt0    (sh->argc >  0)
#  define ARGCgt1    (sh->argc >  1)

   // --- predicates on sh->size (and, for the small sizes, on sh->val
   //     fitting within that size) ---
#  define SIZEv0     (sh->size == Sz0 && sh->val == 0)
#  define SIZE0      (sh->size == Sz0)
#  define SIZE4      (sh->size == Sz4)
#  define SIZE2      (sh->size == Sz2 && val_within_size2_limits(sh->val))
#  define SIZE1      (sh->size == Sz1 && val_within_size1_limits(sh->val))
#  define SIZEi      (SIZE4 || SIZE2 || SIZE1)
#  define SIZE42     (SIZE4 || SIZE2)
#  define SIZECC     (sh->size == Sz1 && (sh->val == 0 || sh->val == 1))

   // "Relaxed equality":  a == b is required only for nodes that have not
   // been rewritten.  Note that the macro arguments are not parenthesised,
   // so only simple operands should be passed.
#  define REQ(a,b)   (a == b || sh->rewritten)

   // --- predicates on the sizes/kinds of sh->argv[] ---
   // These index sh->argv[] directly;  in the table below each is paired
   // with an ARGCn predicate that is evaluated first (short-circuit &&).

// Rewriting removes widens, which can cause an expected 2-byte or 4-byte
// sized thing to be smaller.  1-byte things can't change, though.
#  define EQ_SZ_ARGS1   (REQ(sh->size, sh->argv[0]->size))
#  define EQ_SZ_ARGS2 (EQ_SZ_ARGS1 && REQ(sh->argv[0]->size, sh->argv[1]->size))
#  define EQ_SZ_ARGS3 (EQ_SZ_ARGS2 && REQ(sh->argv[1]->size, sh->argv[2]->size))
#  define SND_ARG_SZ1   (REQ(sh->size,sh->argv[0]->size) && \
                         Sz1      == sh->argv[1]->size)
#  define ASHxD         (Sz1      == sh->argv[0]->size && \
                         REQ(sh->size, sh->argv[1]->size) && \
                         REQ(sh->size, sh->argv[2]->size))
#  define TWO_B_ARGS    (sh->argv[0]->size == Sz1 && \
                         sh->argv[1]->size == Sz1)
#  define FOUR_B_ARGS   (TWO_B_ARGS && \
                         sh->argv[2]->size == Sz1 && \
                         sh->argv[3]->size == Sz1)
#  define TWO_W_ARGS    (REQ(sh->argv[0]->size,Sz2) && \
                         REQ(sh->argv[1]->size,Sz2))
#  define SYSCALL_ARG   (sh->argv[0]->kind == ShSyscall)

#  define UNSPLIT_ARG   (sh->argv[0]->kind != ShSplit4B && \
                         sh->argv[0]->kind != ShSplit2W && \
                         sh->argv[0]->kind != ShSplit2B)

   // AREADW is defined but currently unused:  the ShReadW row of the table
   // below has it commented out in favour of True.
#  define AREADW        (REQ(sh->argv[0]->size, Sz4) && UNSPLIT_ARG)
#  define AREADB        ((REQ(sh->argv[0]->size, Sz4) || \
                          REQ(sh->argv[0]->size, Sz2)) && \
                         UNSPLIT_ARG)

#  define ONE_WB_ARG    (REQ(sh->argv[0]->size,Sz2) || sh->argv[0]->size == Sz1)
#  define FN_ARG        (sh->argv[0]->kind == ShFn)
#  define ALEA2         (REQ(sh->argv[0]->size,sh->argv[2]->size) &&   \
                         sh->argv[2]->size == sh->argv[3]->size)

   // NOTE(review): ASTRING and ASYSCALL (GCC statement expressions) are
   // currently vacuous.  In both, the loop body "b == b && (...)" is a
   // comparison whose result is discarded, so 'b' is never updated and the
   // macro always evaluates to True.  The body also inspects sh->argv[0]
   // on every iteration rather than sh->argv[i].  The intent was presumably
   // "b = b && (sh->argv[i]->kind == ...)", but enabling that could reject
   // nodes that are accepted today -- confirm the intended invariant
   // before changing it.
#  define ASTRING       ({Int i; Bool b = True;                            \
                          for (i = 0; i < sh->argc; i++)                   \
                             b == b && (sh->argv[0]->kind == ShConst ||    \
                                        sh->argv[0]->kind == ShSysMem);    \
                          b;                                               \
                        })
#  define ASYSCALL      ({Int i; Bool b = True;                            \
                          for (i = RX_(syscall_info)[sh->extra].argc;      \
                               i < sh->argc; i++)                          \
                             b == b && (sh->argv[0]->kind == ShChunk ||    \
                                        sh->argv[0]->kind == ShString);    \
                          b;                                               \
                        })
                       


   // --- predicates on the bookkeeping fields ---
   // PARENTS: a rewritten node must not have n_parents == PZero.
   // REWR:    a graphed node must have been rewritten, unless rewriting
   //          is switched off (RX_(clo_rewrite) false).
#  define PARENTS    (sh->rewritten ? sh->n_parents != PZero : True)
#  define REWR       (sh->graphed   ? sh->rewritten || !RX_(clo_rewrite) : True)
#  define GRAPHED    True     // nothing to check(?)


   // Per-category combinations of the above.  XFN omits REWR;  XERROR
   // additionally requires 'building'.
#  define XSPECIAL   (PARENTS && REWR && GRAPHED            )
#  define XERROR     (PARENTS && REWR && GRAPHED && building)
#  define XNORMAL    (PARENTS && REWR && GRAPHED            )
#  define XFN        (PARENTS         && GRAPHED            )
#  define XFNOUT     (PARENTS && REWR && GRAPHED            )

// p() and h() each expand to a complete 'return' statement, which is why
// the case labels below need no 'break'.
#define p(a,b,c,d)   return (a && b && c && d);
#define h(a,b,c,d,e) return (a && b && c && d && e);

   // XXX: nothing about .val field!

   switch (sh->kind) {
   /* Fields checked:  extra     argc   size+val  others*/
   case SpError:     p(EXTRA0,   ARGC0,   SIZE4,  XERROR);
   // All the remaining specials share one check (deliberate fall-through).
   case SpStartup:
   case SpHeap:
   case SpZeroHeap:
   case SpBrk:
   case SpUnknownMem:
   case SpUnknownReg:
   case SpSyscallReg:
   case SpSignalReg :
   case SpPthOpReg:
   case SpClReqReg:
   case SpRep:
   case SpEspEbp:    p(EXTRA0,   ARGC0,   SIZE4,  XSPECIAL);
      
   /* Fields checked:  extra     argc   size+val  argv[]...    other */

   // .extra == ConstOrigin;  True: because no args
   case ShConst:     h(EXTRAk,   ARGC0,   SIZEi,  True,        XNORMAL);

   case ShAdd:       h(EXTRA0,   ARGC2,   SIZEi,  EQ_SZ_ARGS2, XNORMAL);
   case ShSub:       h(EXTRA0,   ARGC2,   SIZEi,  EQ_SZ_ARGS2, XNORMAL);
   case ShMul:       h(EXTRA0,   ARGC2,   SIZE42, EQ_SZ_ARGS2, XNORMAL);
   case ShMul64High: h(EXTRA0,   ARGC2,   SIZEi,  EQ_SZ_ARGS2, XNORMAL);
   case ShMul64:     h(EXTRA0,   ARGC2,   SIZEi,  EQ_SZ_ARGS2, XNORMAL);

   // ARGC23: because can be rewritten
   case ShDiv64:     h(EXTRA0,   ARGC23,  SIZEi,  EQ_SZ_ARGS3, XNORMAL);
   case ShMod64:     h(EXTRA0,   ARGC23,  SIZEi,  EQ_SZ_ARGS3, XNORMAL);

   case ShAnd:       h(EXTRA0,   ARGC2,   SIZEi,  EQ_SZ_ARGS2, XNORMAL);
   case ShOr:        h(EXTRA0,   ARGC2,   SIZEi,  EQ_SZ_ARGS2, XNORMAL);
   case ShXor:       h(EXTRA0,   ARGC2,   SIZEi,  EQ_SZ_ARGS2, XNORMAL);
   case ShShl:       h(EXTRA0,   ARGC2,   SIZEi,  SND_ARG_SZ1, XNORMAL);
   case ShShr:       h(EXTRA0,   ARGC2,   SIZEi,  SND_ARG_SZ1, XNORMAL);
   case ShSar:       h(EXTRA0,   ARGC2,   SIZEi,  SND_ARG_SZ1, XNORMAL);
   case ShRol:       h(EXTRA0,   ARGC2,   SIZEi,  SND_ARG_SZ1, XNORMAL);
   case ShRor:       h(EXTRA0,   ARGC2,   SIZEi,  SND_ARG_SZ1, XNORMAL);
   case ShInc:       h(EXTRA0,   ARGC1,   SIZEi,  EQ_SZ_ARGS1, XNORMAL);
   case ShDec:       h(EXTRA0,   ARGC1,   SIZEi,  EQ_SZ_ARGS1, XNORMAL);
   case ShNot:       h(EXTRA0,   ARGC1,   SIZEi,  EQ_SZ_ARGS1, XNORMAL);
   case ShNeg:       h(EXTRA0,   ARGC1,   SIZEi,  EQ_SZ_ARGS1, XNORMAL);
   case ShLea1Add:   h(EXTRA0,   ARGC2,   SIZE4,  EQ_SZ_ARGS2/*??*/, XNORMAL);
   case ShLea2Add:   h(EXTRA0,   ARGC4,   SIZE4,  ALEA2/*??*/, XNORMAL);

   // .extra == Condcode
   case ShCond:      h(EXTRAcc,  ARGC1,   SIZE0,  True/*XXX*/, XNORMAL);
   case ShCC2Val:    h(EXTRAcc,  ARGC1,   SIZECC, True/*XXX*/, XNORMAL);

   case ShWiden:     h(EXTRA0,   ARGC1,   SIZE4,  ONE_WB_ARG,  XNORMAL);
   case ShShrd:      h(EXTRA0,   ARGC3,   SIZE42, ASHxD,       XNORMAL);
   case ShShld:      h(EXTRA0,   ARGC3,   SIZE42, ASHxD,       XNORMAL);

   // .extra == syscall number
   case ShSyscall:   h(EXTRAne0, True,    SIZE4,  ASYSCALL,    XNORMAL);

   // .extra == output mem block offset (saturates at 255)
   // SIZEi: because of the way I split4B ShSysMem nodes
   case ShSysMem:    h(True,     ARGC1,   SIZEi,  SYSCALL_ARG, XNORMAL);

   case ShSplit4B:   h(EXTRA0,   ARGC4,   SIZE4,  FOUR_B_ARGS, XNORMAL);
   case ShSplit2W:   h(EXTRA0,   ARGC2,   SIZE4,  TWO_W_ARGS,  XNORMAL);
   case ShSplit2B:   h(EXTRA0,   ARGC2,   SIZE2,  TWO_B_ARGS,  XNORMAL);

   // .extra == byte #
   case ShReadB:     h(EXTRA3,   ARGC1,   SIZE1,  AREADB,      XNORMAL);

   // .extra == word #
   case ShReadW:     h(EXTRA1,   ARGC1,   SIZE2,  /*AREADW*/True,      XNORMAL);

   case ShExit:      h(EXTRA0,   ARGC1,   SIZE4,  EQ_SZ_ARGS1, XNORMAL);

   // .extra == has replace a syscall in syscalls_made?
   case ShFn:        h(EXTRAb,   ARGC1,   SIZE0,  True,        XFN    );

   case ShFnOut:     h(EXTRA0,   ARGC1,   True,   FN_ARG,      XFNOUT );

   // ARGCgt1: I don't create chunks with a single arg
   case ShChunk:     h(EXTRA0,   ARGCgt1, SIZE0,  True,        XNORMAL);

   case ShString:    h(EXTRA0,   ARGCgt0, SIZE0,  ASTRING,     XNORMAL);
   case ShInd:       h(EXTRA0,   ARGC1,   SIZEv0, True,        XNORMAL);

   // True: because no args
   case ShAlloc:     h(EXTRAa,   ARGC0,   SIZE4,  True,        XNORMAL);

   // No value is returned on this path;  presumably VG_(skin_panic) does
   // not return -- TODO confirm.
   default: RX_(ppp_ShW)("unrecognised node", sh);
            VG_(skin_panic)("sane_ShW: bad node kind");
   }

#  undef EXTRA0
#  undef EXTRA1
#  undef EXTRA3
#  undef EXTRAk
#  undef EXTRAcc
#  undef EXTRAb
#  undef EXTRAne0
#  undef EXTRAa

#  undef ARGC0
#  undef ARGC1
#  undef ARGC2
#  undef ARGC3
#  undef ARGC23
#  undef ARGC4
#  undef ARGCgt0
#  undef ARGCgt1

#  undef SIZEv0
#  undef SIZE0
#  undef SIZE4
#  undef SIZE2
#  undef SIZE1
#  undef SIZEi
#  undef SIZE42
#  undef SIZECC

#  undef REQ

#  undef EQ_SZ_ARGS1
#  undef EQ_SZ_ARGS2
#  undef EQ_SZ_ARGS3
#  undef SND_ARG_SZ1
#  undef ASHxD
#  undef TWO_B_ARGS
#  undef FOUR_B_ARGS
#  undef TWO_W_ARGS
#  undef SYSCALL_ARG
#  undef UNSPLIT_ARG
#  undef AREADW
#  undef AREADB
#  undef ONE_WB_ARG
#  undef FN_ARG
#  undef ALEA2
#  undef ASTRING
#  undef ASYSCALL

#  undef PARENTS
#  undef REWR
#  undef GRAPHED

#  undef XSPECIAL
#  undef XERROR
#  undef XNORMAL
#  undef XFN
#  undef XFNOUT

#  undef p
#  undef h
}

/*--------------------------------------------------------------------*/
/*--- Getting, setting memory                                      ---*/
/*--------------------------------------------------------------------*/

// Trace output for a memory shadow get/set.  Prints nothing unless
// RX_(clo_actions) is set and 'sh' is not lazy.  's' names the action;
// when it starts with 'g' the arrow printed is "-->", otherwise "<--".
// 'a' is the address, 'sh' the shadow and 'val' the value to display.
static void m_actions(Char* s, Addr a, ShW* sh, UInt val)
{
   if (!RX_(clo_actions) || is_lazy(sh)) {
      return;
   }

   VG_(printf)("=  %s %p %s ", s, a, ( 'g' == s[0] ? "-->" : "<--" ) );
   if (is_lazy(sh)) {
      VG_(printf)("(lazy)");
   } else {
      RX_(pp_ShW)(sh);
   }
   VG_(printf)(" (%p)\n", val);
}

// Trace output for an arch register shadow get/set.  Prints nothing
// unless RX_(clo_actions) is set.  's' names the action;  when it starts
// with 'g' the arrow printed is "-->", otherwise "<--".  The register is
// named via nameIReg(size, archreg);  a lazy shadow is shown as "(lazy)".
static 
void a_actions(Char* s, UInt archreg, UInt size, ShW* sh, UInt val)
{
   if (!RX_(clo_actions)) {
      return;
   }

   VG_(printf)("=  %s %s %s ", s, nameIReg(size, archreg),
               ( 'g' == s[0] ? "-->" : "<--" ) );
   if (is_lazy(sh)) {
      VG_(printf)("(lazy)");
   } else {
      RX_(pp_ShW)(sh);
   }
   VG_(printf)(" (%p)\n", val);
}

// Returns byte number 'byte' of 'val', where byte 0 is the least
// significant one.  'byte' must be small enough that byte*8 is less than
// the width of UInt (0..3 assuming a 32-bit UInt).
//
// The value is shifted down first and then masked, so that everything is
// done in unsigned arithmetic;  building the mask as (0xff << shift)
// would left-shift a signed int into its sign bit for byte 3.
static __inline__
UInt extract_byte(UInt byte, UInt val)
{
   UInt shift_by = byte * 8;
   return (val >> shift_by) & 0xff;
}

// Maps a 1-byte arch register number to the index of the byte it
// occupies:  0 for R_AL/R_CL/R_DL/R_BL, 1 for R_AH/R_CH/R_DH/R_BH.
// Any other value causes a skin panic.
static UInt archreg_to_byte(UInt archreg)
{
   if (R_AL == archreg || R_CL == archreg ||
       R_DL == archreg || R_BL == archreg) {
      return 0;
   }
   if (R_AH == archreg || R_CH == archreg ||
       R_DH == archreg || R_BH == archreg) {
      return 1;
   }
   VG_(skin_panic)("archreg_to_byte: bad archreg");
}

/*--------------------------------------------------------------------*/

// Returns the shadow for the 4 bytes of memory starting at 'a'.
//
// Let (L) be the shadow for align4(a) and (M) the shadow for align4(a)+4,
// and let n = a - align4(a):
//   n = 0: return L
//   n = 1: (L) = sp4B(LB0,LB1,LB2,LB3), 
//          (M) = sp4B(MB0,MB1,MB2,MB3), return sp4B(LB1,LB2,LB3,MB0)
//   n = 2: likewise,                    return sp4B(LB2,LB3,MB0,MB1)
//   n = 3: likewise,                    return sp4B(LB3,MB0,MB1,MB2)
// In the misaligned cases (L) and (M) are first split into bytes if they
// are not already.
//
static __attribute__((regparm(1)))
ShW* get_mem_shadow4 ( Addr a )
{
   ShW* shadow4;

   if ( IS_ALIGNED4_ADDR(a) ) {
      shadow4 = RX_(get_mem_shadow)(a);

   } else {
      ShW* argv[4];
      Addr aA   = align4(a);
      Addr aB   = aA + 4;
      ShW* sh4A = RX_(get_mem_shadow)(aA);
      ShW* sh4B = RX_(get_mem_shadow)(aB);
      UInt valA = * (UInt*)aA;
      UInt valB = * (UInt*)aB;
      UInt skip = a - aA;        // number of bytes of (L) lying before 'a'
      UInt k;

      sh4A = split4B_if_not_already(sh4A, valA, aA);
      sh4B = split4B_if_not_already(sh4B, valB, aB);

      // Three ways to misalign a 4-byte access
      if (skip < 1 || skip > 3) {
         sk_assert(7==11);
      }

      // Result byte k is byte (skip+k) of the 8-byte run (L),(M).
      for (k = 0; k < 4; k++) {
         UInt src = skip + k;
         argv[k] = ( src < 4 ? sh4A->argv[src] : sh4B->argv[src - 4] );
      }
      shadow4 = build( ShSplit4B, 0, Sz4, * (UInt*)a, 4, argv );
   }

   m_actions("getm4", a, shadow4, shadow4->val);

   // Spelled-out form of sk_assert(shadow4_val_matches_mem(a, shadow4)),
   // so that the offending shadow is printed before the assertion fires.
   if (!shadow4_val_matches_mem(a, shadow4)) {
      VG_(printf)("a = %p\n", a);
      print_shadow_and_realval(shadow4, * (UInt*)a);
      sk_assert(False);
   }
   return shadow4;
}

// Returns the shadow for the 2 bytes of memory starting at 'a'.
//
// Let (L) be the shadow for align4(a) and (M) the shadow for align4(a)+4,
// and let n = a - align4(a):
//   n = 0: (L) = sp2W(LW0, LW1), return LW0
//   n = 2: (L) = sp2W(LW0, LW1), return LW1
//
//   n = 1: (L) = sp4B(LB0,LB1,LB2,LB3), return sp2B(LB1,LB2)
//   n = 3: (L) = sp4B(LB0,LB1,LB2,LB3), 
//          (M) = sp4B(MB0,MB1,MB2,MB3), return sp2B(LB3,MB0)
//
static __attribute__((regparm (1)))
ShW* get_mem_shadow2 ( Addr a )
{
   ShW* shadow4 = RX_(get_mem_shadow)(a);
   ShW* shadow2;

   if (IS_ALIGNED2_ADDR(a)) {
      // Make sure the word is in sp2W form, then pick the half.
      if (ShSplit2W != shadow4->kind) {
         shadow4 = split2W( shadow4, * (UInt*)(align4(a)) );
         maybe_set_mem_shadow(a, shadow4);
      }
      shadow2 = shadow4->argv[(a>>1) % 2];

   } else {
      ShW* pair[2];
      Addr aA   = align4(a);
      ShW* sh4A = RX_(get_mem_shadow)(aA);
      UInt valA = * (UInt*)aA;

      // Definitely need to split sh4A
      sh4A = split4B_if_not_already(sh4A, valA, aA);

      // Two ways to misalign a 2-byte access
      if (1 == a - aA) {
         // within 4-byte word
         pair[0] = sh4A->argv[1];
         pair[1] = sh4A->argv[2];

      } else if (3 == a - aA) {
         // straddles two 4-byte words
         Addr aB   = aA + 4;
         ShW* sh4B = RX_(get_mem_shadow)(aB);
         UInt valB = * (UInt*)aB;

         sh4B = split4B_if_not_already(sh4B, valB, aB);
         pair[0] = sh4A->argv[3];
         pair[1] = sh4B->argv[0];

      } else {
         sk_assert(8==12);
      }
      shadow2 = build( ShSplit2B, 0, Sz2, * (UShort*)a, 2, pair );
   }

   m_actions("getm2", a, shadow2, shadow2->val);
   sk_assert(shadow4_val_matches_mem(align4(a), shadow4));
   return shadow2;
}


// Returns the shadow for the single byte of memory at 'a'.
//
// If (L) is the shadow for align4(a) and n = a % 4:
//   (L) = sp4B(LB0,LB1,LB2,LB3), return LBn
// (L) is split into bytes first if it is not already.
//
static __attribute__((regparm (1)))
ShW* get_mem_shadow1 ( Addr a )
{
   ShW* shadow4 = RX_(get_mem_shadow)(a);
   ShW* shadow1;
   UInt word;

   sk_assert(shadow4_val_matches_mem(align4(a), shadow4));

   word    = * (UInt*)(align4(a));
   shadow4 = split4B_if_not_already(shadow4, word, a);
   shadow1 = shadow4->argv[a % 4];

   m_actions("getm1", a, shadow1, shadow1->val);
   return shadow1;
}

/*--------------------------------------------------------------------*/

// Returns a fresh sp4B node for 'sh' with value 'val', whose args the
// caller may then overwrite.
//
// When setting bytes within words:
// a. if 'sh' is not split, splitting it makes a new node, which can have
//    its args safely updated because it cannot be shared.
// b. if 'sh' is already split, a copy is made;  otherwise, if the node
//    were shared, all the dependents would see changes when we updated
//    the args, creating inconsistencies.
static ShW* split_or_copy4B(ShW* sh, UInt val)
{
   if (ShSplit4B == sh->kind) {
      // case (b): copy
      return build( ShSplit4B, 0, Sz4, val, 4, sh->argv );
   }
   // case (a): split
   return split4B(sh, val);
}

// Returns a fresh sp2W node for 'sh' with value 'val':  a copy (same two
// args) if 'sh' is already an ShSplit2W, otherwise the result of
// split2W().
static ShW* split_or_copy2W(ShW* sh, UInt val)
{
   if (ShSplit2W == sh->kind) {
      return build( ShSplit2W, 0, Sz4, val, 2, sh->argv );
   }
   return split2W(sh, val);
}

// Installs 'shadow4' as the shadow of the 4 bytes of memory at 'a'.  For
// a misaligned 'a' the shadow is split into bytes, which are spliced into
// fresh sp4B nodes for the two 4-byte words that 'a' overlaps.
//
// Assert the shadow val matches the real at the end;  can't do it at the
// start because the shadow hasn't yet been updated.
static __attribute__((regparm(2)))
void set_mem_shadow4 ( Addr a, ShW* shadow4 )
{
   // If memory was just munmapped, don't cause a seg fault!  For
   // SpUnknownMem 'a' is not dereferenced and a filler value is shown.
   m_actions("setm4", a, shadow4,
             ( shadow4->kind == SpUnknownMem ? 0x55555555 : * (UInt*)a ));

   if (IS_ALIGNED4_ADDR(a)) {
      set_mem_shadow(a, shadow4);

      // Spelled-out form of sk_assert(shadow4_val_matches_mem(a, shadow4)),
      // so that the offending shadow is printed first.
      if (!shadow4_val_matches_mem(a, shadow4)) {
         VG_(printf)("a = %p\n", a);
         print_shadow_and_realval(shadow4, * (UInt*)a);
         sk_assert(False);
      }

   } else {
      Addr aA   = align4(a);
      Addr aB   = aA + 4;
      ShW* sh4A = RX_(get_mem_shadow)(aA);
      ShW* sh4B = RX_(get_mem_shadow)(aB);
      UInt val  = * (UInt*)a;
      UInt valA = * (UInt*)aA;
      UInt valB = * (UInt*)aB;
      UInt skip = a - aA;        // number of bytes of word A lying before 'a'
      UInt k;

      if (ShSplit4B != shadow4->kind) {
         sk_assert(val == shadow4->val);
         shadow4 = split4B(shadow4, val);
      }
      sh4A = split_or_copy4B(sh4A, valA);
      sh4B = split_or_copy4B(sh4B, valB);

      // Three ways to misalign a 4-byte access
      if (skip < 1 || skip > 3) {
         sk_assert(9==13);
      }

      // Byte k of the new shadow lands at byte (skip+k) of the 8-byte run
      // made up of word A followed by word B.
      for (k = 0; k < 4; k++) {
         UInt dst = skip + k;
         if (dst < 4) {
            sh4A->argv[dst]     = shadow4->argv[k];
         } else {
            sh4B->argv[dst - 4] = shadow4->argv[k];
         }
      }

      m_actions("setm4.misalignedA", aA, sh4A, valA);
      m_actions("setm4.misalignedB", aB, sh4B, valB);
      set_mem_shadow(aA, sh4A);
      set_mem_shadow(aB, sh4B);
      sk_assert(shadow4_val_matches_mem(aA, sh4A));
      sk_assert(shadow4_val_matches_mem(aB, sh4B));
   }
}

// Installs 'shadow2' as the shadow of the 2 bytes of memory at 'a'.
// 'shadow2' must already carry the 2-byte value found at 'a'.
//
// For a 2-aligned 'a' the enclosing word's shadow is put in (a fresh)
// sp2W form and the relevant half replaced.  Otherwise 'shadow2' is split
// into bytes, which are spliced into fresh sp4B nodes for the one or two
// 4-byte words that 'a' overlaps.
static __attribute__((regparm(2)))
void set_mem_shadow2 ( Addr a, ShW* shadow2 )
{
   ShW*   shadow4 = RX_(get_mem_shadow)(a);
   UInt   new4val = * (UInt*)(align4(a));
   UShort new2val = * (UShort*)a;

   m_actions("setm2", a, shadow2, new2val);
   sk_assert(shadow2->val == new2val);

   if (IS_ALIGNED2_ADDR(a)) {
      shadow4 = split_or_copy2W(shadow4, new4val);
      shadow4->argv[(a>>1) % 2] = shadow2;
      set_mem_shadow(a, shadow4);
      sk_assert(shadow4_val_matches_mem(align4(a), shadow4));

   } else {
      Addr aA   = align4(a);
      ShW* sh4A = RX_(get_mem_shadow)(aA);
      UInt valA = * (UInt*)aA;
      UInt val  = * (UShort*)a;

      if (ShSplit2B != shadow2->kind) {
         shadow2 = split2B(shadow2, val);
      }
      sh4A = split_or_copy4B(sh4A, valA);

      // Two ways to misalign a 2-byte access
      if (1 == a - aA) {
         // within 4-byte word
         sh4A->argv[1] = shadow2->argv[0];
         sh4A->argv[2] = shadow2->argv[1];
         m_actions("setm2.misaligned", aA, sh4A, valA);
         set_mem_shadow(aA, sh4A);
         sk_assert(shadow4_val_matches_mem(aA, sh4A));

      } else if (3 == a - aA) {
         // straddles two 4-byte words
         Addr aB   = align4(a) + 4;
         ShW* sh4B = RX_(get_mem_shadow)(aB);
         UInt valB = * (UInt*)aB;

         sh4B = split_or_copy4B(sh4B, valB);
         sh4A->argv[3] = shadow2->argv[0];
         sh4B->argv[0] = shadow2->argv[1];
         m_actions("setm2.misalignedA", aA, sh4A, valA);
         m_actions("setm2.misalignedB", aB, sh4B, valB);
         set_mem_shadow(aA, sh4A);
         set_mem_shadow(aB, sh4B);
         sk_assert(shadow4_val_matches_mem(aA, sh4A));
         sk_assert(shadow4_val_matches_mem(aB, sh4B));

      } else {
         sk_assert(10==14);
      }
   }
}

// Installs 'shadow1' as the shadow of the single byte of memory at 'a'.
// 'shadow1' must already carry the byte value found at 'a'.  The
// enclosing word's shadow is replaced by a fresh sp4B node in which byte
// number (a % 4) is 'shadow1'.
static __attribute__((regparm(2)))
void set_mem_shadow1 ( Addr a, ShW* shadow1 )
{
   ShW*  shadow4 = RX_(get_mem_shadow)(a);
   UInt  new4val = * (UInt*)(align4(a));
   UChar new1val = * (UChar*)a;

   m_actions("setm1", a, shadow1, new1val);
   sk_assert(shadow1->val == new1val);

   // Split (or copy), then overwrite the one byte-shadow concerned.
   shadow4 = split_or_copy4B(shadow4, new4val);
   shadow4->argv[a % 4] = shadow1;
   set_mem_shadow(a, shadow4);

   sk_assert(shadow4_val_matches_mem(align4(a), shadow4));
}

/*--------------------------------------------------------------------*/

// Returns the 4-byte shadow of arch register 'archreg'.  If the shadow
// does not match the register's real value, both are printed and an
// assertion fails.
static __attribute__((regparm (1)))
ShW* get_archreg_shadow4 ( UInt archreg )
{
   ShW* shadow4 = (ShW*)VG_(get_shadow_archreg)(archreg);
   UInt realval;

   if (!shadow4_val_matches_arch(archreg, shadow4)) {
      print_shadow_and_realval(shadow4, VG_(get_archreg)(archreg));
      sk_assert(False);
   }

   realval = VG_(get_archreg)(archreg);
   a_actions("geta4", archreg, 4, shadow4, realval);
   return shadow4;
}

// Returns the shadow of the low 16 bits of arch register 'archreg'.  The
// register's shadow is put in sp2W form first if it is not already (the
// split node being offered to maybe_set_shadow_archreg()), and its
// argv[0] is returned.
static __attribute__((regparm (1)))
ShW* get_archreg_shadow2 ( UInt archreg )
{
   ShW* shadow4 = (ShW*)VG_(get_shadow_archreg)(archreg);
   ShW* shadow2;
   UInt low16;

   sk_assert(shadow4_val_matches_arch(archreg, shadow4));

   if (ShSplit2W != shadow4->kind) {
      shadow4 = split2W(shadow4, VG_(get_archreg)(archreg));
      maybe_set_shadow_archreg(archreg, shadow4);
   }
   shadow2 = shadow4->argv[0];

   low16 = VG_(get_archreg)(archreg) & 0xffff;   // low 16 bits
   a_actions("geta2", archreg, 2, shadow2, low16);

   return shadow2;
}

// Returns the shadow of the 1-byte arch register 'archreg1'.
//
// The containing 4-byte register's shadow is split if it's not already,
// and that split node is stored (this storing isn't mandatory;  it might
// result in some unnecessarily split nodes in the graph, but also avoids
// repeated splits if >1 byte accesses occur).
static __attribute__((regparm (1)))
ShW* get_archreg_shadow1 ( UInt archreg1 )
{
   // Register numbers 4 and above are mapped down by 4 onto the 4-byte
   // register that contains them.
   UInt archreg4 = (archreg1 >= 4 ? archreg1 - 4 : archreg1);
   ShW* shadow4  = (ShW*)VG_(get_shadow_archreg)(archreg4);
   ShW* shadow1;
   UInt byte     = archreg_to_byte(archreg1);
   UInt byteval;

   // Check that the archreg's shadow and value match
   sk_assert(shadow4_val_matches_arch(archreg4, shadow4));

   if (ShSplit4B != shadow4->kind) {
      shadow4 = split4B(shadow4, VG_(get_archreg)(archreg4));
      maybe_set_shadow_archreg(archreg4, shadow4);
   }
   shadow1 = shadow4->argv[byte];

   byteval = extract_byte(byte, VG_(get_archreg)(archreg4));
   a_actions("geta1", archreg1, 1, shadow1, byteval);

   return shadow1;
}

// Installs 'shadow4' as the shadow of arch register 'archreg'.  The
// shadow must match the register's real value on entry.  When
// RX_(clo_ignore_esp_ebp) is set, %esp and %ebp always get the SpEspEbp
// special instead of the shadow passed in.
static __attribute__((regparm(2)))
void set_archreg_shadow4 ( UInt archreg, ShW* shadow4 )
{
   sk_assert(shadow4_val_matches_arch(archreg, shadow4));

   // Ignore %esp/%ebp if --ignore-esp-ebp=yes
   if (R_ESP == archreg || R_EBP == archreg) {
      if (RX_(clo_ignore_esp_ebp)) {
         shadow4 = RX_(specials)[SpEspEbp];
      }
   }

   a_actions("seta4", archreg, 4, shadow4, shadow4->val);
   VG_(set_shadow_archreg) (archreg, (UInt)shadow4);

   sk_assert(shadow4_val_matches_arch(archreg, shadow4));
}
   
// Installs 'shadow2' as the shadow of the low 16 bits of arch register
// 'archreg'.  'shadow2' must already carry the register's low 16 bits.
// The register's shadow is replaced by a fresh sp2W node whose argv[0]
// is 'shadow2'.
static __attribute__((regparm(2)))
void set_archreg_shadow2 ( UInt archreg, ShW* shadow2 )
{
   ShW* shadow4 = (ShW*)VG_(get_shadow_archreg)(archreg);
   UInt new4val = VG_(get_archreg)(archreg);
   UInt new2val = new4val & 0xffff;

   a_actions("seta2", archreg, 2, shadow2, new2val);
   sk_assert(shadow2->val == new2val);

   // Split (or copy), then overwrite the low-half shadow.
   shadow4 = split_or_copy2W(shadow4, new4val);
   shadow4->argv[0] = shadow2;
   VG_(set_shadow_archreg) (archreg, (UInt)shadow4);

   sk_assert(shadow4_val_matches_arch(archreg, shadow4));
}
   
// Installs 'shadow1' as the shadow of the 1-byte arch register
// 'archreg1'.
//
// Idea:  split the containing register's shadow (or copy it if already
// split), and replace the appropriate sub-shadow with the new shadow1.
//
// Note: depending on which 8-bit register is used, it will be the 0th or
//       1st byte we deal with.
static __attribute__((regparm(2)))
void set_archreg_shadow1 ( UInt archreg1, ShW* shadow1 )
{
   // archreg1 holds a value in the range R_AL..R_BH (0..7).
   // But the 1-byte regs are physically entirely in E[ABCD]X, so we map
   // everything back to that range;  this is what archreg4 holds.  Phew.
   UInt archreg4 = (archreg1 >= 4 ? archreg1 - 4 : archreg1);
   ShW* shadow4  = (ShW*)VG_(get_shadow_archreg)(archreg4);
   UInt byte     = archreg_to_byte(archreg1);
   UInt new4val  = VG_(get_archreg)(archreg4);
   UInt new1val  = extract_byte(byte, new4val);

   a_actions("seta1", archreg1, 1, shadow1, new1val);

   // The byte-shadow about to be put in must match the new byte value in
   // the real archreg.
   sk_assert(shadow1->val == new1val);

   shadow4 = split_or_copy4B(shadow4, new4val);
   shadow4->argv[byte] = shadow1;
   VG_(set_shadow_archreg) (archreg4, (UInt)shadow4);

   // The byte-shadow must have been inserted properly, i.e. the full
   // shadow matches the value of the full real archreg.
   sk_assert(shadow4_val_matches_arch(archreg4, shadow4));
}

/*--------------------------------------------------------------------*/
/*--- Syscall handling                                             ---*/
/*--------------------------------------------------------------------*/

// Singly-linked list of recorded syscalls:  head and tail pointers.
// record_syscall() appends at the tail;  both are NULL while the list is
// empty.
SyscallMade* RX_(syscalls_made)      = NULL;
SyscallMade* RX_(syscalls_made_last) = NULL;

static void print_syscall_info(void)
{
   Int i;
   for (i = 0; i < MAX_NUM_SYSCALLS; i++)
      if (RX_(syscall_info)[i].argc != 0 || RX_(syscall_info)[i].name != NULL)
         VG_(printf)("%3u: %s(%u)\n", i,
                     RX_(syscall_info)[i].name, RX_(syscall_info)[i].argc);
}

/* Fill in RX_(syscall_info) for every syscall the skin knows about.
   To add a new syscall, just add the appropriate line below */
static void init_syscall_info(void)
{
   Int i;
   /* Using NULL below for the 0-arg ones is a hack;  the array ends up
      containing two NULL entries, and only the first one is seen */

   /* s(n, name, min, max, argnames...):  populate entry 'n'.  The argument
      names go into a static NULL-terminated array;  argc is obtained by
      counting entries up to the first NULL (the empty-bodied 'for' loop),
      and min/max are stored as min_mem_reads/max_mem_reads. */
#define s(n, s, min, max, v...)                 \
   {                                            \
      static Char* array[] = { v, NULL };       \
      sk_assert(n < MAX_NUM_SYSCALLS);          \
      for (i = 0; array[i] != NULL; i++);       \
      RX_(syscall_info)[n].name = s;            \
      RX_(syscall_info)[n].argc = i;            \
      RX_(syscall_info)[n].argv = array;        \
      RX_(syscall_info)[n].min_mem_reads = min; \
      RX_(syscall_info)[n].max_mem_reads = max; \
   }

   // XXX: add something to indicate argblock, eg. number of argblock params
   // (most will be zero)
   
   // num  name  min/max_mem_reads argv (use NULL if no args)
   s(  2, "fork",         0,0, NULL);
   s(  3, "read",         0,0, "fd", "buf", "count");
   s(  4, "write",        1,1, "fd", "buf", "count");
   s(  5, "open",         1,1, "path", "flags");
   s(  6, "close",        0,0, "fd");
   s( 10, "unlink",       1,1, "path");
   
   // execve() does have memory args, but the core doesn't do pre_mem_read
   // for them, hence the 0...
   s( 11, "execve",       0,0, "filename", "argv", "envp");
   s( 12, "chdir",        1,1, "path");
   s( 13, "time",         0,0, "t");
   s( 15, "chmod",        1,1, "path", "mode");
   s( 19, "lseek",        0,0, "fd", "offset", "whence");
   s( 20, "getpid",       0,0, NULL);
   s( 27, "alarm",        0,0, "seconds");
   s( 30, "utime",        1,2, "filename", "buf");
   s( 33, "access",       1,1, "path", "mode");
   s( 40, "rmdir",        1,1, "path");
   s( 41, "dup",          0,0, "oldfd");
   s( 42, "pipe",         0,0, "fd_arr");
   s( 43, "times",        0,0, "buf");
   s( 45, "brk",          0,0, "end_data_seg");
   s( 54, "ioctl",        0,2, "fd", "request", "argp");
   s( 57, "setpgid",      0,0, "pid", "pgid");
   s( 63, "dup2",         0,0, "oldfd", "newfd");
   s( 64, "getppid",      0,0, NULL);
   s( 65, "getpgrp",      0,0, NULL);
   s( 77, "getrusage",    0,0, "who", "usage");
   s( 78, "gettimeofday", 0,0, "tv", "tz");

   // uses argblock 
   s( 82, "select",       0,4, "n", "readfds", "writefds",
                               "exceptfds", "timeout");
   s( 85, "readlink",     1,1, "path", "buf", "bufsiz");

   // uses argblock
   s( 90, "mmap",         0,0, "start", "len", "prot", "flags", "fd", "offset");
   s(192, "mmap2",        0,0, "start", "len", "prot", "flags", "fd", "offset");

   s( 91, "munmap",       0,0, "start", "len");
   s(102, "socketcall",   1,7, "call", "args");
   s(104, "setitimer",    1,1, "which", "value", "ovalue");
   s(114, "wait4",        0,0, "pid", "status", "options", "rusage");
   s(122, "uname",        0,0, "buf");
   s(125, "mprotect",     0,0, "addr", "len", "prot");
   s(140, "llseek",       0,0, "fd", "offset_hi", "offset_lo", "result", 
                               "whence");
   s(145, "readv",        1,1, "fd", "vector", "count");
   s(146, "writev",       1,1, "fd", "vector", "count");
   s(168, "poll",         0,0, "ufds", "nfds", "timeout");
   s(174, "sigaction",    0,1, "signum", "act", "oldact");
   s(175, "sigprocmask",  0,1, "how", "set", "oldset");
   s(183, "getcwd",       0,0, "buf", "size");
   s(191, "ugetrlimit",   0,0, "resource", "rlim");
   s(195, "stat64",       1,1, "filename", "buf");
   s(196, "lstat64",      1,1, "filename", "buf");
   s(197, "fstat64",      0,0, "fd", "buf");
   s(199, "getuid32",     0,0, NULL);
   s(200, "getgid32",     0,0, NULL);
   s(201, "geteuid32",    0,0, NULL);
   s(202, "getegid32",    0,0, NULL);
   s(205, "getgroups32",  0,0, "size", "list");
   s(212, "chown32",      1,1, "path", "owner", "group");
   s(220, "getdents64",   0,0, "fd", "dirp", "count");
   s(221, "fcntl64",      0,0, "fd", "cmd");

   s(243, "set_thread_area",0,1, "u_info");
#undef s
}

// Remember some things about the syscall that we don't use until
// SK_(post_syscall) is called...

// Capacity of mem_reads[];  SK_(pre_syscall) also sizes its argv with it.
#define MAX_READ_MEMS  10

// Most distinct mem blocks read by a syscall is 5, I think;  10 should be safe
// This array is null-terminated.
// Filled by rx_pre_mem_read_common(), drained and cleared by
// SK_(post_syscall)().
static ShW* mem_reads[MAX_READ_MEMS];
static UInt mem_reads_n = 0;

// XXX: do any syscalls do more than one write?
// INVALID_TEMPREG is used here as the "nothing recorded" sentinel;  both
// are reset to it at the end of SK_(post_syscall)().
static Addr     last_mem_write_addr = INVALID_TEMPREG;
static UInt     last_mem_write_len  = INVALID_TEMPREG;

// Number of the syscall in progress (set in SK_(pre_syscall)), or -1 when
// none is (reset in SK_(post_syscall)).
static Int      current_syscall = -1;

// Capacity of the scratch arrays in rx_pre_mem_read_common().
#define  MAX_SYS_ARGC   1000

// This may be called 0, 1 or several times for each syscall, depending on
// how many memory args it has.  For each memory arg, we record all the
// relevant information, including the actual words referenced.
//
// part:      who is reading;  anything but Vg_CoreSysCall is ignored.
// is_asciiz: True if [a, a+len) is a string;  forces byte-granularity.
// a, len:    the block being read.
//
// The result (a single node, or a ShChunk wrapping several) is appended to
// mem_reads[] for SK_(post_syscall)() to pick up.
static 
void rx_pre_mem_read_common(CorePart part, Bool is_asciiz, Addr a, UInt len)
{
   Int  i, j;
   Addr a2, node_addr, first_addr;
   UInt wordc, argc, str_len;
   UInt first_byte, last_byte;
   Int  strings;
   ShW* sh;
   UInt first_kind;
   ShW *first_child;

   // static not to preserve value, just so the stack isn't inflated too much
   static ShW* tmp_argv [MAX_SYS_ARGC];
   static ShW* tmp_argv2[MAX_SYS_ARGC];
   static ShW* str_argv [MAX_SYS_ARGC];

   if (Vg_CoreSysCall != part) {
      sk_assert(-1 == current_syscall);
      return;
   }

   sk_assert(0 < current_syscall && current_syscall < 256);

   // mmap() does a memory read for the args, but we don't care about that.
   if (90 == current_syscall)
      return;

   // Calculate number of words, allowing for misalignment
   // wordc*4 is never smaller, and at most 6 bytes bigger, than len
   //
   // An empty block (eg. an empty asciiz string) touches no words.  Without
   // this special case, 'a + len - 1' falls in the word before 'a' when 'a'
   // is aligned, and the unsigned subtraction wraps to a huge word count
   // (assuming align4() rounds down -- the "at most 6 bytes bigger" claim
   // above implies it does).
   argc  = 0;
   wordc = (0 == len)
         ? 0
         : ((align4(a + len - 1) - align4(a)) >> 2) + 1;
   sk_assert(len <= wordc*4 && wordc*4 <= len + 6);

   first_byte = a & 0x3;
   last_byte  = (a + len - 1) & 0x3;
   
   //------------------------------------------------------------------------
   // Go through, splitting where necessary, and copying in longs, words
   // or bytes into tmp_argv as appropriate -- this skips uninteresting
   // split nodes, at the price of having different sized mem args.
   for (i = 0, a2 = align4(a);  i < wordc;  i++, a2 += 4) {

      ShW* shadow4         = get_mem_shadow4(a2);
      Bool first_word      = (0       == i);
      Bool last_word       = (wordc-1 == i);
      UInt this_first_byte = ( first_word ? first_byte : 0 );
      UInt this_last_byte  = (  last_word ?  last_byte : 3 );

      // If is_asciiz, or incomplete first/last word, split into bytes if
      // not already split
      if ( is_asciiz ||
           (first_word && 0 != first_byte) || (last_word  && 3 != last_byte) ) 
      {
         shadow4 = split4B_if_not_already(shadow4, shadow4->val, a2);
      }

      // Copy in;  skip split nodes if present
      if (ShSplit4B == shadow4->kind) {
         for (j = this_first_byte; j <= this_last_byte; j++) {

            ShW* shj = shadow4->argv[j];

            // Should never have sp4(Bn(sp4(...)), ...)
            if (ShReadB == shj->kind)
               sk_assert(ShSplit4B != shj->argv[0]->kind);

            // Check *before* storing:  a split word adds up to 4 entries,
            // so a check only at the end of the iteration is too late.
            sk_assert(argc < MAX_SYS_ARGC);
            tmp_argv[argc] = shj;
            argc++;
         }

      } else if (ShSplit2W == shadow4->kind) {
         VG_(skin_panic)("can't handle ShSplit2W case yet, sorry");

      } else if (ShSplit2B == shadow4->kind) {
         VG_(skin_panic)("ShSplit2B??");

      } else {
         sk_assert(argc < MAX_SYS_ARGC);
         tmp_argv[argc] = shadow4;
         argc++;
      }
      sk_assert(argc < MAX_SYS_ARGC);
   }

   //------------------------------------------------------------------------
   // Now go through tmp_argv, collecting up similar and adjacent char nodes
   // into ShStrings, copying the results into tmp_argv2.
   //
   // Possible strings:
   //    (all constants [can be different types, eg .c and .s]
   // OR  all mem outputs from the same syscall)
   // AND all size 1
   j           = 0;
   strings     = 0;
   node_addr   = a;

   for (i = 0; i < argc; ) {

      sh = tmp_argv[i];

      if (1 == sh->size && (ShConst == sh->kind || ShSysMem == sh->kind)) {
         first_kind     = sh->kind;
         // Only ShSysMem runs compare children, so don't touch argv[0] of
         // a constant (it may not be valid).
         first_child    = (ShSysMem == first_kind ? sh->argv[0] : NULL);
         first_addr     = node_addr;

         strings++;

         // interesting case -- gather up into a string
         str_len = 0;
         do {
            str_argv[str_len] = sh;
            str_len++;
            i++;
            node_addr += sh->size;

            // Test the index before fetching the next node, so we never
            // read past the entries filled in above.
            if (i >= argc)                                              break;
            sh = tmp_argv[i];

            if (1 != sh->size)                                          break;
            if (sh->kind != first_kind)                                 break;

            if ((ShSysMem == first_kind && sh->argv[0] != first_child)) break;

         } while (True);

         // Put string node into tmp_argv2.
         tmp_argv2[j] = build( ShString, /*extra*/0, Sz0, first_addr, str_len, 
                                    str_argv );

      } else {
         // boring case, copy into tmp_argv2
         tmp_argv2[j] = sh;
         node_addr += sh->size;
         i++;
      }
      j++;
   }

   //------------------------------------------------------------------------
   // Now build the final result node.  If a single string, can elide ShChunk
   sk_assert(mem_reads_n < MAX_READ_MEMS);      // don't overrun mem_reads[]
   if (strings <= 1 && 1 == j) {
      mem_reads[mem_reads_n] = tmp_argv2[0];
   } else {
      mem_reads[mem_reads_n] = build(ShChunk, /*extra*/0, Sz0, a, j, tmp_argv2);
   }
   mem_reads_n++;
}

// XXX: add a sanity check when doing pre_ShW...

// Core callback:  a block of known length is about to be read.  'tid' and
// 's' are unused.
static 
void rx_pre_mem_read(CorePart part, ThreadId tid, Char* s, Addr a, UInt len)
{
   const Bool is_asciiz = False;
   rx_pre_mem_read_common(part, is_asciiz, a, len);
}

// Core callback:  a NUL-terminated string at 'a' is about to be read.
// 'tid' and 's' are unused.
static 
void rx_pre_mem_read_asciiz(CorePart part, ThreadId tid, Char* s, Addr a)
{
   // The terminating '\0' is deliberately left out of the length -- we
   // don't want to print it on the graph.
   UInt n_chars = VG_(strlen)( (Char*)a );

   rx_pre_mem_read_common(part, /*is_asciiz*/True, a, n_chars);
}

// Pre-write callback:  intentionally does nothing (all parameters are
// unused).  Writes are recorded afterwards, in rx_post_mem_write().
static void rx_pre_mem_write(CorePart part, ThreadId tid, Char* s, 
                             Addr a, UInt len)
{
}

// Remember the block [a, a+len) written while a syscall is in progress, so
// SK_(post_syscall)() can re-shadow it.  Writes outside a syscall are
// ignored.  Only one block per syscall is supported.
static void rx_post_mem_write(Addr a, UInt len)
{
   if (-1 == current_syscall)
      return;

   // AFAIK, no syscalls write more than one block of memory;  check this
   if (INVALID_TEMPREG != last_mem_write_addr) {
      VG_(printf)("sys# = %d\n", current_syscall);
      VG_(skin_panic)("can't handle syscalls that write more than one block (eg. readv()), sorry");
   }
   sk_assert(INVALID_TEMPREG == last_mem_write_len);

   last_mem_write_len  = len;
   last_mem_write_addr = a;
}

// A new mapping is recorded exactly like a syscall's memory output.  The
// permission flags (rr, ww, xx) are ignored.
static void rx_new_mem_mmap ( Addr a, UInt len, Bool rr, Bool ww, Bool xx )
{
   last_mem_write_len  = len;
   last_mem_write_addr = a;
}

// When a signal is delivered, %esp will be changed for the signal stack
// frame, so mark it as unknown [now handled correctly by core]
//
// Currently a no-op:  the only statement is commented out and all
// parameters are unused.
static void rx_pre_deliver_signal ( ThreadId tid, Int sigNo, Bool alt_stack )
{
   //VG_(set_thread_shadow_archreg)(tid, R_ESP,
   //                               (UInt)RX_(specials)[SpSignalReg]);
}

// After a signal is delivered, %esp will change again (revert?  not sure),
// so mark it as unknown.  [now handled correctly by the core]
//
// %edx also changes due to a SET_EDX in VG_(deliver_signals)() [not handled
// correctly by core?]
//
// %eax also changes (XXX)
// - in push_signal_frame, it has a syscall num (eg. 4)
// - by the time the signal returns, it has changed to a stack location
// - during the return frame pop, it gets set back to the syscall num
//
// Currently a no-op:  every statement is commented out and both parameters
// are unused.
static void rx_post_deliver_signal ( ThreadId tid, Int sigNo )
{
   //VG_(set_thread_shadow_archreg)(tid, R_EAX, (UInt)RX_(specials)[SpSignalReg]);
   //VG_(set_thread_shadow_archreg)(tid, R_EDX, (UInt)RX_(specials)[SpSignalReg]);
   //VG_(set_thread_shadow_archreg)(tid, R_ESP, (UInt)RX_(specials)[SpSignalReg]);
}

// Called before each syscall.  Collects the shadows of the syscall's
// register (or, for mmap, arg-block) arguments into a freshly allocated
// array and records the syscall number in 'current_syscall'.
//
// Returns the array;  it is handed back to SK_(post_syscall)() as
// 'pre_result', which appends any memory-read nodes and frees it.
// 'is_blocking' is unused.
void* SK_(pre_syscall) ( ThreadId tid, UInt syscallno, Bool is_blocking )
{
   ShW** argv;
   UInt  argc;

   // syscallno indexes RX_(syscall_info);  don't read outside the table.
   sk_assert(syscallno < MAX_NUM_SYSCALLS);
   argc = RX_(syscall_info)[syscallno].argc;

   // An entry that init_syscall_info() never filled in has no argv at all.
   if (argc == 0 && NULL == RX_(syscall_info)[syscallno].argv) {
      VG_(printf)("syscallno = %d", syscallno);
      VG_(skin_panic)("unhandled syscall");
   }

   // Nb: 6 is for up to 6 syscall args.
   // We use VG_(malloc) here so it can later be freed.
   argv = VG_(malloc)((6+MAX_READ_MEMS) * sizeof(ShW*));

   current_syscall = syscallno;

   if (90 == syscallno) {  // mmap
      // Get contents of ebx to find args block, then extract shadow args
      Addr* arg_block = (Addr*)VG_(get_thread_archreg)(tid, R_EBX);          
      argv[0] = get_mem_shadow4((Addr) & arg_block[0]);
      argv[1] = get_mem_shadow4((Addr) & arg_block[1]);
      argv[2] = get_mem_shadow4((Addr) & arg_block[2]);
      argv[3] = get_mem_shadow4((Addr) & arg_block[3]);
      argv[4] = get_mem_shadow4((Addr) & arg_block[4]);
      argv[5] = get_mem_shadow4((Addr) & arg_block[5]);

   } else if (82 == syscallno) { // select -- only 5 args, done with arg block?
      // XXX: should put an argblock param in SyscallInfo
      VG_(skin_panic)("can't handle select()");

   } else {
      // Register args, in order:  ebx, ecx, edx, esi, edi, ebp.
      if (argc >= 1) argv[0] = (ShW*)VG_(get_thread_shadow_archreg)(tid, R_EBX);
      if (argc >= 2) argv[1] = (ShW*)VG_(get_thread_shadow_archreg)(tid, R_ECX);
      if (argc >= 3) argv[2] = (ShW*)VG_(get_thread_shadow_archreg)(tid, R_EDX);
      if (argc >= 4) argv[3] = (ShW*)VG_(get_thread_shadow_archreg)(tid, R_ESI);
      if (argc >= 5) argv[4] = (ShW*)VG_(get_thread_shadow_archreg)(tid, R_EDI);
      if (argc >= 6) {
                     sk_assert(syscallno == 192);  // mmap2
                     // The sixth arg goes in slot 5;  storing it in slot 4
                     // would clobber the fifth arg and leave slot 5
                     // uninitialised.
                     argv[5] = (ShW*)VG_(get_thread_shadow_archreg)(tid, R_EBP);
      }
      if (argc >= 7) {
         VG_(printf)("syscallno = %d\n", syscallno);
         VG_(printf)("argc      = %d\n", argc);
         VG_(printf)("x         = %d\n", RX_(syscall_info)[syscallno].argc);
         print_syscall_info();
         VG_(skin_panic)("> 6-arg syscall??");
      }
   }
   return argv;
}

// Append 'node' to the RX_(syscalls_made) list.  Syscalls made before
// main() is entered are not recorded.
//
// Nb: the list cells come from rx_malloc(), so they can't be freed.  Not
// really a problem since they are preserved until the end;  only those
// removed by folding could be freed.
static void record_syscall(ShW* node) 
{
   SyscallMade* cell;

   if (! entered_main) return;      /* ignore everything before main() */

   cell       = rx_malloc(sizeof(SyscallMade));
   cell->next = NULL;
   cell->node = node;

   if (NULL != RX_(syscalls_made)) {
      RX_(syscalls_made_last)->next = cell;     // hang off the current tail
   } else {
      RX_(syscalls_made) = cell;                // first entry:  also the head
   }
   RX_(syscalls_made_last) = cell;              // either way, the new tail
}

// Called after each syscall.  Builds the syscall's node from the args
// gathered by SK_(pre_syscall)() plus any memory blocks read, makes it the
// shadow of the return value, re-shadows any memory the syscall wrote, and
// records the syscall.  Resets all per-syscall state.
void SK_(post_syscall) ( ThreadId tid, UInt syscallno, void* pre_result,
                         Int res, Bool is_blocking )
{
   // 1. Because KERNEL_DO_SYSCALL overwrites sh_eax with
   //    VG_(written_shadow_reg), we must write sh_eax ourselves after the
   //    syscall is performed, here;  not in SK_(pre_syscall)().
   // 2. And we must write the thread sh_eax, not the baseBlock one.
   SyscallInfo sysinfo  = RX_(syscall_info)[syscallno];
   ShW**       args     = (ShW**)pre_result;
   UInt        n_args   = sysinfo.argc;
   ShW*        sys_node;
   Addr        wstart_a, w_end, cur;
   UInt        w_len;
   UInt        k;

   sk_assert(current_syscall == syscallno);

   //------------------------------------------------------------------------
   // The number of memory blocks read must be within the range the table
   // gives for this syscall.
   sk_assert( sysinfo.min_mem_reads <= mem_reads_n );
   sk_assert( sysinfo.max_mem_reads >= mem_reads_n );

   // Append the memory-read nodes to the args, then reset for next syscall.
   if (0 != mem_reads_n) {
      for (k = 0; k < mem_reads_n; k++) {
         args[n_args] = mem_reads[k];
         n_args++;
      }
      // zero mem_reads just to be safe
      for (k = 0; k < MAX_READ_MEMS; k++) {
         mem_reads[k] = NULL;
      }
      mem_reads_n = 0;
   }

   //------------------------------------------------------------------------
   sys_node = build_syscall(syscallno, res, n_args, args);
   // XXX: the arg array could be made global to save malloc/freeing it
   // repeatedly
   VG_(free)(args);

   // Set the return value shadow
   VG_(set_return_from_syscall_shadow)(tid, (UInt)sys_node);

   //------------------------------------------------------------------------
   // If the syscall wrote memory, point the shadows at the new node.
   // Then reset for next syscall.
   wstart_a = last_mem_write_addr;
   w_len    = last_mem_write_len;
   if (INVALID_TEMPREG != wstart_a && INVALID_TEMPREG != w_len) {

      sk_assert(wstart_a != 0);     // XXX: can happen if program has bug
      sk_assert(IS_ALIGNED4_ADDR(wstart_a));
      // XXX: the length is not checked for 4-alignment

      w_end = wstart_a + w_len;
      for (cur = wstart_a;  cur < w_end;  cur += 4) {

         // XXX: THIS IS BOGUS!  can mmap in a file in a too-big space,
         // and then some of the segment remains untouchable.

         // If ShSysMem is built lazily, can't record the mem block offset,
         // so each word gets its own eagerly-built node.
         UInt offset = cur - wstart_a;
         set_mem_shadow4(cur, build_ShSysMem( /*syscallno*/saturate(offset),
                         * (UInt*)cur, sys_node ));
      }
   }
   last_mem_write_addr = INVALID_TEMPREG;
   last_mem_write_len  = INVALID_TEMPREG;
   //------------------------------------------------------------------------

   record_syscall(sys_node);

   current_syscall = -1;
}

/*--------------------------------------------------------------------*/
/*--- Higher level instrumenters                                   ---*/
/*--------------------------------------------------------------------*/

/* Print an instrumentation trace line (indented with a "***" banner) when
   RX_(clo_instr) is set.
   The test lives *inside* the do/while(0) so the expansion is a single
   statement:  an 'else' following "if (x) INSTR(...);" then binds to the
   caller's 'if', not to the one hidden in the macro. */
#define INSTR(format, args...)                                          \
   do {                                                                 \
      if (RX_(clo_instr)) {                                             \
         VG_(printf)("***                                         ");   \
         VG_(printf)(format , ## args);                                 \
      }                                                                 \
   } while (0)

// Uncomment to make IGNORE() print its message;  otherwise IGNORE()
// expands to nothing.
//#define SHOW_IGNORES

#ifdef SHOW_IGNORES
#define IGNORE(format, args...)  VG_(printf)(format , ## args)
#else
#define IGNORE(format, args...)
#endif

// Emit a call to the set_mem_shadow{1,2,4} helper matching 'size', passing
// the address temp 't_mem' and the shadow of 't_reg'.
static void tempreg__mem(UCodeBlock* cb, UInt size, UInt t_reg, UInt t_mem)
{
   Addr helper;

   INSTR("q%d --> Mset(t%d).%d\n", t_reg, t_mem, size);

   if      (4 == size) helper = (Addr) & set_mem_shadow4;
   else if (2 == size) helper = (Addr) & set_mem_shadow2;
   else if (1 == size) helper = (Addr) & set_mem_shadow1;
   else {
      VG_(printf)("size = %d\n", size);
      VG_(skin_panic)("tempreg__mem: bad size");
   }

   // Two register args, no return value.
   uInstr2(cb, CCALL, 0, TempReg, t_mem, TempReg, SHADOW(t_reg));
   uCCall (cb, helper, 2, 2, /*ret_val*/False);
}

// Emit a call to the get_mem_shadow{1,2,4} helper matching 'size', with
// the address temp 't_mem' as argument;  the result lands in the shadow of
// 't_reg'.  'maybe_useless' only affects the trace output.
static void mem__tempreg(UCodeBlock* cb, UInt size, UInt t_mem, UInt t_reg,
                         Bool maybe_useless)
{
   Addr helper;

   INSTR("Mget(t%d).%d --> q%d %s\n", t_mem, size, t_reg,
               ( maybe_useless ? ", (-?-)" : "" ) );

   if      (4 == size) helper = (Addr) & get_mem_shadow4;
   else if (2 == size) helper = (Addr) & get_mem_shadow2;
   else if (1 == size) helper = (Addr) & get_mem_shadow1;
   else                VG_(skin_panic)("mem__tempreg: bad size");

   // One register arg, with a return value.
   uInstr3(cb, CCALL, 0, TempReg, t_mem, NoValue, 0, TempReg, SHADOW(t_reg));
   uCCall (cb, helper, 1, 1, /*ret_val*/True);
}

// Emit a call to the get_archreg_shadow{1,2,4} helper matching 'size'.
// The register number is loaded into a fresh temp and passed as the
// argument;  the result lands in the shadow of 't_reg'.
static void archreg__tempreg(UCodeBlock* cb, UInt size,
                             UInt archreg, UInt t_reg)
{
   UInt t_regno = newTemp(cb);
   Addr helper;

   INSTR("SHADOW(%s).%d --> q%d\n", nameIReg(size, archreg), size, t_reg);

   if      (4 == size) helper = (Addr) & get_archreg_shadow4;
   else if (2 == size) helper = (Addr) & get_archreg_shadow2;
   else if (1 == size) helper = (Addr) & get_archreg_shadow1;
   else                VG_(skin_panic)("archreg__tempreg: bad size");

   // t_regno := archreg
   uInstr2 (cb, MOV, 4, Literal, 0, TempReg, t_regno);
   uLiteral(cb, archreg ); 

   // SHADOW(t_reg) := helper(t_regno)
   uInstr3 (cb, CCALL, 0, TempReg, t_regno, NoValue, 0, TempReg, SHADOW(t_reg));
   uCCall  (cb, helper, 1, 1, /*ret_val*/True);
}

// Emit a call to the set_archreg_shadow{1,2,4} helper matching 'size'.
// The register number is loaded into a fresh temp;  it and the shadow of
// 't_reg' are the two arguments.
static void tempreg__archreg(UCodeBlock* cb, UInt size, UInt t_reg,
                             UInt archreg)
{
   UInt t_regno = newTemp(cb);
   Addr helper;

   INSTR("q%d --> SHADOW(%s).%d\n", t_reg, nameIReg(size, archreg), size);

   if      (4 == size) helper = (Addr) & set_archreg_shadow4;
   else if (2 == size) helper = (Addr) & set_archreg_shadow2;
   else if (1 == size) helper = (Addr) & set_archreg_shadow1;
   else                VG_(skin_panic)("tempreg__archreg: bad size");

   // t_regno := archreg
   uInstr2 (cb, MOV, 4, Literal, 0, TempReg, t_regno);
   uLiteral(cb, archreg); 

   // helper(t_regno, SHADOW(t_reg)), no return value
   uInstr2 (cb, CCALL, 0, TempReg, t_regno, TempReg, SHADOW(t_reg));
   uCCall  (cb, helper, 2, 2, /*ret_val*/False);
}

/*--------------------------------------------------------------------*/
/*--- Instrumentation-related functions                            ---*/
/*--------------------------------------------------------------------*/

// Bounds-checked VG_(get_instr):  returns NULL when 'i' is not a valid
// instruction index in 'cb_in'.
static UInstr* my_get_instr(UCodeBlock* cb_in, UInt i)
{
   if (i >= VG_(get_num_instrs)(cb_in))
      return NULL;

   return VG_(get_instr)(cb_in, i);
}

/*--------------------------------------------------------------------*/
/*--- Functions called from instrumented code                      ---*/
/*--------------------------------------------------------------------*/

// Print a banner line carrying the number 'bb'.
static __attribute__((regparm(1)))
void print_BB_entry(UInt bb)
{
   UInt bb_no = bb;
   VG_(printf)("%u ========================\n", bb_no);
}

/* Emit a call to the builder 'f', arranging things so the real result is
   available to it (via the global rx_val, loaded from t_val) and the node
   it returns goes in SHADOW(t_val).

   The number of arguments is implied by the tags:  none if tag1 is
   NoValue, one if only tag2 is NoValue, otherwise two.

   Nb: the called helper must use regparm() for all its args!
 */
static void call_builder(UCodeBlock* cb, Addr f, Tag tag1, UInt val1,
                         Tag tag2, UInt val2, UInt t_val)
{
   UInt n_args = 2;

   if (NoValue == tag1) {
      n_args = 0;
   } else if (NoValue == tag2) {
      n_args = 1;
   }

   // rx_val := t_val
   VG_(set_global_var_tempreg)(cb, (Addr) & rx_val, t_val);

#ifdef RX_SLICING
   // rx_instr_addr_runtime := rx_instr_addr_compiletime (a fixed value,
   // known now).
   VG_(set_global_var)(cb, (Addr) & rx_instr_addr_runtime,
                                    rx_instr_addr_compiletime);
#endif

   // SHADOW(t_val) := f(...)
   uInstr3 (cb, CCALL, 0, tag1, val1, tag2, val2, TempReg, SHADOW(t_val));
   uCCall  (cb, f, n_args, n_args, /*ret_val*/True);
}

// Emit code that, at run-time, builds a constant node for the literal
// 'lit32' of size 'lit_size', leaving the node in the register 'q_res'.
// Nothing is returned:  the caller chooses q_res and so already knows where
// the result ends up.
static void const_call_builder(UCodeBlock* cb, UInt lit_size, UInt lit32,
                               UInt q_res)
{
   UInt t_lit = newTemp(cb);
   Addr builder_fn;

   // (The node could instead be built once, at instrumentation time, and
   // its address emitted as a literal;  it is built at run-time here.)
   if      (Sz4 == lit_size) builder_fn = (Addr) & build_code_const4;
   else if (Sz2 == lit_size) builder_fn = (Addr) & build_code_const2;
   else if (Sz1 == lit_size) builder_fn = (Addr) & build_code_const1;
   else                      VG_(skin_panic)("bad lit_size in binop()");

   // t_lit := lit32
   uInstr2 (cb, MOV, 4, Literal, 0, TempReg, t_lit);
   uLiteral(cb, lit32);

   // q_res := builder_fn(t_lit)
   uInstr3 (cb, CCALL, 0, TempReg, t_lit, NoValue,0, TempReg, q_res);
   uCCall  (cb, builder_fn, 1, 1, /*ret_val*/True);
}

// Count the POP instructions from index 'i' up to the next CALLM_E.
// Panics if the block ends before a CALLM_E is found.
static UInt count_POPs(UCodeBlock* cb, UInt i)
{
   UInt    n_seen = 0;
   UInstr* u;

   for (u = my_get_instr(cb, i);  NULL != u;  u = my_get_instr(cb, ++i)) {
      if (CALLM_E == u->opcode)
         return n_seen;
      if (POP == u->opcode)
         n_seen++;
   }
   VG_(skin_panic)("did not find CALLM_E?");
}

/* Choose the node builder(s) for the core helper identified by 'offset'
   (compared against the VGOFF_(helper_*) values), writing them into
   'builders'.  Unsupported helpers cause a panic.

   Need n_POPs for multiplication -- sometimes it gives a double-word
   return, sometimes a single, and the behaviour is different.

   Nb: builders[1] is only written for the two-result cases (64/32
   division, two-POP multiplication, RDTSC);  on every other path it is
   left untouched, so callers must not read it there. */
static void get_helper_builder(UInt offset, UInt n_POPs, Addr builders[2])
{
   // m(name):  does 'offset' identify helper_<name>?  The expansion is
   // parenthesised, which is what lets "else if m(x)" below parse.
#define  m(name)  (VGOFF_(helper_##name) == offset)

   // XXX: could make a macro for all the skin_panics...

   sk_assert(n_POPs <= 2);

   if (m(idiv_64_32) || m(div_64_32)) {
      builders[0] = (Addr) & build_Mod64;      // EDX
      builders[1] = (Addr) & build_Div64;      // EAX
   }
   else if m(idiv_32_16)   VG_(skin_panic)("argh helper_idiv_32_16");
   else if m(div_32_16)    VG_(skin_panic)("argh helper_div_32_16");
   else if m(idiv_16_8)    VG_(skin_panic)("argh helper_idiv_16_8");
   else if m(div_16_8)     VG_(skin_panic)("argh helper_div_16_8");

   else if (m(imul_32_64) || m(mul_32_64)) {
      if (n_POPs == 2) {
         builders[0] = (Addr) & build_Mul64High;  // EDX
         builders[1] = (Addr) & build_Mul64;      // EAX
      } else {
         // NOTE(review): with a single POP the "EDX" tag below looks like
         // a copy-paste leftover -- confirm which register this is.
         builders[0] = (Addr) & build_Mul64;      // EDX
      }
   }
   else if m(imul_16_32)   VG_(skin_panic)("argh helper_imul_16_32");
   else if m(mul_16_32)    VG_(skin_panic)("argh helper_mul_16_32");
   else if m(imul_8_16)    VG_(skin_panic)("argh helper_imul_8_16");
   else if m(mul_8_16)     VG_(skin_panic)("argh helper_mul_8_16");

   // A zero builder means "no node is built for this helper".
   else if m(CLD)          builders[0] = 0;
   else if m(STD)          builders[0] = 0;
   else if m(get_dirflag)  builders[0] = 0;       // These three seem ok

   else if m(CLC)          VG_(skin_panic)("argh helper_CLC");
   else if m(STC)          VG_(skin_panic)("argh helper_STC");

   else if m(shldl)        builders[0] = (Addr) & build_Shld;
   else if m(shrdl)        builders[0] = (Addr) & build_Shrd;
   else if m(shldw)        VG_(skin_panic)("argh helper_shldw");
   else if m(shrdw)        VG_(skin_panic)("argh helper_shrdw");

   else if m(RDTSC) {
      builders[0] = (Addr) & build_Special;  // EDX
      builders[1] = (Addr) & build_Special;  // EAX
   }
   else if m(CPUID)        VG_(skin_panic)("argh helper_CPUID");

   else if m(bsf)          VG_(skin_panic)("argh helper_bsf");
   else if m(bsr)          VG_(skin_panic)("argh helper_bsr");

   else if m(fstsw_AX)     VG_(skin_panic)("argh helper_fstsw_AX");
   else if m(SAHF)         VG_(skin_panic)("argh helper_SAHF");
   else if m(DAS)          VG_(skin_panic)("argh helper_DAS");
   else if m(DAA)          VG_(skin_panic)("argh helper_DAA");

   else                    VG_(skin_panic)("argh: something else");

#undef m
}

// Emit code to save the shadow in 'q_condcode' into the global
// 'last_condcode_op'.  Does nothing if q_condcode is INVALID_TEMPREG, i.e.
// no operation in this BB used or affected the condcodes.
//
// This lets a Jcc or CC2VAL at the very start of the next BB, eg:
//
//   0: Jzo       $0x4014BAB3  (-rOSZACP)
//   1: JMPo      $0x4014BAAF
//
// know where its value came from.
static void set_last_condcode_op(UCodeBlock* cb, UInt q_condcode)
{
   UInt t_addr;

   if (INVALID_TEMPREG == q_condcode)
      return;

   // t_addr := &last_condcode_op;  *t_addr := q_condcode
   t_addr = newTemp(cb); 
   uInstr2 (cb, MOV,   4, Literal, 0, TempReg, t_addr);
   uLiteral(cb, (UInt) & last_condcode_op);
   uInstr2 (cb, STORE, 4, TempReg, q_condcode, TempReg, t_addr);
   INSTR("set last_condcode <-- q%d\n", q_condcode-1);
}
 
// Return the register holding the current condcode shadow.  If an earlier
// operation in this BB set the condcodes, that is simply 'q_condcode'.
// Otherwise emit code to load the shadow saved by the previous BB (from
// 'last_condcode_op') into a new shadow register and return that.
// Used for JMPs and CC2VAL.
static UInt get_last_condcode_op(UCodeBlock* cb, UInt q_condcode)
{
   UInt t_addr;

   if (INVALID_TEMPREG != q_condcode)
      return q_condcode;

   t_addr     = newTemp(cb); 
   q_condcode = newShadow(cb);

   // t_addr := &last_condcode_op;  q_condcode := *t_addr
   uInstr2 (cb, MOV,   4, Literal, 0, TempReg, t_addr);
   uLiteral(cb, (UInt) & last_condcode_op);
   uInstr2 (cb, LOAD, 4, TempReg, t_addr, TempReg, q_condcode);
   INSTR("get last_condcode --> q%d\n", q_condcode-1);

   return q_condcode;
}


// Instrument a two-operand instruction.  The original instruction is
// copied first;  then a shadow is found (or built) for each operand and a
// call to 'builder' is emitted with (operand2-shadow, operand1-shadow).
//
// 's' is the operator name for trace output;  'lit_size' is the size used
// if operand 1 is a literal;  the can_be_* flags say which operand kinds
// are legal.
//
// NOTE(review): can_be_ArchReg2 is never consulted -- the second operand is
// checked against can_be_ArchReg as well.  Confirm whether that is
// intended.
static void binop(UCodeBlock* cb, UInstr* u, Char* s, UInt lit_size,
                  Addr builder, Bool can_be_Literal, Bool can_be_ArchReg,
                  Bool can_be_ArchReg2) 
{
   UInt q_arg1, q_arg2;
   UInt t_arch_shadow;

   VG_(copy_UInstr)(cb, u);

   // ---- first operand
   if (TempReg == u->tag1) {
      INSTR("q%d %s q%d --> q%d\n", u->val2, s, u->val1, u->val2);
      q_arg1 = SHADOW(u->val1);

   } else if (Literal == u->tag1) {
      sk_assert(can_be_Literal);
      q_arg1 = newShadow(cb);
      INSTR("q%d %s %u --> q%d\n", u->val2, s, u->lit32, u->val2);
      const_call_builder(cb, lit_size, u->lit32, q_arg1);

   } else if (ArchReg == u->tag1) {
      sk_assert(can_be_ArchReg);
      t_arch_shadow = newTemp(cb);
      archreg__tempreg(cb, u->size, u->val1, t_arch_shadow);
      INSTR("q%d %s q%d==SHADOW(%s) --> q%d\n", u->val2, s, t_arch_shadow,
              nameIReg(u->size, u->val1), u->val2);
      q_arg1 = SHADOW(t_arch_shadow);

   } else {
      VG_(skin_panic)("binop: unknown arg type");
   }

   // ---- second operand
   if (TempReg == u->tag2) {
      q_arg2 = SHADOW(u->val2);

   } else if (ArchReg == u->tag2) {
      sk_assert(can_be_ArchReg);
      t_arch_shadow = newTemp(cb);
      archreg__tempreg(cb, u->size, u->val2, t_arch_shadow);
      q_arg2 = SHADOW(t_arch_shadow);

   } else {
      VG_(skin_panic)("binop: unexpected arg type");
   }

   call_builder(cb, builder, TempReg, q_arg2, TempReg, q_arg1, u->val2);
}

// Instrument a one-operand instruction on a TempReg:  copy the original
// instruction, then emit a call to 'builder' with the operand's shadow.
// 's' is the operator name for trace output.
static void unop(UCodeBlock* cb, UInstr* u, Char* s, Addr builder) 
{
   UInt t_operand;

   sk_assert(TempReg == u->tag1);
   VG_(copy_UInstr)(cb, u);

   t_operand = u->val1;
   INSTR("%s(q%d) --> q%d\n", s, t_operand, t_operand);
   call_builder(cb, builder, TempReg, SHADOW(t_operand), NoValue, 0,
                t_operand);
}

// Is the JMP at index 'i' the first jump of the block?  The first jump is
// the 2nd last instruction if there's a Jcc, or the last if not.  Panics if
// 'i' is neither the last nor the 2nd last index.  'u' is unused.
static Bool is_first_jmp_instr(UCodeBlock* cb, UInstr* u, UInt i)
{
   UInt n = VG_(get_num_instrs)(cb);

   if (i == n-2)                                      // 2nd last
      return True;

   if (i == n-1) {                                    // last
      UInstr* before = my_get_instr(cb, i-1);

      if (NULL != before && JMP == before->opcode)
         return False;                                // prev was Jcc
      return True;                                    // prev not Jcc
   }

   VG_(skin_panic)("JMP not last or 2nd last instr?");
}

/*--------------------------------------------------------------------*/
/*--- Replacing malloc() et al                                     ---*/
/*--------------------------------------------------------------------*/

/* Table of live client heap blocks.  It is created in SK_(post_clo_init)()
   with VG_(HT_construct)(), filled with Rx_Chunk nodes by add_Rx_Chunk(),
   and searched by block address in handle_free() and SK_(realloc)(). */
static VgHashTable rx_malloc_list = NULL;

/* Use a small redzone (paranoia) */
UInt VG_(vg_malloc_redzone_szB) = 4;


/* Record a client heap block of `size` bytes starting at `p` by allocating
   an Rx_Chunk for it and inserting that into rx_malloc_list.  The `next`
   field is left for the hash table code to fill in.
   NOTE(review): lookups pass the block address as the key, so the table
   presumably keys on the word following `next`, i.e. `data` -- confirm. */
static void add_Rx_Chunk ( Addr p, UInt size )
{
   Rx_Chunk* chunk;

   chunk       = VG_(malloc)(sizeof(Rx_Chunk));
   chunk->size = size; 
   chunk->data = p;

   VG_(HT_add_node)( rx_malloc_list, (VgHashNode*)chunk );
}

/* Common back end for the allocating client entry points: obtain `size`
   bytes with the requested alignment from VG_(cli_malloc)(), register the
   block in the chunk table, and tell the shadow-memory layer about the new
   heap range (passing `is_zeroed` through).  Returns the block. */
static __inline__
void* alloc_mem ( UInt size, UInt alignment, Bool is_zeroed )
{
   void* block = VG_(cli_malloc)(alignment, size);
   Addr  start = (Addr)block;

   add_Rx_Chunk    ( start, size );
   rx_new_mem_heap ( start, size, is_zeroed );

   return block;
}

/* Replacement for the client's malloc(): allocate `n` uninitialised bytes
   with the default alignment.

   A negative `n` is rejected with a warning and NULL is returned.  Without
   this check the value would be converted to a huge unsigned size on the
   way into alloc_mem(), which would then register and shadow a bogus
   multi-gigabyte block. */
void* SK_(malloc) ( Int n )
{
   if (n < 0) {
      VG_(message)(Vg_UserMsg, "Warning: silly arg (%d) to malloc()", n);
      return NULL;
   }
   return alloc_mem ( n, VG_(clo_alignment), /*is_zeroed*/False );
}

/* Replacement for the client's operator new: allocate `n` uninitialised
   bytes with the default alignment.

   A negative `n` is rejected with a warning and NULL is returned; otherwise
   it would be converted to a huge unsigned size when passed to
   alloc_mem(). */
void* SK_(__builtin_new) ( Int n )
{
   if (n < 0) {
      VG_(message)(Vg_UserMsg, "Warning: silly arg (%d) to __builtin_new()", n);
      return NULL;
   }
   return alloc_mem ( n, VG_(clo_alignment), /*is_zeroed*/False );
}

/* Replacement for the client's operator new[]: allocate `n` uninitialised
   bytes with the default alignment.

   A negative `n` is rejected with a warning and NULL is returned; otherwise
   it would be converted to a huge unsigned size when passed to
   alloc_mem(). */
void* SK_(__builtin_vec_new) ( Int n )
{
   if (n < 0) {
      VG_(message)(Vg_UserMsg,
                   "Warning: silly arg (%d) to __builtin_vec_new()", n);
      return NULL;
   }
   return alloc_mem ( n, VG_(clo_alignment), /*is_zeroed*/False );
}

/* Replacement for the client's memalign(): allocate `n` uninitialised bytes
   aligned to `align`.

   A negative `n` is rejected with a warning and NULL is returned; otherwise
   it would be converted to a huge unsigned size when passed to
   alloc_mem().  `align` is passed through unchecked, as before. */
void* SK_(memalign) ( Int align, Int n )
{
   if (n < 0) {
      VG_(message)(Vg_UserMsg, "Warning: silly arg (%d) to memalign()", n);
      return NULL;
   }
   return alloc_mem ( n, align, /*is_zeroed*/False );
}

/* Replacement for the client's calloc(): allocate nmemb*size1 bytes with
   the default alignment, tell the shadow layer the block is zeroed, and
   clear it.

   Requests with a negative argument, or whose byte count does not fit in
   an Int, are rejected with a warning and NULL is returned.  Previously the
   product could overflow: the block would then be allocated with a wrapped
   size and the zeroing loop would cover the wrong number of bytes (or none
   at all if the wrapped value was negative). */
void* SK_(calloc) ( Int nmemb, Int size1 )
{
   void* p;
   Int  size, i;

   if (nmemb < 0 || size1 < 0) {
      VG_(message)(Vg_UserMsg, "Warning: silly args (%d,%d) to calloc()",
                   nmemb, size1);
      return NULL;
   }

   /* Both operands are non-negative here, so the division cannot trap or
      overflow; it detects a product that would exceed the largest Int. */
   if (size1 > 0 && nmemb > 0x7FFFFFFF / size1) {
      VG_(message)(Vg_UserMsg,
                   "Warning: calloc(%d,%d) request is too large",
                   nmemb, size1);
      return NULL;
   }

   size = nmemb * size1;

   p = alloc_mem ( size, VG_(clo_alignment), /*is_zeroed*/True );
   for (i = 0; i < size; i++)    /* calloc() is zeroed */
      ((UChar*)p)[i] = 0;
   return p;
}

/* Retire one tracked client block.  `prev_chunks_next_ptr` is the link that
   currently points at `rc` (as handed back by the hash table lookup), so the
   chunk is unlinked by overwriting it, with no second search.  The shadow
   layer is then told the heap range is dead, the client memory is released,
   and finally the chunk record itself is freed. */
static
void die_and_free_mem ( Rx_Chunk* rc, Rx_Chunk** prev_chunks_next_ptr )
{
   Addr block = rc->data;
   UInt szB   = rc->size;

   /* Unlink from the malloc list. */
   *prev_chunks_next_ptr = rc->next;

   rx_die_mem_heap( block, szB );

   VG_(cli_free) ( (void*)block );
   VG_(free)     ( rc );
}


/* Common back end for the freeing client entry points.  Looks `p` up in the
   chunk table; a pointer that is not tracked is silently ignored, otherwise
   the block is retired with die_and_free_mem(). */
static __inline__
void handle_free ( void* p )
{
   Rx_Chunk** link_to_chunk;
   Rx_Chunk*  chunk;

   chunk = (Rx_Chunk*)VG_(HT_get_node) ( rx_malloc_list, (UInt)p,
                                         (VgHashNode***)&link_to_chunk );
   if (NULL != chunk) {
      die_and_free_mem ( chunk, link_to_chunk );
   }
}

/* Replacement for the client's free(); forwards to handle_free(), which
   ignores pointers that are not in the chunk table. */
void SK_(free) ( void* p )
{
   handle_free(p);
}

/* Replacement for the client's operator delete; forwards to handle_free(),
   which ignores pointers that are not in the chunk table. */
void SK_(__builtin_delete) ( void* p )
{
   handle_free(p);
}

/* Replacement for the client's operator delete[]; forwards to handle_free(),
   which ignores pointers that are not in the chunk table. */
void SK_(__builtin_vec_delete) ( void* p )
{
   handle_free(p);
}

/* Replacement for the client's realloc().

   - `p` not in the chunk table: returns NULL.
   - Same size: returns `p` unchanged.
   - Smaller: only the recorded size is reduced; the block stays in place.
   - Larger: a new block is allocated, the shadow values and the data bytes
     of the old block are copied into it, the remainder is announced as new
     uninitialised heap, and the old block is retired.  */
void* SK_(realloc) ( void* p, Int new_size )
{
   Rx_Chunk  *chunk;
   Rx_Chunk **link_to_chunk;
   UInt       old_size, n;
   Addr       p_new;

   /* Find the block's record. */
   chunk = (Rx_Chunk*)VG_(HT_get_node) ( rx_malloc_list, (UInt)p,
                                         (VgHashNode***)&link_to_chunk );
   if (NULL == chunk)
      return NULL;

   old_size = chunk->size;

   /* Size unchanged: nothing to do. */
   if (old_size == new_size)
      return p;

   /* Shrinking: keep the block, just remember the smaller size. */
   if (old_size > new_size) {
      chunk->size = new_size;
      return p;
   }

   /* Growing: get new memory. */
   p_new = (Addr)VG_(cli_malloc)(VG_(clo_alignment), new_size);

   /* Shadow state: the first part is carried over from the old block, the
      rest is fresh, uninitialised heap. */
   rx_copy_mem     ( (Addr)p, p_new, old_size );
   rx_new_mem_heap ( p_new+old_size, new_size-old_size, /*inited*/False );

   /* Copy the real data from old to new. */
   for (n = 0; n < old_size; n++)
      ((UChar*)p_new)[n] = ((UChar*)p)[n];

   /* Retire the old block. */
   die_and_free_mem ( chunk, link_to_chunk );

   /* Registering the new block must happen only after the old one has been
      unlinked; otherwise, when both land on the same hash chain, the unlink
      would short out the new block rather than the old one. */
   add_Rx_Chunk ( p_new, new_size );

   return (void*)p_new;
}

// Called after a client-call return value has been written to register
// `reg` of thread `tid`; `f` is the address of the function that was called.
// If `f` is one of this skin's allocation entry points, the register's
// shadow becomes a freshly built ShAlloc node of the matching kind carrying
// the returned value; for any other function the shadow is set to the
// SpClReqReg special.
static void rx_post_reg_write_clientcall ( ThreadId tid, UInt reg, Addr f )
{
   // Allocator entry point --> kind of node to build for it.
   const struct { Addr fn; AllocKind kind; } allocators[] = {
      { (Addr) & SK_(malloc),            AllocMalloc   },
      { (Addr) & SK_(__builtin_new),     AllocNew      },
      { (Addr) & SK_(__builtin_vec_new), AllocVecNew   },
      { (Addr) & SK_(calloc),            AllocCalloc   },
      { (Addr) & SK_(memalign),          AllocMemAlign },
      { (Addr) & SK_(realloc),           AllocRealloc  },
   };
   const UInt n_allocators = sizeof(allocators) / sizeof(allocators[0]);
   UInt k;

   for (k = 0; k < n_allocators; k++) {
      if (f == allocators[k].fn) {
         UInt p  = VG_(get_thread_archreg)(tid, reg);
         ShW* sh = build(ShAlloc, allocators[k].kind, Sz4, p, 0, NULL);
         VG_(set_thread_shadow_archreg) ( tid, reg, (UInt)sh );
         return;
      }
   }

   // Not an allocator.
   VG_(set_thread_shadow_archreg)(tid, reg, (UInt)RX_(specials)[SpClReqReg]);
}

/*--------------------------------------------------------------------*/
/*--- Client requests                                              ---*/
/*--------------------------------------------------------------------*/

/* Handle a client request aimed at this skin.  Returns False if the request
   code in args[0] does not carry the 'R','X' skin tag, or if it does but is
   not recognised (a warning is printed in that case).  The only request
   understood is VG_USERREQ__PRINT_VAR, which passes args[1] to the variable
   dumper/grapher (depending on RX_SLICING), sets *ret to 0 and returns
   True. */
Bool SK_(handle_client_request) ( ThreadId tid, UInt* args, UInt *ret )
{
   UInt request;

   if (!VG_IS_SKIN_USERREQ('R','X',args[0]))
      return False;

   request = args[0];

   if (VG_USERREQ__PRINT_VAR == request) {
      Addr var_ptr = args[1];
#ifdef RX_SLICING
      RX_(dmp_var)(var_ptr);
#else
      RX_(gr_var)(var_ptr);
#endif
      *ret = 0;   // unused
      return True;
   }

   VG_(message)(Vg_UserMsg,
                "Warning: unknown redux client request code %d", request);
   return False; 
}


/*--------------------------------------------------------------------*/
/*--- SK_(instrument)()                                            ---*/
/*--------------------------------------------------------------------*/

// XXX: urgh
extern void VG_(init_tt_tc) ( void );

/* Instrument one basic block of UCode.

   `cb_in` is the block to instrument and `a` the address passed in for it;
   `a` is used for function-name lookups, for trace output and, under
   RX_SLICING, as the starting compile-time instruction address.

   A fresh block is created with VG_(setup_UCodeBlock)().  Each instruction
   of cb_in is then visited in order; most are copied into the new block
   together with extra UCode that propagates shadow temps or calls one of
   the build_* helpers.  NOP and LOCK are not copied.  After the loop cb_in
   is freed and the new block is returned.

   Exception: while main() has not yet been entered and RX_(clo_skip) is
   set, cb_in is returned untouched (and is not freed).

   Side effects at instrumentation time: may set entered_main / exited_main,
   may flush all translations on entry to main() when RX_(clo_skip) is set,
   prints "compiling: <fn>" for blocks at a function entry, and bumps the
   static basic-block counter `bb` for every block that is instrumented. */
UCodeBlock* SK_(instrument)(UCodeBlock* cb_in, Addr a)
{
   UCodeBlock* cb;
   Int         i;
   UInstr     *u;
   // Shadow temp of the most recent flag-setting op seen in this block.
   UInt        q_condcode = INVALID_TEMPREG;

   // Number of blocks instrumented so far; used as the BB id in output.
   static UInt bb = 0;

   // CALLM state
   UInt callm_args[3];
   UInt callm_tags[3];
   Addr callm_builders[2];
   UInt callm_argc   = 0;
   UInt callm_retc   = 0;
   Bool in_callm     = False;

#define FNNAME_LEN   100
   Char  fnname_buf[FNNAME_LEN];

#ifdef RX_SLICING
   rx_instr_addr_compiletime = a;
#endif

   // Detect the first block of main().  Before that point, with --skip
   // (RX_(clo_skip)) blocks are handed back uninstrumented.
   if (! entered_main) {
      if (VG_(get_fnname_if_entry)(a, fnname_buf, FNNAME_LEN) &&
          0 == VG_(strcmp)(fnname_buf, "main")) {

         entered_main = True;
         if (RX_(clo_instr) || RX_(clo_actions)) {
            VG_(printf)("\nSTART OF main()\n");
         }
         if (RX_(clo_skip)) {
            // XXX: nicer way to do this... eg. don't waste the mmap'd memory
            VG_(init_tt_tc)();    // flush all translations
         }
      } else if (RX_(clo_skip)) {
         return cb_in;
      }
   }

   cb = VG_(setup_UCodeBlock)(cb_in);

   if ( VG_(get_fnname_if_entry)(a, fnname_buf, FNNAME_LEN))
   {
      VG_(printf)("compiling: %s\n", fnname_buf);
   }

   if (RX_(clo_instr))
      VG_(printf)("\n-- BB %d(%p) --------------------------------\n", bb, a);

   // Optional per-block prologue: load the BB number into a temp and pass
   // it to the entry printer and/or the full memory sanity check.
   if (RX_(clo_actions) || RX_(clo_instr) || RX_(clo_sanity) >= 2
    || RX_(clo_bb_entry)) {
      UInt tBB = newTemp(cb);
      uInstr2 (cb, MOV, 4, Literal, 0, TempReg, tBB);
      uLiteral(cb, bb);

      /* This prints out the BB number, every time it's entered */
      if (RX_(clo_actions) || RX_(clo_instr) || RX_(clo_bb_entry)) {
         uInstr1 (cb, CCALL, 0, TempReg, tBB);
         uCCall  (cb, (Addr) & print_BB_entry, 1, 1, /*ret_val*/False);
      }
      /* This does a complete sanity check of memory every BB */
      if (RX_(clo_sanity) >= 2) {
         uInstr1 (cb, CCALL, 0, TempReg, tBB);
         uCCall  (cb, (Addr) & check_mem, 1, 1, /*ret_val*/False);
      }
   }

   /*-- Start main loop -------------------------------------------------*/
   for (i = 0; i < VG_(get_num_instrs)(cb_in); i++) {
      u = VG_(get_instr)(cb_in, i);

      if (RX_(clo_instr) && u->opcode != NOP) VG_(pp_UInstr) ( i, u );

      // If --sanity, check archregs after every UCode instruction
      if (RX_(clo_sanity) >= 1) {
         UInt tN = newTemp(cb);
         uInstr2 (cb, MOV, 4, Literal, 0, TempReg, tN);
         uLiteral(cb, i);

         // Sanity check archregs
         uInstr1(cb, CCALL, 0, TempReg, tN);
         uCCall(cb, (Addr) & check_archregs, 1, 1, /*ret_val*/False);
      }

      /*-- Start main switch --------------------------------------------*/
      switch (u->opcode) {
      // Register <-> temp moves: copy the instruction, then move the shadow
      // the same way.
      case GET:
         VG_(copy_UInstr)(cb, u);
         archreg__tempreg(cb, u->size, u->val1, u->val2);
         break;

      case PUT:
         VG_(copy_UInstr)(cb, u);
         tempreg__archreg(cb, u->size, u->val1, u->val2);
         break;

      // Binary arithmetic/logic: all handled by binop(), which differs per
      // opcode only in the trace string, literal size, builder and flags.
      case ADD:
      case ADC: 
         binop(cb, u, "+", u->size, (Addr) & build_Add, True, True, False);
         break;
      case SUB:
      case SBB:
         binop(cb, u, "-", u->size, (Addr) & build_Sub, True, True, False);
         break;
      case MUL:
         binop(cb, u, "*", u->size, (Addr) & build_Mul, True, True, True);
         break;
      case XOR:
         binop(cb, u, "^", u->size, (Addr) & build_Xor, True, True, False);
         break;

      // Nb: any shift/rotate literals are always 1-byte
      case SHL:
         binop(cb, u, "shl", 1, (Addr) & build_Shl, True, False, False);
         break;
      case SHR:
         binop(cb, u, "shr", 1, (Addr) & build_Shr, True, False, False);
         break;
      case SAR:
         binop(cb, u, "sar", 1, (Addr) & build_Sar, True, False, False);
         break;
      case ROL:
         binop(cb, u, "rol", 1, (Addr) & build_Rol, True, False, False);
         break;
      case ROR:
         binop(cb, u, "ror", 1, (Addr) & build_Ror, True, False, False);
         break;

      case AND:
         binop(cb, u, "&&", u->size, (Addr) & build_And, False, False, False);
         break;
      case OR:
         binop(cb, u, "||", u->size, (Addr) & build_Or,  False, False, False);
         break;

      // Unary ops: all handled by unop().
      case INC: unop(cb, u, "++",  (Addr) & build_Inc);  break;
      case DEC: unop(cb, u, "--",  (Addr) & build_Dec);  break;

      case NEG: unop(cb, u, "neg", (Addr) & build_Neg);  break;
      case NOT: unop(cb, u, "~",   (Addr) & build_Not);  break;

      case WIDEN: unop(cb, u, "widen", (Addr) & build_Widen);  break;

#if 0
      case BSWAP:
         VG_(copy_UInstr)(cb, u);
         IGNORE("q%d = bswap(q%d)\n", u->val1, u->val1);
         break;
#endif

      // MOV: a literal source gets a constant node as its shadow; a temp
      // source has its shadow copied to the destination's shadow.
      case MOV:
         VG_(copy_UInstr)(cb, u);
         if (Literal == u->tag1) {
            // XXX: should scan forward to see if t2 is boring
            INSTR("%u --> q%d (-?-)\n", u->lit32, u->val2);
            const_call_builder(cb, u->size, u->lit32, SHADOW(u->val2));
            break;

         } else {
            INSTR("q%d --> q%d\n", u->val1, u->val2);
            uInstr2 (cb, MOV, 4, TempReg, SHADOW(u->val1),
                                 TempReg, SHADOW(u->val2));
         }
         break;

      case CMOV:
         // just copy shadow if condition is true
         // (the shadow CMOV is emitted before the real one and reuses its
         // condition and flag read/write sets)
         uInstr2(cb, CMOV, 4, TempReg, SHADOW(u->val1),
                              TempReg, SHADOW(u->val2));
         uCond(cb, u->cond);
         uFlagsRWU(cb, u->flags_r, u->flags_w, FlagsEmpty);
         VG_(copy_UInstr)(cb, u);
         break;

      case LOAD:
         mem__tempreg(cb, u->size, u->val1, u->val2, True);
         VG_(copy_UInstr)(cb, u);      // must come after mem__tempreg!
         break;

      case STORE:
         VG_(copy_UInstr)(cb, u);
         tempreg__mem(cb, u->size, u->val1, u->val2);
         break;

#if 0
      case FPU_R:
         mem__tempreg(cb, u->val2, u->val1, False);
         VG_(copy_UInstr)(cb, u);
         break;

      case FPU_W:
         tempreg__mem(cb, u->val1, u->val2);
         VG_(copy_UInstr)(cb, u);
         break;
#endif

      // addr = base + displ
      // val2 = val1 + lit32
      case LEA1: {
         UInt tk;
         // XXX: don't build a k() if boring, eg:
         // LEA1L     8(t4), t10
         // LDL       (t10), t14
         //   or
         // LEA1L     -4(t2), t0
         // MOVL      $0x1, t4
         // STL       t4, (t0)
         //
         // Do build a k() if interesting, eg:
         // LEA1L     12(t4), t30
         // STL       t30, (t12)

         VG_(copy_UInstr)(cb, u);
         INSTR("lea1(%u + t%d) --> q%d\n", u->lit32, u->val1, u->val2);
         
         // Arg 2: displacement 
         // (the constant node is built now, at instrumentation time, and
         // its address is loaded into a temp as a literal)
         tk = newTemp(cb);
         uInstr2 (cb, MOV,   4, Literal, 0, TempReg, tk);
         uLiteral(cb, (UInt)build_const(ConstCode, u->size, u->lit32) );

         call_builder(cb, (Addr) & build_Lea1, TempReg, SHADOW(u->val1),
                      TempReg, tk, u->val2);
         break;
      }

      // addr = base + (index * scale)   + displ.
      // val3 = val1 + (val2  * extra4b) + lit32
      case LEA2:
         VG_(copy_UInstr)(cb, u);
         INSTR("lea2(t%d + (t%d)*%d + %d) -> q%d\n",
                 u->val1, u->val2, u->extra4b, u->lit32, u->val3);

         // Scale and displacement are handed to the builder through the
         // globals rx_arg3 / rx_arg4 rather than as call arguments.
         // Arg 3: scale
         VG_(set_global_var)(cb, (UInt) & rx_arg3, 
                       (UInt)build_const(ConstCode, u->size, u->extra4b) );
         // Arg 4: displacement
         VG_(set_global_var)(cb, (UInt) & rx_arg4, 
                       (UInt)build_const(ConstCode, u->size, u->lit32) );
         call_builder(cb, (Addr) & build_Lea2, TempReg, SHADOW(u->val1),
                      TempReg, SHADOW(u->val2), u->val3);
         break;

      case INCEIP:
#ifdef RX_SLICING
         rx_instr_addr_compiletime += u->val1;
#endif
         VG_(copy_UInstr)(cb, u);
         break;
                 

      // Not copied into the instrumented block.
      case NOP: case LOCK: 
         break;

      /*--------------------------------------------------------------*/
      /*--- CALLM and related                                      ---*/
      /*--------------------------------------------------------------*/
      /* Note that instructions that return two results (eg. div/idiv,
         mul/imul) are drawn with two nodes, if both results are used */
      case CALLM_S: 
         VG_(copy_UInstr)(cb, u);
         sk_assert(!in_callm);
         in_callm      = True;
         callm_args[0] = callm_args[1] = callm_args[2] = INVALID_TEMPREG;
         callm_tags[0] = callm_tags[1] = callm_tags[2] = NoValue;
         callm_builders[0] = callm_builders[1] = INVALID_TEMPREG;
         callm_argc    = 0;
         callm_retc    = 0;
         break;

      // Record the pushed temp as the next CALLM argument.  The live code
      // appends in push order; the disabled variant below prepended instead
      // (so that things came out in the right order, eg for div/mod).
      case PUSH:
         VG_(copy_UInstr)(cb, u);
         sk_assert(in_callm);
         // At most 3 args; checked before the arrays are indexed so that a
         // violation asserts instead of writing past their end.
         sk_assert(callm_argc < 3);
#if 0
         callm_args[2] = callm_args[1];
         callm_tags[2] = callm_tags[1];
         callm_args[1] = callm_args[0];
         callm_tags[1] = callm_tags[0];
         callm_args[0] = u->val1;
         callm_tags[0] = TempReg;
#else
         callm_args[callm_argc] = u->val1;
         callm_tags[callm_argc] = TempReg;
#endif
         callm_argc++;
         break;

      case CALLM: {
         Int j, n_POPs;
         sk_assert(in_callm);
         VG_(copy_UInstr)(cb, u);
         // The builders depend on the helper being called and on how many
         // results are POPped after it.
         n_POPs = count_POPs(cb_in, i+1); 
         get_helper_builder(u->val1, n_POPs, callm_builders);
         if (3 == callm_argc) {
            VG_(set_global_var_tempreg)(cb, (Addr) & rx_third_arg, 
                                        SHADOW(callm_args[2]));
         }
         // Copy the args because they can get clobbered by the CCALLs
         // (from here on callm_args[] holds the new shadow temps, not the
         // original temps)
         for (j = 0; j < 3; j++) {
            UInt tN = newShadow(cb);
            if (NoValue != callm_tags[j])
               uInstr2(cb, MOV, 4, TempReg, SHADOW(callm_args[j]), 
                                   TempReg, tN);
            callm_args[j] = tN;
         }
         break;
      }

      case POP:
         sk_assert(in_callm);
         // At most 2 results; checked before callm_builders[] is indexed so
         // that a violation asserts instead of reading past its end.
         sk_assert(callm_retc < 2);
         VG_(copy_UInstr)(cb, u);
         // Build node for the POP'd result
         if (callm_builders[callm_retc] != 0)
            call_builder(cb, callm_builders[callm_retc], callm_tags[0],
                         callm_args[0], callm_tags[1], callm_args[1],
                         u->val1);
         callm_retc++;
         break;

      case CLEAR:
         sk_assert(in_callm);
         VG_(copy_UInstr)(cb, u);
         break;

      case CALLM_E:
         sk_assert(in_callm);
         VG_(copy_UInstr)(cb, u);
         in_callm = False;
         break;
      /*--------------------------------------------------------------*/
      /*--- End of CALLM and related                               ---*/
      /*--------------------------------------------------------------*/

      // CC2VAL: build a node from the condition code and the shadow of the
      // last flag-setting op.
      case CC2VAL: {
         UInt tCC = newTemp(cb);

         VG_(copy_UInstr)(cb, u);
         q_condcode = get_last_condcode_op(cb, q_condcode);
         INSTR("q%d = %s2val() <== q%d\n", 
                     u->val1, VG_(name_UCondcode)(u->cond), q_condcode);

         uInstr2 (cb, MOV, 4, Literal, 0, TempReg, tCC);
         uLiteral(cb, u->cond);
         call_builder(cb, (Addr) & build_cc2val, TempReg, tCC, 
                      TempReg, q_condcode, u->val1);
         break;
      }

      case JMP: {
         // On the block's first JMP, save the current condcode shadow.
         // NOTE(review): presumably so a later block can retrieve it via
         // get_last_condcode_op() -- confirm against those helpers.
         if (is_first_jmp_instr(cb_in, u, i))
            set_last_condcode_op(cb, q_condcode);
                   
         // Conditional jump with --jcc: emit a call to build_Cond with the
         // condition code and the condcode shadow.
         if (RX_(clo_jcc) && CondAlways != u->cond) {
            UInt tCC = newTemp(cb);
            q_condcode = get_last_condcode_op(cb, q_condcode);

            INSTR("(CF) = %s(q%d)\n", VG_(name_UCondcode)(u->cond),
                    q_condcode);
            uInstr2 (cb, MOV, 4, Literal, 0, TempReg, tCC);
            uLiteral(cb, u->cond);
            uInstr2 (cb, CCALL, 0, TempReg, tCC, TempReg, q_condcode);
            uCCall  (cb, (Addr) & build_Cond, 2, 2, /*ret_val*/False);

         } else {
            switch (u->jmpkind) {
            case JmpBoring:    IGNORE("(ignore JMP)\n");     break;
            case JmpClientReq: IGNORE("(ignore JMP-cli)\n"); break;
            case JmpCall:      IGNORE("(ignore JMP-c)\n");   break;
            case JmpSyscall:   INSTR("SH(%%eax) = syscall(...)\n"); break;

            case JmpRet:       
               // A return found in a block belonging to main() marks the
               // end of main().
               if (VG_(get_fnname)(a, fnname_buf, FNNAME_LEN) &&
                   0 == VG_(strcmp)(fnname_buf, "main")) {
                  if (RX_(clo_instr) || RX_(clo_actions)) {
                     VG_(printf)("\nEND OF main()\n");
                  }
                  exited_main = True;
                  //VG_(init_tt_tc)();
               }

               if ( VG_(get_fnname)(a, fnname_buf, FNNAME_LEN) )
               {
                  // do nothing
               }
               break;

            default: VG_(skin_panic)("unknown jmp kind");
            }
         }

#ifdef RX_SLICING
         /* Get x86 instr size from final JMP. */
         rx_instr_addr_compiletime += VG_(get_last_instr)(cb_in)->extra4b;
#endif

         VG_(copy_UInstr)(cb, u);
         break;
      }

/*      0x4014C054:  repe stosl
        0: GETL      %ECX, t2
        1: JIFZL     t2, $0x4014C056
        2: DECL      t2
        3: PUTL      t2, %ECX
        4: GETL      %EAX, t0
        5: GETL      %EDI, t4
        6: STL       t0, (t4)
        7: CALLM_So
        8: MOVL      $0x0, t0
        9: PUSHL     t0
       10: CALLMo    $0x53  (-rD)
       11: POPL      t0
       12: CALLM_Eo
       13: SHLL      $0x2, t0
       14: ADDL      t0, t4
       15: PUTL      t4, %EDI
       16: JMPo      $0x4014C054 */
      /* Nb: the live code just copies the JIFZ.  The disabled code below
         handled the entire BB to its end. */
      case JIFZ: {
         IGNORE("(ignore JMP-c)\n");
         VG_(copy_UInstr)(cb, u);
         break;
      }
#if 0
         UInt j;
         UInstr *ppu, *pu;
         Bool ecx_put = False, esi_put = False, edi_put = False;

         // %ECX is changed after the REP instruction... this
         // updates it (but with a not-very-informative shadow).
         VG_(set_global_var)(cb, VG_(shadow_archreg_address)(R_ECX),
                             (UInt)RX_(specials)[SpRep]);

         // Look ahead and check that %ECX and %EDI are updated, and
         // whether %ESI is too.  Then update the relevant regs (but with
         // a not-very-informative shadow).
         for (j = i+1; j < VG_(get_num_instrs)(cb_in); j++) {
            u = VG_(get_instr)(cb_in, j);
            if (PUT == u->opcode) {
               switch (u->val2) {
               case R_ECX: ecx_put = True;   break;
               case R_ESI: esi_put = True;   break;
               case R_EDI: edi_put = True;   break;
               default: sk_assert(-1 == 1);
               }
            }
         }
         sk_assert(ecx_put);
         VG_(set_global_var)(cb, VG_(shadow_archreg_address)(R_ECX),
                             (UInt)RX_(specials)[SpRep]);
         sk_assert(edi_put);
         VG_(set_global_var)(cb, VG_(shadow_archreg_address)(R_EDI),
                             (UInt)RX_(specials)[SpRep]);
         if (esi_put) {
            VG_(set_global_var)(cb, VG_(shadow_archreg_address)(R_ESI),
                                (UInt)RX_(specials)[SpRep]);
         }

         // Scan forward to end of BB, handling any GETs, then the CALLM
         for (  ; i < VG_(get_num_instrs)(cb_in); i++) {
            u = VG_(get_instr)(cb_in, i);
            VG_(copy_UInstr)(cb, u);

            if (RX_(clo_instr) && u->opcode != NOP) 
               VG_(pp_UInstr) ( i, u );

            if (GET == u->opcode && R_EAX == u->val1) {
               // Occurs in "repe stosl"
               archreg__tempreg(cb, u->size, u->val1, u->val2);

            } else if (CALLM_S == u->opcode) {
               sk_assert(i >= 2);
               ppu = VG_(get_instr)(cb_in, i-2);
               pu  = VG_(get_instr)(cb_in, i-1);

               // Which REP instruction depends on what's before the CALLM
               switch (pu->opcode) {
               case STORE:
                  if (LOAD == ppu->opcode) {
                     // rep mov
                     mem__tempreg(cb, ppu->size, ppu->val1, ppu->val2,False);
                     tempreg__mem(cb,  pu->size,  pu->val1,  pu->val2);
                  } else {
                     // repe stos
                     tempreg__mem(cb,  pu->size,  pu->val1,  pu->val2);
                  }
                  break;

               case SUB:
                  // repne scasb -- do nothing
                  sk_assert(LOAD == ppu->opcode);
                  sk_assert(1 == ppu->size);
                  sk_assert(1 == pu->size);
                  break;

               default: VG_(skin_panic)("unhandled REP case");
               }
            }
         } 
         break;
      }
#endif

      // Do nothing;  not shadowing segment regs
      case PUTSEG:

      // Should be setting the def'd regs to something
      case GETSEG:
      case USESEG:
         VG_(copy_UInstr)(cb, u);
         break;

      default:
         VG_(pp_UInstr)(0, u);
         VG_(skin_panic)("Redux: unhandled instruction");
         break;
      }
      /*-- End main switch ----------------------------------------------*/

      // If the op updates the condcodes, remember this, for any subsequent
      // CC2VALs or JMPs

      // XXX: should really do all the condcodes individually...
      // (nb: but can ignore DF, it is never used for conditional tests.)
      //
      // I can sort of get away with it because CC2VAL is pretty rare,
      // I'm ignoring jumps, and the tests are usually SUB or AND anyway...

      // XXX: currently, sets q_condcode if the instruction writes any flags
      if (0 != (u->flags_w & FlagsOSZACP)) { // if it sets any flags
         if (u->tag2 == TempReg) {
            // binary ops: 
            //   (-wOSZACP): AND, OR, ADD, XOR, SUB, ADC, SBB, SHL, SHR, SAR
            //   (-wOC):     ROL, ROR, RCL, RCR
            //   (-wZCP):    FPU
            q_condcode = SHADOW(u->val2);

         } else if (u->tag1 == TempReg) {
            // unary ops:
            //   (-wALL):    PUTF
            //   (-wOSZAP):  INC, DEC
            //   (-wOSZACP): NEG
            q_condcode = SHADOW(u->val1);

         } else {
            // XXX: not recording anything... bad.  Difficulty is that at
            // this point we don't know where the CALLM args will be POPped
            // to...
            sk_assert(CALLM == u->opcode);
         }
      }
   }
   /*-- End main loop ---------------------------------------------------*/

   if (RX_(clo_instr)) VG_(printf)("q_condcode: q%u\n\n", q_condcode-1);

   bb++;

#ifdef RX_SLICING
   rx_instr_addr_compiletime = UNSET_INSTR_ADDR;
#endif

   VG_(free_UCodeBlock)(cb_in);
   return cb;
}

/*--------------------------------------------------------------------*/
/*--- Init/fini                                                    ---*/
/*--------------------------------------------------------------------*/

void SK_(pre_clo_init)() 
{
   // 0-terminated arrays
   Addr compact_helpers[] = {
      (Addr) & get_mem_shadow4, (Addr) & set_mem_shadow4,
      (Addr) & build_code_const4, 
      (Addr) & build_Add,  (Addr) & build_Sub,
      (Addr) & build_Inc,  (Addr) & build_Dec,
      (Addr) & build_Lea1,
      0
   };
   Addr noncompact_helpers[] = {
      (Addr) & build_code_const2,   (Addr) & build_code_const1, 
      (Addr) & get_mem_shadow1,     (Addr) & get_mem_shadow2,
      (Addr) & set_mem_shadow1,     (Addr) & set_mem_shadow2,
      (Addr) & get_archreg_shadow1, (Addr) & get_archreg_shadow2,
      (Addr) & get_archreg_shadow4,
      (Addr) & set_archreg_shadow1, (Addr) & set_archreg_shadow2,
      (Addr) & set_archreg_shadow4,
      (Addr) & build_Lea2,
      (Addr) & build_cc2val,        (Addr) & build_Cond,
      (Addr) & build_Mul,
      (Addr) & build_Mul64High,     (Addr) & build_Mul64,
      (Addr) & build_Div64,         (Addr) & build_Mod64,
      (Addr) & build_And,           (Addr) & build_Or ,     (Addr) & build_Xor, 
      (Addr) & build_Shl,           (Addr) & build_Shr,     (Addr) & build_Sar, 
      (Addr) & build_Rol,           (Addr) & build_Ror,
      (Addr) & build_Not,           (Addr) & build_Neg,
      (Addr) & build_Shrd,          (Addr) & build_Shld,
      (Addr) & build_Widen,
      (Addr) & build_Special,
      (Addr) & print_BB_entry,
      (Addr) & check_archregs,      (Addr) & check_mem,
      0
   };
   Int i;

#ifdef RX_SLICING
   sk_assert(12 == sizeof(ShW));
#else
   sk_assert( 8 == sizeof(ShW));
#endif 

   VG_(details_name)            ("Redux");
   VG_(details_version)         ("0.0.1");
   VG_(details_description)     ("a dynamic data tracer");
   VG_(details_copyright_author)("Copyright (C) 2003, Nicholas Nethercote");
   VG_(details_bug_reports_to)  ("njn25@cam.ac.uk");

   // causes assertion error in VG_(init_tt_tc)() -- code expansion too big
   //VG_(details_avg_translation_sizeB) ( 380 );  // code expansion about 22:1 !

   /* Needs */
   VG_(needs_shadow_regs)         ();
   VG_(needs_syscall_wrapper)     ();
   VG_(needs_command_line_options)();
   VG_(needs_sanity_checks)       ();
   VG_(needs_client_requests)     ();
   VG_(needs_shadow_memory)       ();

   /* Events to track */
   VG_(init_new_mem_startup)( rx_new_mem_startup );
   VG_(init_new_mem_mmap)   ( rx_new_mem_mmap    );
   VG_(init_new_mem_brk)    ( rx_new_mem_brk     );
   VG_(init_new_mem_stack_signal)  ( NULL );

   VG_(init_copy_mem_remap)        ( rx_copy_mem_remap      );
   VG_(init_change_mem_mprotect)   ( rx_change_mem_mprotect );

   VG_(init_die_mem_stack)         ( rx_die_mem           );
   VG_(init_die_mem_munmap)        ( rx_die_mem           );
   VG_(init_die_mem_brk)           ( rx_die_mem           );
   VG_(init_die_mem_stack_signal)  ( rx_die_mem           );

   VG_(init_pre_mem_read)        ( rx_pre_mem_read        );
   VG_(init_pre_mem_read_asciiz) ( rx_pre_mem_read_asciiz );
   VG_(init_pre_mem_write)       ( rx_pre_mem_write       );
   VG_(init_post_mem_write)      ( rx_post_mem_write      );

   VG_(init_post_regs_write_init)             ( rx_post_regs_write_init );
   VG_(init_post_reg_write_syscall_return)    ( rx_post_reg_write_syscall );
   VG_(init_post_reg_write_deliver_signal)    ( rx_post_reg_write_signal );
   VG_(init_post_reg_write_pthread_return)    ( rx_post_reg_write_pthread );
   VG_(init_post_reg_write_clientreq_return)  ( rx_post_reg_write_clientreq );
   VG_(init_post_reg_write_clientcall_return) ( rx_post_reg_write_clientcall );

   VG_(init_pre_deliver_signal)  ( rx_pre_deliver_signal );
   VG_(init_post_deliver_signal) ( rx_post_deliver_signal );

   /* Helpers */
   for (i = 0; compact_helpers[i] != 0; i++)
      VG_(register_compact_helper)( compact_helpers[i] );

   for (i = 0; noncompact_helpers[i] != 0; i++)
      VG_(register_noncompact_helper)( noncompact_helpers[i] );

}

/* Second-stage skin initialisation: sets up the special nodes, shadow
   memory and the syscall info table, and creates the hash table that tracks
   client heap blocks.  The order of the first two calls matters (see the
   note on init_specials() below). */
void SK_(post_clo_init)(void)
{
   /* Other */
   init_specials();   // Must come before init_shadow_memory()
   init_shadow_memory();
   init_syscall_info();
   // Backing table for add_Rx_Chunk() / handle_free() / SK_(realloc)().
   rx_malloc_list = VG_(HT_construct)();
}

/* Skin finalisation.  Fetches the shadow of the exit status, checks (unless
   its kind is an ignored one) that the value it records equals `exitcode`,
   prints the node statistics, records a final Exit node as a syscall, and
   then emits the result: a dump of the exit shadow under RX_SLICING,
   otherwise the graph. */
void SK_(fini)(Int exitcode)
{
   ShW* exit_shadow;

   exit_shadow = (ShW*)VG_(get_exit_status_shadow)();
   
   if (! ignored_kind(exit_shadow->kind)) {
      sk_assert(exit_shadow->val == exitcode);
   }

   VG_(printf)("nodes built:      %u\n",       n_nodes_built);
   VG_(printf)("total size:       %u bytes\n", n_bytes_built);

   // Build the exit node.  As everywhere else, the real value is passed
   // through the global 'rx_val'.
   rx_val = exit_shadow->val;
   record_syscall( build_Exit( exit_shadow ) );

#ifdef RX_SLICING
   RX_(dmp_ShW)(exit_shadow);
#else
   RX_(gr_graph)(conds_n, conds_made);
#endif
}

/* NOTE(review): presumably records the core/skin interface version this
   skin was built against and names SK_(pre_clo_init) as its entry point --
   confirm against the macro's definition in the Valgrind skin header. */
VG_DETERMINE_INTERFACE_VERSION(SK_(pre_clo_init), 1.0)

/*--------------------------------------------------------------------*/
/*--- end                                                rx_main.c ---*/
/*--------------------------------------------------------------------*/

