//
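// Overview file
//
// This file documents the data-packing scheme used in the
// 3D-Now AAN fDCT. Each group of scalar C statements below is
// followed by comments showing the corresponding packed operations.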

// sword = short int (16-bit)
// dword = long int (32-bit)
// float = float (32-bit)
// [x,y] represents a 64-bit vector, consisting of 2 floats.
// [x,y] = [ (x is the upper 32 bits), (y is the lower 32 bits) ]
// When [x,y] is stored to linear memory address $0000:
//      memory[ $0000 ] <= [y], memory[ $0004 ] <= [x]

  inptr = block;
  dptr = data;
  for (i = 0; i < 8; i++)
  {
    tmp0 = inptr[0]                                                      +inptr[7];
    tmp1 =          inptr[1]                                    +inptr[6];

    tmp7 = inptr[0]                                                      -inptr[7];
    tmp6 =          inptr[1]                                    -inptr[6];

    tmp3 =                            inptr[3]+inptr[4];
    tmp2 =                   inptr[2]                  +inptr[5];

    tmp4 =                            inptr[3]-inptr[4];
    tmp5 =                   inptr[2]                  -inptr[5];

//  1a) [t7,t6] = [inptr7, inptr6]; // dword, dword
//  1b) [t4,t5] = [inptr5, inptr4]; //dword, dword *
//  1c) [t0,t1] = [inptr0, inptr1]; // dword,dword
//  1d) [t3,t2] = [inptr2, inptr3]; //dword, dword *


//  st1_0 <= [tmp0,tmp1]
//  st1_1 <= [tmp7,tmp6]
//  st1_2 <= [tmp3,tmp2]; // (reversed from direct output [tmp2,tmp3])
//  st1_3 <= [tmp4,tmp5]; // (reversed from direct output [tmp5,tmp4])

    /* Even part */

    tmp10 = tmp0 + tmp3;	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

//  2a) [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
//  2b) [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]

//  st1_20 <= [tmp10,tmp11]
//  st1_21 <= [tmp10,-tmp11]   // negated tmp11
//  st1_22 <= [tmp13,tmp12] 
//  st1_23 <= [tmp13,tmp13] // tmp13 duplicated

    dptr[0] = tmp10 + tmp11; /* phase 3 */
    dptr[4] = tmp10 - tmp11;

//  3a) [dataptr0,dataptr4] <= [tmp10+tmp11, tmp10-tmp11]; // pfacc

    z1 = (tmp12 + tmp13) * ((float ) 0.707106781); /* c4 */

//  4a) [z1a,z1a] <= [tmp12+tmp13,tmp12+tmp13];  // pfacc
//  4b) [z1,-z1] <= [ z1a,z1a ] * [ 0.7071,-0.7071 ];
//  st1_24 <= [z1, -z1]
//


    dptr[2] = tmp13 + z1;	/* phase 5 */
    dptr[6] = tmp13 - z1;

//  5a) [dataptr2,dataptr6] <= [tmp13,tmp13] + [z1,-z1];

    /*  Odd part */

    tmp14 = tmp4 + tmp5;	/* phase 2 */
    tmp15 = tmp5 + tmp6;
    tmp16 = tmp6 + tmp7;
//  6a) [tmp14,tmp16] <= [ (tmp4+tmp5), (tmp7+tmp6) ]; //pfacc
//  6b) [      tmp15] <= [tmp7,tmp6] + [tmp4,tmp5];  //pfacc

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
    z5 = (tmp14 - tmp16) * ((float ) 0.382683433); /* c6 */
    z2 = ((float ) 0.541196100) * tmp14 + z5; /* c2-c6 */
    z4 = ((float ) 1.306562965) * tmp16 + z5; /* c2+c6 */
    z3 = tmp15 * ((float ) 0.707106781); /* c4 */

//  form [tmp14-tmp16, tmp15]
//  7a) [z2a,z4a] <= [tmp14,tmp16] * [0.5411,1.3066]
//  7b) [z5 ,z3 ] <= [tmp14-tmp16,tmp15] * [0.3827,0.7071]

//  7c) [z2,z4]  <= [z2a,z4a] + [z5,z5];

    z11 = tmp7 + z3;		/* phase 5 */
    z13 = tmp7 - z3;

//  7d) [z13,z11] <= [tmp7,tmp7] - [z3,-z3]

    dptr[5] = z13 + z2;	/* phase 6 */
    dptr[3] = z13 - z2;
    dptr[1] = z11 + z4;
    dptr[7] = z11 - z4;

//  8a) [dataptr5,dataptr1] <= [z13,z11] + [z2,z4]
//  8b) [dataptr3,dataptr7] <= [z13,z11] - [z2,z4]


    dptr += 8;		/* advance pointer to next row */
    inptr += 8;
  }

  /* transpose the intermediate result */
  transpose( data );

  /* Pass 2: process columns. */

  /* The column-processing loop uses the *same* 3D-Now data-packing
     scheme. Note: because the intermediate result is transposed after
     the 1st pass, the 2nd pass processes the data row-by-row instead
     of column-by-column. */

  dptr = data;
  for (i = 0; i < 8; i++)
  {
    // col_dct uses same 3D-now code as row_dct...
    tmp0 = dptr[0] + dptr[7];
    tmp7 = dptr[0] - dptr[7];
    tmp1 = dptr[1] + dptr[6];
    tmp6 = dptr[1] - dptr[6];
    tmp2 = dptr[2] + dptr[5];
    tmp5 = dptr[2] - dptr[5];
    tmp3 = dptr[3] + dptr[4];
    tmp4 = dptr[3] - dptr[4];

    /* Even part */

    tmp10 = tmp0 + tmp3;	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    dptr[0] = tmp10 + tmp11; /* phase 3 */
    dptr[4] = tmp10 - tmp11;

    z1 = (tmp12 + tmp13) * ((float ) 0.707106781); /* c4 */
    dptr[2] = tmp13 + z1; /* phase 5 */
    dptr[6] = tmp13 - z1;

    /* Odd part */

    tmp10 = tmp4 + tmp5;	/* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
    z5 = (tmp10 - tmp12) * ((float ) 0.382683433); /* c6 */
    z2 = ((float) 0.541196100) * tmp10 + z5; /* c2-c6 */
    z4 = ((float) 1.306562965) * tmp12 + z5; /* c2+c6 */
    z3 = tmp11 * ((float) 0.707106781); /* c4 */

    z11 = tmp7 + z3;		/* phase 5 */
    z13 = tmp7 - z3;

    dptr[5] = z13 + z2; /* phase 6 */
    dptr[3] = z13 - z2;
    dptr[1] = z11 + z4;
    dptr[7] = z11 - z4;

    dptr+=8;			/* advance pointer to next row */
  }
  /* descale */
  for (i = 0; i < 64; i++)
    block[i] = (short int) floor(data[i] * aanscales[i] + 0.499999);

  /* final transpose */
  transpose( block );
}