From owner-FreeBSD-users-jp@jp.freebsd.org  Sun Mar 10 09:51:59 1996
Received: by mail.jp.freebsd.org (8.7.3+2.6Wbeta5/8.7.3) id JAA20064
	Sun, 10 Mar 1996 09:51:59 +0900 (JST)
Received: by mail.jp.freebsd.org (8.7.3+2.6Wbeta5/8.7.3) with ESMTP id JAA20059
	for <FreeBSD-users-jp@jp.freebsd.org>; Sun, 10 Mar 1996 09:51:56 +0900 (JST)
From: Hiroshi Murakami <hiroshi@necom830.hpcl.titech.ac.jp>
Message-Id: <199603100043.JAA03425@necom830.hpcl.titech.ac.jp>
Received: by necom830.hpcl.titech.ac.jp (8.6.11/TM2.1)
	id JAA03425; Sun, 10 Mar 1996 09:43:33 +0900
To: FreeBSD-users-jp@jp.freebsd.org
Date: Sun, 10 Mar 96 9:43:32 JST
In-Reply-To: <9603051253.AA08927@n128>; from "Takashi Saeki" at Mar 5, 96 9:52 pm
X-Mailer: ELM [version 2.3 PL11]
Reply-To: FreeBSD-users-jp@jp.freebsd.org
X-Distribute: distribute [version 2.1 (Alpha) patchlevel=19]
X-Sequence: FreeBSD-users-jp 665
Subject: [FreeBSD-users-jp 665] an benchmark by blocked version matrix multiplication.
Errors-To: owner-FreeBSD-users-jp@jp.freebsd.org
Sender: owner-FreeBSD-users-jp@jp.freebsd.org


$@$3$l$O(B, 
	$@@5J}9TNs(B A $@$H(B B $@$N@Q(B A*B $@$r(B C $@$K:n$k(B
$@$H$$$&7W;;$r(B,$@%V%m%C%/2=>h;;5;K!$r;H$C$F9T$&%Y%s%A%^!<%/$G$9(B.

ブロック化行列のサイズは偶数ならなんでも良いのですが,
それを一応20とし, 全体の行列のサイズを 20の倍数で 
20,40,...,500 と変えて計算時間を計測しています.

gcc $@$K$h$k%3%s%Q%$%k>r7o$O(B
	% gcc-i2.6.3 -mpentium -O3 -funroll-loops a.c
$@$G$9(B. -mpentium $@$,L5$$%P!<%8%g%s$N%3%s%Q%$%i$G$O(B -mpentium $@$r$D$1$J$$$+(B,
-m486 $@$H$7$F$/$@$5$$(B.

gcc $@%3%s%Q%$%k$N%*%W%7%g%s$H$7$F(B -funroll-loops $@$r(B -O3 $@$HAH$_9g$o$;$F(B
$@;H$&$3$H$,9b$$@-G=CM$r=P$90Y$K4N?4$G$9(B.

$@7WB,Nc$G$O(B, $@9TNs$N%5%$%:(B,$@%V%m%C%/%5%$%:(B,$@7WB,$N@:EY$rJ]$D0Y$N%k!<%W2s?t(B,
$@>h;;$K$+$+$C$?<B:]$N(Bcpu$@;~4V(B, A*B$@$N7W;;$,(B 2N^3 flops $@$G$"$k$HDj5A$7$F(B
$@7WB,;~4VCf$K9T$C$?IbF0>.?tE@1i;;2s?t(B, MFLOPS $@CM$rI=<($7$F$$$^$9(B.
$@%V%m%C%/2=$K$h$j(B,$@9TNs$N%5%$%:$,Bg$-$/$J$C$F$b@-G=$,Hf3SE*Nt2=$7$F$$$^$;$s(B.
-funroll-loops $@$KMj$i$:$K(B,$@%V%m%C%/%5%$%:$r(B20$@$J$i(B20$@$H7h$a$F$7$^$C$F(B
$@%=!<%9%3!<%I$rL@<(E*$K%"%s%m!<%k$9$k$H(B,$@@-G=$,$^$@8~>e$G$-$^$9$,(B,
$@$=$l$O>JN,$7$^$9(B.
$@%=!<%9%3!<%ICf$G(B $@%^%/%m%i%Y%k(B INNER_PRODUCT $@$rDj5A$9$k$H(B,
A*B $@$N$+$o$j$K(B A*B' $@$r7W;;$7$^$9(B. (B' $@$O(B B$@$NE>CV(B).
$@$=$N$[$&$,%G!<%?$N%"%/%;%9%Q%?!<%s$H(B,B $@$N%V%m%C%/>.9TNs(B
$@$NE>CV$r9T$o$:$K$9$`$N$G(B MFLOPS $@CM$O>e$,$j$^$9$,(B,$@$3$N%Y%s%A%^!<%/(B
$@$NL\E*$H$O$:$l$^$9(B.

$@<j85$K$O(B, Pentium 100MHz $@$H(B $@K?=j$K(B Pentium Pro-150MHz $@$,$"$k$@$1$J$N$G(B,
$@B>$N(BCPU$@$d%/%m%C%/$N0[$J$k(BPentium$@Ey$G$N@-G=$KB?>/6=L#$,$"$j$^$9(B.
$@<~JU$N%A%C%W%;%C%H$J$I$K$h$j(B,$@F1$8(BCPU$@$r;H$C$F$$$F$b@-G=$K0c$$$,=P$k2DG=@-$O(B
$@$"$j$^$9$,(B,$@K\%Y%s%A%^!<%/$O30It%a%b%j!<$X$N%G!<%?%"%/%;%9$rDc8:$5$;$k(B
$@;;K!$rMQ$$$F$$$^$9$N$G(B,$@Hf3SE*(BCPU$@$N@-G=$,F1$8$J$i;w$?7k2L$,$G$k$G$7$g$&(B.

====================== P5-100 $@$K$h$k7WB,Nc(B ===============================

p5-100% ./a.out
N=20,  BLK=20, loop=100000, cpu=44.86, flop=1.6e+09, Mflops=35.7
N=40,  BLK=20, loop=12500,  cpu=39.87, flop=1.6e+09, Mflops=40.1
N=60,  BLK=20, loop=3700,   cpu=38.37, flop=1.6e+09, Mflops=41.7
N=80,  BLK=20, loop=1560,   cpu=38.33, flop=1.6e+09, Mflops=41.7
N=100, BLK=20, loop=800,    cpu=37.47, flop=1.6e+09, Mflops=42.7
N=120, BLK=20, loop=460,    cpu=37.25, flop=1.6e+09, Mflops=42.7
N=140, BLK=20, loop=288,    cpu=37.23, flop=1.6e+09, Mflops=42.5
N=160, BLK=20, loop=192,    cpu=37.76, flop=1.6e+09, Mflops=41.7
N=180, BLK=20, loop=136,    cpu=37.96, flop=1.6e+09, Mflops=41.8
N=200, BLK=20, loop=100,    cpu=38.88, flop=1.6e+09, Mflops=41.2
N=220, BLK=20, loop=72,     cpu=37.63, flop=1.5e+09, Mflops=40.7
N=240, BLK=20, loop=56,     cpu=38.09, flop=1.5e+09, Mflops=40.7
N=260, BLK=20, loop=44,     cpu=38.29, flop=1.5e+09, Mflops=40.4
N=280, BLK=20, loop=36,     cpu=39.20, flop=1.6e+09, Mflops=40.3
N=300, BLK=20, loop=28,     cpu=37.60, flop=1.5e+09, Mflops=40.2
N=320, BLK=20, loop=24,     cpu=39.46, flop=1.6e+09, Mflops=39.9
N=340, BLK=20, loop=20,     cpu=39.07, flop=1.6e+09, Mflops=40.2
N=360, BLK=20, loop=16,     cpu=37.41, flop=1.5e+09, Mflops=39.9
N=380, BLK=20, loop=12,     cpu=33.08, flop=1.3e+09, Mflops=39.8
N=400, BLK=20, loop=12,     cpu=38.64, flop=1.5e+09, Mflops=39.8
N=420, BLK=20, loop=8,      cpu=29.91, flop=1.2e+09, Mflops=39.6
N=440, BLK=20, loop=8,      cpu=34.30, flop=1.4e+09, Mflops=39.7
N=460, BLK=20, loop=8,      cpu=39.33, flop=1.6e+09, Mflops=39.6
N=480, BLK=20, loop=4,      cpu=22.38, flop=8.8e+08, Mflops=39.5
N=500, BLK=20, loop=4,      cpu=25.75, flop=1.0e+09, Mflops=38.8

========================= P6-150 $@$K$h$k7WB,Nc(B ============================

p6-150% ./a.out
N=20,  BLK=20, loop=100000, cpu=22.63, flop=1.6e+09, Mflops=70.7
N=40,  BLK=20, loop=12500,  cpu=20.62, flop=1.6e+09, Mflops=77.6
N=60,  BLK=20, loop=3700,   cpu=19.93, flop=1.6e+09, Mflops=80.2
N=80,  BLK=20, loop=1560,   cpu=19.73, flop=1.6e+09, Mflops=81.0
N=100, BLK=20, loop=800,    cpu=19.76, flop=1.6e+09, Mflops=81.0
N=120, BLK=20, loop=460,    cpu=20.30, flop=1.6e+09, Mflops=78.3
N=140, BLK=20, loop=288,    cpu=20.51, flop=1.6e+09, Mflops=77.1
N=160, BLK=20, loop=192,    cpu=21.26, flop=1.6e+09, Mflops=74.0
N=180, BLK=20, loop=136,    cpu=21.90, flop=1.6e+09, Mflops=72.4
N=200, BLK=20, loop=100,    cpu=22.48, flop=1.6e+09, Mflops=71.2
N=220, BLK=20, loop=72,     cpu=21.68, flop=1.5e+09, Mflops=70.7
N=240, BLK=20, loop=56,     cpu=21.99, flop=1.5e+09, Mflops=70.4
N=260, BLK=20, loop=44,     cpu=21.81, flop=1.5e+09, Mflops=70.9
N=280, BLK=20, loop=36,     cpu=22.04, flop=1.6e+09, Mflops=71.7
N=300, BLK=20, loop=28,     cpu=21.49, flop=1.5e+09, Mflops=70.4
N=320, BLK=20, loop=24,     cpu=23.61, flop=1.6e+09, Mflops=66.6
N=340, BLK=20, loop=20,     cpu=22.32, flop=1.6e+09, Mflops=70.4
N=360, BLK=20, loop=16,     cpu=21.39, flop=1.5e+09, Mflops=69.8
N=380, BLK=20, loop=12,     cpu=18.75, flop=1.3e+09, Mflops=70.2
N=400, BLK=20, loop=12,     cpu=22.04, flop=1.5e+09, Mflops=69.7
N=420, BLK=20, loop=8,      cpu=16.98, flop=1.2e+09, Mflops=69.8
N=440, BLK=20, loop=8,      cpu=19.66, flop=1.4e+09, Mflops=69.3
N=460, BLK=20, loop=8,      cpu=22.70, flop=1.6e+09, Mflops=68.6
N=480, BLK=20, loop=4,      cpu=13.20, flop=8.8e+08, Mflops=67.0
N=500, BLK=20, loop=4,      cpu=14.55, flop=1.0e+09, Mflops=68.7

====================== $@<B:]$N%=!<%9%3!<%I(B ==========================

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

const char* Version="$Date: 1996/01/02 15:52:35 $";

/* Edge length of the square blocks. It must be an even number because
   muladd_bt consumes the k index two elements at a time. */
#define BLK 20

/* Floating-point element type used for every matrix. */
#define REAL double
/* Single-precision alternative (the original commented-out line spelled
   "double" here, presumably by mistake): #define REAL float */
/* Optional switch: "#define INNER_PRODUCT" makes matmul_blk pass blocks of b
   straight to muladd_bt, intended to calculate C:=A*B' rather than A*B. */

/* Static-array equivalent of the buffers that main() carves out of one
   heap allocation:
     REAL bbt[BLK*BLK];
     REAL a[N*Ma],b[N*Mb],c[N*Mc];
*/

REAL *bbt;      /* BLK*BLK scratch block filled by matmul_blk */
REAL *a;        /* operand matrix A */
REAL *b;        /* operand matrix B */
REAL *c;        /* result matrix C */
int LOOP;       /* repetition count computed in main() for the current N */
int Ma;         /* row stride of a, in elements */
int Mb;         /* row stride of b, in elements */
int Mc;         /* row stride of c, in elements */
int N;          /* current matrix dimension */
/*------------------------------------------------------------------*/

/*
 * Benchmark driver.
 *
 * For N = BLK, 2*BLK, ..., 500 it allocates one buffer that holds bbt, a, b
 * and c back to back, fills a with 1.0 and b with 2.0, calls matmul_blk()
 * `loop` times and prints one line per N: N, BLK, the repetition count, the
 * elapsed CPU seconds reported by second(), the operation count (taken as
 * 2*N^3 per multiplication) and the resulting Mflops.
 *
 * Returns 0 on completion; exits with status 1 if the buffer cannot be
 * allocated.
 */
int main(void)
{
        /* The helpers are defined further down in this file; declaring them
           here lets every call be checked against a prototype. */
        float second(void);
        int print4addr(int check, void *p0, void *p1, void *p2, void *p3);
        int matmul_blk(int n, REAL *A, int ma, REAL *B, int mb, REAL *C, int mc);
        int i,j,r;
        float t1,t2,flop,cpu;
        int loop;
        REAL *base;

        printf("?-bench: Version: %s\n", Version);
        for(N=BLK;N<=500;N+=BLK) {
                Ma=N;
                Mb=N;
                Mc=N;
                base=malloc(sizeof(REAL)*(size_t)(BLK*BLK+N*Ma+N*Mb+N*Mc));
                if(base==NULL) {
                        fprintf(stderr,"malloc failed for N=%d\n",N);
                        exit(1);
                }
                /* NOTE(review): the reason for this pause is not stated in the
                   original; presumably it separates the measurements. */
                sleep(1);

                /* Layout inside the single allocation: bbt | a | b | c. */
                bbt=base;
                a=bbt+BLK*BLK;
                b=a+N*Ma;
                c=b+N*Mb;

                /* The original first assigned 200000000/(N*N*N) (labelled
                   "for Pentium 100MHz") and immediately overwrote it with
                   this value (labelled "for Pentium Pro 150MHz"); only the
                   effective assignment is kept. */
                LOOP=200000000/(N*N*N)*4;
                loop=(LOOP==0)?1:LOOP;  /* always run at least once */

                print4addr((sizeof(REAL)==sizeof(double)),a,b,c,bbt);

                for(i=0;i<N;i++) {
                        for(j=0;j<N;j++) a[i*Ma+j]=1.0;
                }
                for(i=0;i<N;i++) {
                        for(j=0;j<N;j++) b[i*Mb+j]=2.0;
                }

                t1=second();
                for(r=0;r<loop;r++) {
                        matmul_blk(N, a,Ma, b,Mb, c,Mc);
                }
                t2=second();
                cpu=t2-t1;
                flop=2.0*N*N*N*loop;

                printf(  "N=%d",N);
                printf(", BLK=%d",BLK);
                printf(", loop=%d",loop);
                printf(", cpu=%.2f",cpu);
                printf(", flop=%.1e",flop);
                /* Report 0.0 rather than dividing by zero when the timer did
                   not advance during the measurement. */
                printf(", Mflops=%.1f",(cpu>0.0)?(1.0e-6)*flop/cpu:0.0);
                printf("\n");
                free(base);
        }
        return 0;
}


/*------------------------------------------------------------------*/

/*
 * Blocked matrix product C := A*B on row-major matrices.
 *
 *   n      logical dimension of the matrices
 *   a, ma  matrix A and its adjustable dimension (elements per row)
 *   b, mb  matrix B and its adjustable dimension
 *   c, mc  matrix C and its adjustable dimension; overwritten
 *
 * Uses the global scratch buffer bbt (BLK*BLK elements) to hold the
 * transpose of the current block of B, so that muladd_bt() can read both
 * of its operands with unit stride.
 *
 * Only whole BLK-sized blocks are visited (K and J stop at n-BLK), so n is
 * expected to be a multiple of BLK; contributions from any trailing partial
 * block are not accumulated.
 *
 * Always returns 0.
 */
int matmul_blk(int n, REAL *a, int ma, REAL *b, int mb, REAL *c, int mc)
{
        /* Defined further down in this file. */
        int muladd_bt(REAL *ablk, REAL *bblk, REAL *cblk, int ma, int mc, int n);
        int K,J;
        int i,j;
#ifndef INNER_PRODUCT
        int k;
#endif

/*
        for J do
           for I do
              C(I,J):=0.0;
        for K do
        for J do {
           BKJ:=B(K,J)
           for I do
              C(I,J):=C(I,J)+A(I,K)*BKJ;
        }
*/
        /* Clear the n-by-n result before accumulating into it. */
        for(i=0;i<n;i++) {
           for(j=0;j<n;j++)
              c[i*mc+j]=0.0;
        }
        for(K=0;K<n-BLK+1;K+=BLK) {
           for(J=0;J<n-BLK+1;J+=BLK) {
#ifndef INNER_PRODUCT
              /* bbt(j-J, k-K) := B(k, j): transpose block (K,J) of B. */
              for(k=K;k<K+BLK;k++) {
                 for(j=J;j<J+BLK;j++)
                    bbt[(j-J)*BLK+(k-K)]=b[k*mb+j];
              }
              muladd_bt(a+K,bbt,c+J,ma,mc,n);
#else
              /* NOTE(review): muladd_bt steps through its second argument
                 with row stride BLK while b has row stride mb, so this call
                 looks like it matches A*B' only when mb == BLK -- confirm
                 before relying on the result. */
              muladd_bt(a+K,b+K*mb+J,c+J,ma,mc,n);
#endif
           }
        }
        return 0;
}


/*------------------------------------------------------------------*/

/*
 * Add the matrix inner product of A and B to C:
 *
 *     C(i,j) += sum{k} A(i,k) * B(j,k)     i in [0,n), j and k in [0,BLK)
 *
 *   a, ma  n-by-BLK panel of A; consecutive rows are ma elements apart
 *   b      BLK-by-BLK block; consecutive rows are BLK elements apart
 *   c, mc  n-by-BLK panel of C; consecutive rows are mc elements apart
 *   n      number of rows of a and c to process
 *
 * The k loop is unrolled by two with the products r0/r1 formed one step
 * before they are added to s. The additions still happen in increasing k,
 * starting from the old C(i,j). This requires BLK to be even and >= 2.
 *
 * Always returns 0.
 */
#if defined(BLK) && ((BLK) < 2 || (BLK) % 2 != 0)
#error "muladd_bt requires BLK to be an even number >= 2"
#endif

int muladd_bt(REAL *a, REAL *b, REAL *c, int ma, int mc, int n)
{
        REAL r0,r1,s;
        REAL *ai,*bj,*ci;
        int i,j,k;

        for(i=0;i<n;i++) {
           ai=a+i*ma;
           ci=c+i*mc;
           for(j=0,bj=b;j<BLK;j++,bj+=BLK) {
              /* Prologue: start products 0 and 1, seed s with C(i,j). */
              r0=ai[0]*bj[0]; s=ci[j];
              r1=ai[1]*bj[1]; s+=r0;
              for(k=2;k<BLK-1;k+=2) {
                 r0=ai[k  ]*bj[k  ]; s+=r1;
                 r1=ai[k+1]*bj[k+1]; s+=r0;
              }
              /* Epilogue: the last product is still pending in r1. */
              ci[j]=r1+s;
           }
        }
        return 0;
}

/*------------------------------------------------------------------*/
/*
 * Return the processor time consumed by the program so far, in seconds,
 * as reported by clock().
 *
 * The division is carried out in double so the value is rounded to float
 * only once. <time.h> is included at the top of the file rather than
 * inside the function body.
 */
float second(void) {
        return (float)((double)clock()/(double)CLOCKS_PER_SEC);
}
/*
 * Print the addresses of the four buffers in octal and, when `check` is
 * non-zero, warn about every one that is not aligned to 8 bytes.
 *
 *   check       non-zero to enable the alignment warnings
 *   a, b, c, d  buffers to report; d is labelled "bbt" in the output
 *
 * Addresses are converted through unsigned long, which is wide enough for a
 * pointer on ILP32 and LP64 targets; the original casts to unsigned/int
 * dropped the upper half of the address on LP64.
 *
 * Always returns 0.
 */
int print4addr(int check, void *a, void *b, void *c, void *d) {
        printf("Octal-address: a:%8lo, b:%8lo, c:%8lo, bbt:%8lo\n",
                (unsigned long)a, (unsigned long)b,
                (unsigned long)c, (unsigned long)d );
        if(check) {
                if((unsigned long)a%8!=0) printf("Warning! a not aligned.\n");
                if((unsigned long)b%8!=0) printf("Warning! b not aligned.\n");
                if((unsigned long)c%8!=0) printf("Warning! c not aligned.\n");
                if((unsigned long)d%8!=0) printf("Warning! bbt not aligned.\n");
        }
        return 0;
}

====================== $@%3%s%Q%$%k>r7o(B =================================

% gcc-i2.6.3 -O3 -mpentium -funroll-loops a.c

