From owner-FreeBSD-users-jp@jp.freebsd.org  Sun Mar 10 10:14:09 1996
Received: by mail.jp.freebsd.org (8.7.3+2.6Wbeta5/8.7.3) id KAA20138
	Sun, 10 Mar 1996 10:14:09 +0900 (JST)
Received: by mail.jp.freebsd.org (8.7.3+2.6Wbeta5/8.7.3) with ESMTP id KAA20133
	for <FreeBSD-users-jp@jp.freebsd.org>; Sun, 10 Mar 1996 10:14:07 +0900 (JST)
From: Hiroshi Murakami <hiroshi@necom830.hpcl.titech.ac.jp>
Message-Id: <199603100105.KAA03476@necom830.hpcl.titech.ac.jp>
Received: by necom830.hpcl.titech.ac.jp (8.6.11/TM2.1)
	id KAA03476; Sun, 10 Mar 1996 10:05:44 +0900
To: FreeBSD-users-jp@jp.freebsd.org
Date: Sun, 10 Mar 96 10:05:44 JST
X-Mailer: ELM [version 2.3 PL11]
Reply-To: FreeBSD-users-jp@jp.freebsd.org
X-Distribute: distribute [version 2.1 (Alpha) patchlevel=19]
X-Sequence: FreeBSD-users-jp 666
Subject: [FreeBSD-users-jp 666] a benchmark of the non-blocked version of matrix inner-product.
Errors-To: owner-FreeBSD-users-jp@jp.freebsd.org
Sender: owner-FreeBSD-users-jp@jp.freebsd.org


$@$3$l$O(B, 
	$@@5J}9TNs(B A $@$H(B B $@$NE>CV(BB'$@$N@Q(B A*B' [$@9TNs$NFb@Q(B] $@$r(B C $@$K:n$k(B
$@$H$$$&7W;;$r(B,$@%V%m%C%/2=>h;;5;K!$r;H$o$J$$IaDL$NJ}K!$G9T$&%Y%s%A%^!<%/$G$9(B.

$@9TNs$N%5%$%:$O(B20$@$NG\?t$G$"$k$H8BDj$7$F%=!<%9%3!<%I>e$G(B
$@%"%s%m!<%k$7$F$$$^$9(B. $@A4BN$N9TNs$N%5%$%:$r(B 20$@$NG\?t$G(B
20,40,...,500 $@$HJQ$($F7W;;;~4V$r7WB,$7$F$$$^$9(B.

$@7WB,Nc$G$O(B, $@9TNs$N%5%$%:(B,$@7WB,$N@:EY$rJ]$D0Y$N%k!<%W2s?t(B,
$@>h;;$K$+$+$C$?<B:]$N(Bcpu$@;~4V(B, A*B'$@$N7W;;$,(B 2N^3 flops $@$G$"$k$HDj5A$7$F(B
$@7WB,;~4VCf$K9T$C$?IbF0>.?tE@1i;;2s?t(B, MFLOPS $@CM$rI=<($7$F$$$^$9(B.
$@9TNs$N%5%$%:$,Bg$-$/$J$k$H%a%b%j!<$X$N%"%/%;%9$,A}$($F(B,
$@@-G=$,<!Bh$KNt2=$7$^$9(B. $@C1@:EY$K$h$k7W;;$OG\@:EY$KHf$Y$F(B
$@%a%b%j!<%G!<%?$NE>AwNL$,H>J,$J$N$G(B,$@@-G=$NNt2=$NDxEY$,H>J,DxEY$G$9(B.

$@K\%Y%s%A%^!<%/$O%a%b%j!<$N%"%/%;%9%Q%?!<%s$,3d$H2:$d$+$G$O$"$j$^$9$,(B,
$@%V%m%C%/2=$N%P!<%8%g%s$H$O0[$J$j(B,$@30It%a%b%j!<$X$N%G!<%?%"%/%;%9$,$"$k(B
$@DxEY$O$"$k$N$G(B, CPU$@%A%C%W<+?H$N@-G=$,F1$8$G$"$C$F$b(B,$@<~JU$N%A%C%W%;%C%H$d(B
$@Fs<!%-%c%C%7%e(B,$@<g5-21%a%b%j!<%7%9%F%`$N:n$j$K$h$C$F@-G=$K0c$$$,$G$k$G$7$g$&(B.

$@<j85$K$O(B, Pentium 100MHz $@$H(B $@K?=j$K(B Pentium Pro-150MHz $@$,$"$k$@$1$J$N$G(B,
$@B>$N(BCPU$@$d%/%m%C%/$N0[$J$k(BPentium$@Ey$G$N@-G=$K6=L#$,$"$j$^$9(B.


==================== P5-100 $@$G$N7WB,Nc(B ===================================

------------- $@G\@:EY1i;;Nc(B ------------------------

p5-100% a_dbl.out
N=20,  loop=25000, cpu=8.27,  flop=4.0e+08, Mflops=48.4
N=40,  loop=3125,  cpu=15.55, flop=4.0e+08, Mflops=25.7
N=60,  loop=925,   cpu=15.45, flop=4.0e+08, Mflops=25.9
N=80,  loop=390,   cpu=16.70, flop=4.0e+08, Mflops=23.9
N=100, loop=200,   cpu=17.95, flop=4.0e+08, Mflops=22.3
N=120, loop=115,   cpu=18.31, flop=4.0e+08, Mflops=21.7
N=140, loop=72,    cpu=18.44, flop=4.0e+08, Mflops=21.4
N=160, loop=48,    cpu=19.45, flop=3.9e+08, Mflops=20.2
N=180, loop=34,    cpu=20.10, flop=4.0e+08, Mflops=19.7
N=200, loop=25,    cpu=20.75, flop=4.0e+08, Mflops=19.3
N=220, loop=18,    cpu=21.46, flop=3.8e+08, Mflops=17.9
N=240, loop=14,    cpu=21.93, flop=3.9e+08, Mflops=17.7
N=260, loop=11,    cpu=22.29, flop=3.9e+08, Mflops=17.3
N=280, loop=9,     cpu=22.59, flop=4.0e+08, Mflops=17.5
N=300, loop=7,     cpu=22.09, flop=3.8e+08, Mflops=17.1
N=320, loop=6,     cpu=23.27, flop=3.9e+08, Mflops=16.9
N=340, loop=5,     cpu=23.06, flop=3.9e+08, Mflops=17.0
N=360, loop=4,     cpu=22.14, flop=3.7e+08, Mflops=16.9
N=380, loop=3,     cpu=19.58, flop=3.3e+08, Mflops=16.8
N=400, loop=3,     cpu=22.89, flop=3.8e+08, Mflops=16.8
N=420, loop=2,     cpu=17.66, flop=3.0e+08, Mflops=16.8
N=440, loop=2,     cpu=20.29, flop=3.4e+08, Mflops=16.8
N=460, loop=2,     cpu=23.20, flop=3.9e+08, Mflops=16.8
N=480, loop=1,     cpu=13.22, flop=2.2e+08, Mflops=16.7
N=500, loop=1,     cpu=14.89, flop=2.5e+08, Mflops=16.8

------------- $@C1@:EY1i;;Nc(B ------------------------

p5-100% a_flt.out
N=20,  loop=25000, cpu=7.91,  flop=4.0e+08, Mflops=50.5
N=40,  loop=3125,  cpu=8.04,  flop=4.0e+08, Mflops=49.8
N=60,  loop=925,   cpu=11.22, flop=4.0e+08, Mflops=35.6
N=80,  loop=390,   cpu=11.03, flop=4.0e+08, Mflops=36.2
N=100, loop=200,   cpu=10.94, flop=4.0e+08, Mflops=36.6
N=120, loop=115,   cpu=13.22, flop=4.0e+08, Mflops=30.1
N=140, loop=72,    cpu=11.55, flop=4.0e+08, Mflops=34.2
N=160, loop=48,    cpu=12.45, flop=3.9e+08, Mflops=31.6
N=180, loop=34,    cpu=12.70, flop=4.0e+08, Mflops=31.2
N=200, loop=25,    cpu=12.96, flop=4.0e+08, Mflops=30.9
N=220, loop=18,    cpu=12.72, flop=3.8e+08, Mflops=30.1
N=240, loop=14,    cpu=13.53, flop=3.9e+08, Mflops=28.6
N=260, loop=11,    cpu=13.48, flop=3.9e+08, Mflops=28.7
N=280, loop=9,     cpu=13.95, flop=4.0e+08, Mflops=28.3
N=300, loop=7,     cpu=13.40, flop=3.8e+08, Mflops=28.2
N=320, loop=6,     cpu=14.09, flop=3.9e+08, Mflops=27.9
N=340, loop=5,     cpu=13.93, flop=3.9e+08, Mflops=28.2
N=360, loop=4,     cpu=13.52, flop=3.7e+08, Mflops=27.6
N=380, loop=3,     cpu=11.96, flop=3.3e+08, Mflops=27.5
N=400, loop=3,     cpu=14.12, flop=3.8e+08, Mflops=27.2
N=420, loop=2,     cpu=11.12, flop=3.0e+08, Mflops=26.7
N=440, loop=2,     cpu=12.81, flop=3.4e+08, Mflops=26.6
N=460, loop=2,     cpu=14.64, flop=3.9e+08, Mflops=26.6
N=480, loop=1,     cpu=8.36,  flop=2.2e+08, Mflops=26.5
N=500, loop=1,     cpu=9.44,  flop=2.5e+08, Mflops=26.5

======================= P6-150 ===================================

------------- $@G\@:EY1i;;Nc(B ------------------------

p6-150% a_dbl.out
N=20,  loop=25000, cpu=4.88,  flop=4.0e+08, Mflops=82.1
N=40,  loop=3125,  cpu=5.34,  flop=4.0e+08, Mflops=75.0
N=60,  loop=925,   cpu=5.14,  flop=4.0e+08, Mflops=77.7
N=80,  loop=390,   cpu=5.12,  flop=4.0e+08, Mflops=77.9
N=100, loop=200,   cpu=5.19,  flop=4.0e+08, Mflops=77.1
N=120, loop=115,   cpu=5.21,  flop=4.0e+08, Mflops=76.3
N=140, loop=72,    cpu=5.55,  flop=4.0e+08, Mflops=71.1
N=160, loop=48,    cpu=9.35,  flop=3.9e+08, Mflops=42.0
N=180, loop=34,    cpu=11.27, flop=4.0e+08, Mflops=35.2
N=200, loop=25,    cpu=11.80, flop=4.0e+08, Mflops=33.9
N=220, loop=18,    cpu=12.93, flop=3.8e+08, Mflops=29.6
N=240, loop=14,    cpu=13.47, flop=3.9e+08, Mflops=28.7
N=260, loop=11,    cpu=13.30, flop=3.9e+08, Mflops=29.1
N=280, loop=9,     cpu=14.11, flop=4.0e+08, Mflops=28.0
N=300, loop=7,     cpu=13.45, flop=3.8e+08, Mflops=28.1
N=320, loop=6,     cpu=14.17, flop=3.9e+08, Mflops=27.7
N=340, loop=5,     cpu=14.24, flop=3.9e+08, Mflops=27.6
N=360, loop=4,     cpu=13.42, flop=3.7e+08, Mflops=27.8
N=380, loop=3,     cpu=11.85, flop=3.3e+08, Mflops=27.8
N=400, loop=3,     cpu=13.90, flop=3.8e+08, Mflops=27.6
N=420, loop=2,     cpu=10.62, flop=3.0e+08, Mflops=27.9
N=440, loop=2,     cpu=12.19, flop=3.4e+08, Mflops=28.0
N=460, loop=2,     cpu=14.04, flop=3.9e+08, Mflops=27.7
N=480, loop=1,     cpu=7.95,  flop=2.2e+08, Mflops=27.8
N=500, loop=1,     cpu=8.87,  flop=2.5e+08, Mflops=28.2

------------- $@C1@:EY1i;;Nc(B ------------------------

p6-150% a_flt.out
N=20,  loop=25000, cpu=4.85, flop=4.0e+08, Mflops=82.4
N=40,  loop=3125,  cpu=4.73, flop=4.0e+08, Mflops=84.6
N=60,  loop=925,   cpu=4.88, flop=4.0e+08, Mflops=81.8
N=80,  loop=390,   cpu=4.84, flop=4.0e+08, Mflops=82.4
N=100, loop=200,   cpu=4.81, flop=4.0e+08, Mflops=83.1
N=120, loop=115,   cpu=4.78, flop=4.0e+08, Mflops=83.1
N=140, loop=72,    cpu=5.01, flop=4.0e+08, Mflops=78.9
N=160, loop=48,    cpu=4.99, flop=3.9e+08, Mflops=78.8
N=180, loop=34,    cpu=4.99, flop=4.0e+08, Mflops=79.4
N=200, loop=25,    cpu=5.48, flop=4.0e+08, Mflops=73.0
N=220, loop=18,    cpu=5.61, flop=3.8e+08, Mflops=68.3
N=240, loop=14,    cpu=5.94, flop=3.9e+08, Mflops=65.2
N=260, loop=11,    cpu=6.76, flop=3.9e+08, Mflops=57.2
N=280, loop=9,     cpu=7.06, flop=4.0e+08, Mflops=55.9
N=300, loop=7,     cpu=7.70, flop=3.8e+08, Mflops=49.1
N=320, loop=6,     cpu=7.98, flop=3.9e+08, Mflops=49.2
N=340, loop=5,     cpu=7.94, flop=3.9e+08, Mflops=49.5
N=360, loop=4,     cpu=7.68, flop=3.7e+08, Mflops=48.6
N=380, loop=3,     cpu=6.77, flop=3.3e+08, Mflops=48.6
N=400, loop=3,     cpu=7.95, flop=3.8e+08, Mflops=48.3
N=420, loop=2,     cpu=6.19, flop=3.0e+08, Mflops=47.9
N=440, loop=2,     cpu=7.00, flop=3.4e+08, Mflops=48.7
N=460, loop=2,     cpu=7.93, flop=3.9e+08, Mflops=49.1
N=480, loop=1,     cpu=4.54, flop=2.2e+08, Mflops=48.7
N=500, loop=1,     cpu=5.18, flop=2.5e+08, Mflops=48.3


==================== 実際のソースコード ====================

/*
 * Declarations for the library routines this program calls:
 * printf (stdio.h), malloc/free/exit (stdlib.h) and sleep (unistd.h).
 * Without them the calls are implicitly declared as returning int, which
 * truncates the pointer returned by malloc on 64-bit targets.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* RCS-expanded revision date, printed once at program start. */
const char *Version = "$Date: 1996/01/03 20:11:55 $";

/*
 * REAL is the floating-point element type of the matrices.
 * Select it on the compiler command line with -DREAL=double or
 * -DREAL=float; double is used when nothing is specified.
 */
#ifndef REAL
#define REAL double
#endif

/*
 * The three matrices live in one heap allocation made by main():
 *     REAL a[N*Ma], b[N*Mb], c[N*Mc];
 * Element (i,j) of a is a[i*Ma+j], and likewise for b and c, so
 * Ma, Mb and Mc are the row strides (in elements) and N is the number
 * of rows and columns actually used.
 */
REAL *a, *b, *c;
int Ma, Mb, Mc;
int N;
/*------------------------------------------------------------------*/

/*
 * Benchmark driver.
 *
 * For every matrix size N = 20, 40, ..., 500 it allocates the three N x N
 * matrices in one contiguous block, fills a with 1.0, b with 2.0 and c with
 * 3.0, and then calls matinner_noblk() `loop` times, where `loop` is chosen
 * so that each size performs roughly the same number of floating-point
 * operations (about 4e8, counting 2*N^3 per call).  One result line is
 * printed per size: N, loop count, elapsed CPU seconds, flop count, Mflops.
 *
 * Exits with status 0 on success and 1 if the matrices cannot be allocated.
 */
int main()
{
    int i, j, r;
    float second(), t1, t2, flop;
    int loop;
    REAL *base;

    printf("?-bench: Version: %s\n", Version);

    for (N = 20; N <= 500; N += 20) {
        /* Row strides equal the logical size: the matrices are packed. */
        Ma = N;
        Mb = N;
        Mc = N;

        base = (REAL *)malloc(sizeof(REAL) * (N * Ma + N * Mb + N * Mc));
        if (!base) {
            /* Without this check the initialisation loop below would
               write through a null pointer. */
            printf("?-bench: cannot allocate matrices for N=%d\n", N);
            exit(1);
        }
        sleep(1);

        /* Carve the single allocation into a, b and c, back to back. */
        a = base;
        b = a + N * Ma;
        c = b + N * Mb;

        /* Repeat count giving about 2*N^3*loop = 4e8 flops; at least once. */
        loop = 200000000 / (N * N * N);
        if (loop == 0) {
            loop = 1;
        }

        /* The alignment warning is only requested when REAL is double-sized. */
        print3addr((sizeof(REAL) == sizeof(double)), a, b, c);

        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                a[i * Ma + j] = 1.0;
                b[i * Mb + j] = 2.0;
                c[i * Mc + j] = 3.0;
            }
        }

        t1 = second();
        for (r = 0; r < loop; r++) {
            matinner_noblk(N, a, Ma, b, Mb, c, Mc);
        }
        t2 = second();
        flop = 2.0 * N * N * N * loop;

        printf("N=%d", N);
        printf(", loop=%d", loop);
        printf(", cpu=%.2f", t2 - t1);
        printf(", flop=%.1e", flop);
        printf(", Mflops=%.1f", (1.0e-6) * flop / (t2 - t1));
        printf("\n");

        free(base);
    }
    exit(0);
}


/*------------------------------------------------------------------*/

/*
 * Accumulate the product of a with the transpose of b into c, without
 * cache blocking:
 *
 *     c[i*mc + j] += sum over k in [0,n) of a[i*ma + k] * b[j*mb + k]
 *
 * for all 0 <= i,j < n.  Row i of a and row j of b are walked with unit
 * stride, so each output element is the inner product of two rows.
 *
 * The inner product is unrolled by hand in groups of 20.  The first group
 * is executed unconditionally and the k loop advances in steps of 20, so
 * n must be a positive multiple of 20; any other value makes the routine
 * read past the n-th element of a row.
 *
 * Each product is stored in r0 or r1 alternately and added to s one
 * statement later, so the products are summed in increasing k order,
 * starting from the previous value of c[i*mc + j].
 *
 * The function is declared without a return type and returns no value.
 */
matinner_noblk(n,a,ma,b,mb,c,mc)
int n; /* logical dimension of matrices. */
REAL *a;
int ma; /* Adjustable dimension of a. */
REAL *b;
int mb; /* Adjustable dimension of b. */
REAL *c;
int mc; /* Adjustable dimension of c. */
{
int i,j,k,k0; /* k0 is declared but never used. */
REAL *ai,*bj,*ci; /* start of row i of a, row j of b, row i of c */
REAL r0,r1,s; /* r0/r1: pending products, s: running sum */

        for(i=0; i<n; i++) {
           ai=a+i*ma;
           ci=c+i*mc;
           for(j=0,bj=b; j<n; j++,bj+=mb) {
              /* First 20 terms; s starts from the existing c element. */
              r0=ai[ 0]*bj[ 0]; s=ci[j];
              r1=ai[ 1]*bj[ 1]; s+=r0;
              r0=ai[ 2]*bj[ 2]; s+=r1;
              r1=ai[ 3]*bj[ 3]; s+=r0;
              r0=ai[ 4]*bj[ 4]; s+=r1;
              r1=ai[ 5]*bj[ 5]; s+=r0;
              r0=ai[ 6]*bj[ 6]; s+=r1;
              r1=ai[ 7]*bj[ 7]; s+=r0;
              r0=ai[ 8]*bj[ 8]; s+=r1;
              r1=ai[ 9]*bj[ 9]; s+=r0;
              r0=ai[10]*bj[10]; s+=r1;
              r1=ai[11]*bj[11]; s+=r0;
              r0=ai[12]*bj[12]; s+=r1;
              r1=ai[13]*bj[13]; s+=r0;
              r0=ai[14]*bj[14]; s+=r1;
              r1=ai[15]*bj[15]; s+=r0;
              r0=ai[16]*bj[16]; s+=r1;
              r1=ai[17]*bj[17]; s+=r0;
              r0=ai[18]*bj[18]; s+=r1;
              r1=ai[19]*bj[19]; s+=r0;
              /* Remaining groups of 20; the first statement of each group
                 adds the product left pending in r1 by the previous group. */
              for(k=20;k<n; k+=20) {
                        r0=ai[k+ 0]*bj[k+ 0]; s+=r1;
                        r1=ai[k+ 1]*bj[k+ 1]; s+=r0;
                        r0=ai[k+ 2]*bj[k+ 2]; s+=r1;
                        r1=ai[k+ 3]*bj[k+ 3]; s+=r0;
                        r0=ai[k+ 4]*bj[k+ 4]; s+=r1;
                        r1=ai[k+ 5]*bj[k+ 5]; s+=r0;
                        r0=ai[k+ 6]*bj[k+ 6]; s+=r1;
                        r1=ai[k+ 7]*bj[k+ 7]; s+=r0;
                        r0=ai[k+ 8]*bj[k+ 8]; s+=r1;
                        r1=ai[k+ 9]*bj[k+ 9]; s+=r0;
                        r0=ai[k+10]*bj[k+10]; s+=r1;
                        r1=ai[k+11]*bj[k+11]; s+=r0;
                        r0=ai[k+12]*bj[k+12]; s+=r1;
                        r1=ai[k+13]*bj[k+13]; s+=r0;
                        r0=ai[k+14]*bj[k+14]; s+=r1;
                        r1=ai[k+15]*bj[k+15]; s+=r0;
                        r0=ai[k+16]*bj[k+16]; s+=r1;
                        r1=ai[k+17]*bj[k+17]; s+=r0;
                        r0=ai[k+18]*bj[k+18]; s+=r1;
                        r1=ai[k+19]*bj[k+19]; s+=r0;
              }
              /* Add the last pending product and store the result. */
              ci[j]=r1+s;
           }
        }
}


/*------------------------------------------------------------------*/
#include <time.h>

/*
 * Return the processor time consumed by the program so far, in seconds,
 * as reported by clock().
 *
 * The division is carried out in double so that the tick count is not
 * rounded to float precision before being scaled; only the final result
 * is narrowed to the float that callers expect.  If clock() reports
 * failure by returning (clock_t)-1, that value is scaled and returned
 * like any other.
 */
float second(void)
{
        return (float)((double)clock() / (double)CLOCKS_PER_SEC);
}
/*
 * Print the addresses of the three matrices in octal and, when `check`
 * is non-zero, warn about every address that is not a multiple of 8.
 *
 *   check    non-zero to enable the alignment warnings
 *   a, b, c  the addresses to report
 *
 * Addresses are converted to unsigned long rather than unsigned/int so
 * that they are not cut down to 32 bits on targets where pointers are
 * wider than int.  Always returns 0.
 */
int print3addr(int check, void *a, void *b, void *c)
{
        unsigned long addr_a = (unsigned long)a;
        unsigned long addr_b = (unsigned long)b;
        unsigned long addr_c = (unsigned long)c;

        printf("Octal-address: a:%8lo, b:%8lo, c:%8lo\n",
                addr_a, addr_b, addr_c);
        if (check) {
                if (addr_a % 8 != 0) printf("Warning! a not aligned.\n");
                if (addr_b % 8 != 0) printf("Warning! b not aligned.\n");
                if (addr_c % 8 != 0) printf("Warning! c not aligned.\n");
        }
        return 0;
}

====================== $@%3%s%Q%$%k$N>r7o(B ====================

% gcc-i2.6.3 -O3 -mpentium -DREAL=double a.c -o a_dbl.out # 倍精度計算バイナリ

% gcc-i2.6.3 -O3 -mpentium -DREAL=float  a.c -o a_flt.out # 単精度計算バイナリ

$@$3$l$O(B, -funroll-loops $@$r$D$1$kI,MW$O$"$j$^$;$s(B.
-mpentium $@$rIU$1$i$l$J$$%3%s%Q%$%i$G$O(B $@>J$/$+(B -m486 $@$rIU$1$F$/$@$5$$(B.
