virtualx-engine/drivers/builtin_openssl2/crypto/sha/asm/sha1-s390x.pl

#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for s390x.

# April 2007.
#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized. In
# which case performance can reach further >4.5x for larger chunks.

# January 2009.
#
# Optimize Xupdate for amount of memory references and reschedule
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific.

$kimdfunc=1;	# magic function code for kimd instruction

$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2";	$prefetch="%r2";
$inp="%r3";
$len="%r4";

$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";	@V=($A,$B,$C,$D,$E);
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";

$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*4;

sub Xupdate {
my $i=shift;

$code.=<<___ if ($i==15);
	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
	lr	$X[0],$X[2]
___
return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
$code.=<<___ if ($i<16);
	lg	$X[0],`$i*4`($inp)	### Xload($i)
	rllg	$X[1],$X[0],32
___
$code.=<<___ if ($i>=16);
	xgr	$X[0],$prefetch		### Xupdate($i)
	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
	xgr	$X[0],$prefetch
	rll	$X[0],$X[0],1
	rllg	$X[1],$X[0],32
	rll	$X[1],$X[1],1
	rllg	$X[0],$X[1],32
	lr	$X[2],$X[1]		# feedback
___
$code.=<<___ if ($i<=70);
	stg	$X[0],`$stdframe+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$d
	xr	$t0,$c
	alr	$e,$t1
	nr	$t0,$b
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	xr	$t0,$c
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	or	$t0,$c
	lr	$t1,$b
	nr	$t0,$d
	nr	$t1,$c
	alr	$e,$xi
	or	$t0,$t1
	rll	$b,$b,30
	alr	$e,$t0
___
}

$code.=<<___;
.text
.align	64
.type	Ktable,\@object
Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
.size	Ktable,.-Ktable
.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function
sha1_block_data_order:
___
$code.=<<___ if ($kimdfunc);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lsoftware
	lghi	%r0,0
	la	%r1,`2*$SIZE_T`($sp)
	.long	0xb93e0002	# kimd %r0,%r2
	lg	%r0,`2*$SIZE_T`($sp)
	tmhh	%r0,`0x8000>>$kimdfunc`
	jz	.Lsoftware
	lghi	%r0,$kimdfunc
	lgr	%r1,$ctx
	lgr	%r2,$inp
	sllg	%r3,$len,6
	.long	0xb93e0002	# kimd %r0,%r2
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	16
.Lsoftware:
___
$code.=<<___;
	lghi	%r1,-$frame
	st${g}	$ctx,`2*$SIZE_T`($sp)
	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)

	larl	$t0,Ktable
	llgf	$A,0($ctx)
	llgf	$B,4($ctx)
	llgf	$C,8($ctx)
	llgf	$D,12($ctx)
	llgf	$E,16($ctx)

	lg	$K_00_39,0($t0)
	lg	$K_40_79,8($t0)

.Lloop:
	rllg	$K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_00_39,$K_00_39,32
___
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	$K=$K_40_79;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;

	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
	la	$inp,64($inp)
	al	$A,0($ctx)
	al	$B,4($ctx)
	al	$C,8($ctx)
	al	$D,12($ctx)
	al	$E,16($ctx)
	st	$A,0($ctx)
	st	$B,4($ctx)
	st	$C,8($ctx)
	st	$D,12($ctx)
	st	$E,16($ctx)
	brct${g} $len,.Lloop

	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
	br	%r14
.size	sha1_block_data_order,.-sha1_block_data_order
.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,16,8
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT;
-Added OpenSSL and HTTPS support -Built-in version of the library for Windows, Android and iOS (other OSs use system one) -Small fixes all around 2014-04-29 02:56:43 +02:00			`#!/usr/bin/env perl`

			`# ====================================================================`
			`# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL`
			`# project. The module is, however, dual licensed under OpenSSL and`
			`# CRYPTOGAMS licenses depending on where you obtain it. For further`
			`# details see http://www.openssl.org/~appro/cryptogams/.`
			`# ====================================================================`

			`# SHA1 block procedure for s390x.`

			`# April 2007.`
			`#`
			`# Performance is >30% better than gcc 3.3 generated code. But the real`
			`# twist is that SHA1 hardware support is detected and utilized. In`
			`# which case performance can reach further >4.5x for larger chunks.`

			`# January 2009.`
			`#`
			`# Optimize Xupdate for amount of memory references and reschedule`
			`# instructions to favour dual-issue z10 pipeline. On z10 hardware is`
			`# "only" ~2.3x faster than software.`

			`# November 2010.`
			`#`
			`# Adapt for -m31 build. If kernel supports what's called "highgprs"`
			`# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit`
			`# instructions and achieve "64-bit" performance even in 31-bit legacy`
			`# application context. The feature is not specific to any particular`
			`# processor, as long as it's "z-CPU". Latter implies that the code`
			`# remains z/Architecture specific.`

			`$kimdfunc=1; # magic function code for kimd instruction`

			`$flavour = shift;`

			`if ($flavour =~ /3[12]/) {`
			`$SIZE_T=4;`
			`$g="";`
			`} else {`
			`$SIZE_T=8;`
			`$g="g";`
			`}`

			`while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}`
			`open STDOUT,">$output";`

			`$K_00_39="%r0"; $K=$K_00_39;`
			`$K_40_79="%r1";`
			`$ctx="%r2"; $prefetch="%r2";`
			`$inp="%r3";`
			`$len="%r4";`

			`$A="%r5";`
			`$B="%r6";`
			`$C="%r7";`
			`$D="%r8";`
			`$E="%r9"; @V=($A,$B,$C,$D,$E);`
			`$t0="%r10";`
			`$t1="%r11";`
			`@X=("%r12","%r13","%r14");`
			`$sp="%r15";`

			`$stdframe=16$SIZE_T+48;`
			`$frame=$stdframe+16*4;`

			`sub Xupdate {`
			`my $i=shift;`

			`$code.=<<___ if ($i==15);`
			`lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up`
			`lr $X[0],$X[2]`
			`___`
			`return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle`
			`$code.=<<___ if ($i<16);`
			lg $X[0],`$i*4`($inp) ### Xload($i)
			`rllg $X[1],$X[0],32`
			`___`
			`$code.=<<___ if ($i>=16);`
			`xgr $X[0],$prefetch ### Xupdate($i)`
			lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
			xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
			`xgr $X[0],$prefetch`
			`rll $X[0],$X[0],1`
			`rllg $X[1],$X[0],32`
			`rll $X[1],$X[1],1`
			`rllg $X[0],$X[1],32`
			`lr $X[2],$X[1] # feedback`
			`___`
			`$code.=<<___ if ($i<=70);`
			stg $X[0],`$stdframe+4*($i%16)`($sp)
			`___`
			`unshift(@X,pop(@X));`
			`}`

			`sub BODY_00_19 {`
			`my ($i,$a,$b,$c,$d,$e)=@_;`
			`my $xi=$X[1];`

			`&Xupdate($i);`
			`$code.=<<___;`
			`alr $e,$K ### $i`
			`rll $t1,$a,5`
			`lr $t0,$d`
			`xr $t0,$c`
			`alr $e,$t1`
			`nr $t0,$b`
			`alr $e,$xi`
			`xr $t0,$d`
			`rll $b,$b,30`
			`alr $e,$t0`
			`___`
			`}`

			`sub BODY_20_39 {`
			`my ($i,$a,$b,$c,$d,$e)=@_;`
			`my $xi=$X[1];`

			`&Xupdate($i);`
			`$code.=<<___;`
			`alr $e,$K ### $i`
			`rll $t1,$a,5`
			`lr $t0,$b`
			`alr $e,$t1`
			`xr $t0,$c`
			`alr $e,$xi`
			`xr $t0,$d`
			`rll $b,$b,30`
			`alr $e,$t0`
			`___`
			`}`

			`sub BODY_40_59 {`
			`my ($i,$a,$b,$c,$d,$e)=@_;`
			`my $xi=$X[1];`

			`&Xupdate($i);`
			`$code.=<<___;`
			`alr $e,$K ### $i`
			`rll $t1,$a,5`
			`lr $t0,$b`
			`alr $e,$t1`
			`or $t0,$c`
			`lr $t1,$b`
			`nr $t0,$d`
			`nr $t1,$c`
			`alr $e,$xi`
			`or $t0,$t1`
			`rll $b,$b,30`
			`alr $e,$t0`
			`___`
			`}`

			`$code.=<<___;`
			`.text`
			`.align 64`
			`.type Ktable,\@object`
			`Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6`
			`.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0`
			`.size Ktable,.-Ktable`
			`.globl sha1_block_data_order`
			`.type sha1_block_data_order,\@function`
			`sha1_block_data_order:`
			`___`
			`$code.=<<___ if ($kimdfunc);`
			`larl %r1,OPENSSL_s390xcap_P`
			`lg %r0,0(%r1)`
			`tmhl %r0,0x4000 # check for message-security assist`
			`jz .Lsoftware`
			`lghi %r0,0`
			la %r1,`2*$SIZE_T`($sp)
			`.long 0xb93e0002 # kimd %r0,%r2`
			lg %r0,`2*$SIZE_T`($sp)
			tmhh %r0,`0x8000>>$kimdfunc`
			`jz .Lsoftware`
			`lghi %r0,$kimdfunc`
			`lgr %r1,$ctx`
			`lgr %r2,$inp`
			`sllg %r3,$len,6`
			`.long 0xb93e0002 # kimd %r0,%r2`
			`brc 1,.-4 # pay attention to "partial completion"`
			`br %r14`
			`.align 16`
			`.Lsoftware:`
			`___`
			`$code.=<<___;`
			`lghi %r1,-$frame`
			st${g} $ctx,`2*$SIZE_T`($sp)
			stm${g} %r6,%r15,`6*$SIZE_T`($sp)
			`lgr %r0,$sp`
			`la $sp,0(%r1,$sp)`
			`st${g} %r0,0($sp)`

			`larl $t0,Ktable`
			`llgf $A,0($ctx)`
			`llgf $B,4($ctx)`
			`llgf $C,8($ctx)`
			`llgf $D,12($ctx)`
			`llgf $E,16($ctx)`

			`lg $K_00_39,0($t0)`
			`lg $K_40_79,8($t0)`

			`.Lloop:`
			`rllg $K_00_39,$K_00_39,32`
			`___`
			`for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }`
			`$code.=<<___;`
			`rllg $K_00_39,$K_00_39,32`
			`___`
			`for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }`
			`$code.=<<___; $K=$K_40_79;`
			`rllg $K_40_79,$K_40_79,32`
			`___`
			`for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }`
			`$code.=<<___;`
			`rllg $K_40_79,$K_40_79,32`
			`___`
			`for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }`
			`$code.=<<___;`

			l${g} $ctx,`$frame+2*$SIZE_T`($sp)
			`la $inp,64($inp)`
			`al $A,0($ctx)`
			`al $B,4($ctx)`
			`al $C,8($ctx)`
			`al $D,12($ctx)`
			`al $E,16($ctx)`
			`st $A,0($ctx)`
			`st $B,4($ctx)`
			`st $C,8($ctx)`
			`st $D,12($ctx)`
			`st $E,16($ctx)`
			`brct${g} $len,.Lloop`

			lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
			`br %r14`
			`.size sha1_block_data_order,.-sha1_block_data_order`
			`.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"`
			`.comm OPENSSL_s390xcap_P,16,8`
			`___`

			$code =~ s/\`([^\`]*)\`/eval $1/gem;

			`print $code;`
			`close STDOUT;`