Search in sources :

Example 31 with AMD64MacroAssembler

use of org.graalvm.compiler.asm.amd64.AMD64MacroAssembler in project graal by oracle.

The class AMD64MathIntrinsicBinaryOp, method powIntrinsic.

/**
 * Emits, through {@code masm}, the AMD64 instruction sequence for the {@code pow} intrinsic.
 *
 * <p>{@code value1} is copied into {@code temp10} and (when it is a different register) into
 * {@code dest}; {@code value2} is copied into {@code temp8}. Neither {@code temp10} nor
 * {@code temp8} is written again, so both inputs stay available to the special-case blocks. The
 * emitted code is organised as labelled basic blocks ({@code bb0} .. {@code bb55}); the exit
 * paths jump to {@code bb56}, which is bound as the very last step.
 *
 * <p>NOTE(review): presumably {@code value1} is the base, {@code value2} the exponent and the
 * result is left in {@code dest} -- confirm against the caller. The short hexadecimal comments
 * inside the body appear to be fragments of the constant-table contents being loaded; they have
 * not been verified here.
 *
 * <p>The order of the emitted instructions is significant; do not reorder calls in this method.
 *
 * @param dest register that receives a copy of {@code value1} first and is used as a working
 *            register afterwards
 * @param value1 first operand register
 * @param value2 second operand register
 * @param crb compilation result builder, registered with {@code setCrb} before any instruction
 *            is emitted
 * @param masm assembler that receives all emitted instructions
 */
public void powIntrinsic(Register dest, Register value1, Register value2, CompilationResultBuilder crb, AMD64MacroAssembler masm) {
    // Pointers to the constant tables referenced by the emitted code (the second argument is
    // presumably the required alignment in bytes -- confirm).
    ArrayDataPointerConstant highSigMaskPtr = new ArrayDataPointerConstant(highSigMask, 16);
    ArrayDataPointerConstant logTwoEPtr = new ArrayDataPointerConstant(logTwoE, 16);
    ArrayDataPointerConstant highmaskYPtr = new ArrayDataPointerConstant(highmaskY, 16);
    ArrayDataPointerConstant tExpPtr = new ArrayDataPointerConstant(tExp, 16);
    ArrayDataPointerConstant eCoeffPtr = new ArrayDataPointerConstant(eCoeff, 16);
    ArrayDataPointerConstant coeffHPtr = new ArrayDataPointerConstant(coeffH, 16);
    ArrayDataPointerConstant highmaskLogXPtr = new ArrayDataPointerConstant(highmaskLogX, 16);
    ArrayDataPointerConstant halfmaskPtr = new ArrayDataPointerConstant(halfmask, 8);
    ArrayDataPointerConstant coeffPowPtr = new ArrayDataPointerConstant(coeffPow, 16);
    ArrayDataPointerConstant lTblPowPtr = new ArrayDataPointerConstant(lTblPow, 16);
    ArrayDataPointerConstant logTwoPowPtr = new ArrayDataPointerConstant(logTwoPow, 8);
    // Branch targets for the basic blocks of the emitted code.
    Label bb0 = new Label();
    Label bb1 = new Label();
    Label bb2 = new Label();
    Label bb3 = new Label();
    Label bb4 = new Label();
    Label bb5 = new Label();
    Label bb6 = new Label();
    Label bb7 = new Label();
    Label bb8 = new Label();
    Label bb9 = new Label();
    Label bb10 = new Label();
    Label bb11 = new Label();
    Label bb12 = new Label();
    Label bb13 = new Label();
    Label bb14 = new Label();
    Label bb15 = new Label();
    Label bb16 = new Label();
    Label bb18 = new Label();
    Label bb19 = new Label();
    Label bb20 = new Label();
    Label bb21 = new Label();
    Label bb22 = new Label();
    Label bb23 = new Label();
    Label bb24 = new Label();
    Label bb25 = new Label();
    Label bb26 = new Label();
    Label bb27 = new Label();
    Label bb28 = new Label();
    Label bb29 = new Label();
    Label bb30 = new Label();
    Label bb31 = new Label();
    Label bb32 = new Label();
    Label bb33 = new Label();
    Label bb34 = new Label();
    Label bb35 = new Label();
    Label bb36 = new Label();
    Label bb37 = new Label();
    Label bb38 = new Label();
    Label bb39 = new Label();
    Label bb40 = new Label();
    Label bb41 = new Label();
    Label bb42 = new Label();
    Label bb43 = new Label();
    Label bb44 = new Label();
    Label bb45 = new Label();
    Label bb46 = new Label();
    Label bb47 = new Label();
    Label bb48 = new Label();
    Label bb49 = new Label();
    Label bb50 = new Label();
    Label bb51 = new Label();
    Label bb53 = new Label();
    Label bb54 = new Label();
    Label bb55 = new Label();
    Label bb56 = new Label();
    // Scratch registers of kind QWORD. gpr3 is backed by rcxTemp (presumably RCX, which the
    // count-less shll(reg) calls below would read as the shift count -- confirm).
    Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD);
    Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD);
    Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD);
    Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD);
    Register gpr5 = asRegister(gpr5Temp, AMD64Kind.QWORD);
    Register gpr6 = asRegister(gpr6Temp, AMD64Kind.QWORD);
    Register gpr7 = asRegister(gpr7Temp, AMD64Kind.QWORD);
    Register gpr8 = asRegister(gpr8Temp, AMD64Kind.QWORD);
    // Scratch registers of kind DOUBLE (backed by the xmm*Temp values).
    Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE);
    Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE);
    Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE);
    Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE);
    Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE);
    Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE);
    Register temp7 = asRegister(xmm7Temp, AMD64Kind.DOUBLE);
    Register temp8 = asRegister(xmm8Temp, AMD64Kind.DOUBLE);
    Register temp9 = asRegister(xmm9Temp, AMD64Kind.DOUBLE);
    Register temp10 = asRegister(xmm10Temp, AMD64Kind.DOUBLE);
    // Register the builder before emitting anything; externalAddress(...) below presumably
    // relies on it -- confirm.
    setCrb(crb);
    // Keep read-only copies of both inputs: value1 in temp10, value2 in temp8.
    masm.movdqu(temp10, value1);
    masm.movsd(temp8, value2);
    if (dest.encoding != value1.encoding) {
        masm.movdqu(dest, value1);
    }
    // 0x00000000,
    masm.movq(temp9, externalAddress(logTwoEPtr));
    // 0x3ff72000
    masm.pextrw(gpr1, dest, 3);
    masm.xorpd(temp2, temp2);
    masm.movq(gpr2, 0x3ff0000000000000L);
    masm.movdq(temp2, gpr2);
    masm.movl(gpr5, 1069088768);
    masm.movdq(temp7, gpr5);
    masm.xorpd(temp1, temp1);
    masm.movq(gpr6, 0x77f0000000000000L);
    masm.movdq(temp1, gpr6);
    masm.movdqu(temp3, dest);
    masm.movl(gpr4, 32752);
    masm.andl(gpr4, gpr1);
    masm.subl(gpr4, 16368);
    masm.movl(gpr3, gpr4);
    masm.sarl(gpr4, 31);
    masm.addl(gpr3, gpr4);
    masm.xorl(gpr3, gpr4);
    masm.por(dest, temp2);
    // 0x00000000,
    masm.movdqu(temp6, externalAddress(highSigMaskPtr));
    // 0xfffff800,
    // 0x00000000,
    // 0xfffff800
    masm.psrlq(dest, 27);
    masm.psrld(dest, 2);
    masm.addl(gpr3, 16);
    masm.bsrl(gpr3, gpr3);
    masm.rcpps(dest, dest);
    masm.psllq(temp3, 12);
    masm.movl(gpr7, 8192);
    masm.movdq(temp4, gpr7);
    masm.psrlq(temp3, 12);
    masm.subl(gpr1, 16);
    masm.cmpl(gpr1, 32736);
    masm.jcc(ConditionFlag.AboveEqual, bb0);
    masm.movl(gpr5, 0);
    masm.bind(bb1);
    masm.mulss(dest, temp7);
    masm.movl(gpr4, -1);
    masm.subl(gpr3, 4);
    masm.shll(gpr4);
    masm.shlq(gpr4, 32);
    masm.movdq(temp5, gpr4);
    masm.por(temp3, temp1);
    masm.subl(gpr1, 16351);
    masm.cmpl(gpr1, 1);
    masm.jcc(ConditionFlag.BelowEqual, bb2);
    masm.paddd(dest, temp4);
    masm.pand(temp5, temp3);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);
    masm.bind(bb3);
    masm.subsd(temp3, temp5);
    masm.pand(dest, temp6);
    masm.subl(gpr1, 1);
    masm.sarl(gpr1, 4);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);
    masm.bind(bb4);
    masm.mulsd(temp3, dest);
    masm.leaq(gpr8, externalAddress(coeffPowPtr));
    // 0x6dc96112,
    masm.movdqu(temp1, new AMD64Address(gpr8, 0));
    // 0xbf836578,
    // 0xee241472,
    // 0xbf9b0301
    // 0x9f95985a,
    masm.movdqu(temp4, new AMD64Address(gpr8, 16));
    // 0xbfb528db,
    // 0xb3841d2a,
    // 0xbfd619b6
    // 0x518775e3,
    masm.movdqu(temp6, new AMD64Address(gpr8, 32));
    // 0x3f9004f2,
    // 0xac8349bb,
    // 0x3fa76c9b
    // 0x486ececc,
    masm.movdqu(dest, new AMD64Address(gpr8, 48));
    // 0x3fc4635e,
    // 0x161bb241,
    // 0xbf5dabe1
    masm.subsd(temp5, temp9);
    masm.movl(gpr3, gpr1);
    masm.sarl(gpr1, 31);
    masm.addl(gpr3, gpr1);
    masm.xorl(gpr1, gpr3);
    masm.addl(gpr1, 1);
    masm.bsrl(gpr1, gpr1);
    masm.unpcklpd(temp5, temp3);
    masm.addsd(temp3, temp5);
    masm.leaq(gpr7, externalAddress(lTblPowPtr));
    masm.andl(gpr4, 16760832);
    masm.shrl(gpr4, 10);
    masm.addpd(temp5, new AMD64Address(gpr7, gpr4, Scale.Times1, -3648));
    masm.pshufd(temp2, temp3, 0x44);
    masm.mulsd(temp3, temp3);
    masm.mulpd(temp1, temp2);
    masm.mulpd(temp4, temp2);
    masm.addsd(temp5, temp7);
    masm.mulsd(temp2, temp3);
    masm.addpd(temp6, temp1);
    masm.mulsd(temp3, temp3);
    masm.addpd(dest, temp4);
    masm.movdqu(temp1, temp8);
    masm.pextrw(gpr3, temp8, 3);
    masm.pshufd(temp7, temp5, 0xEE);
    // 0x00000000,
    masm.movq(temp4, externalAddress(highmaskYPtr));
    // 0xfffffff8
    masm.mulpd(temp6, temp2);
    masm.pshufd(temp3, temp3, 0x44);
    masm.mulpd(dest, temp2);
    masm.shll(gpr1, 4);
    masm.subl(gpr1, 15872);
    masm.andl(gpr3, 32752);
    masm.addl(gpr1, gpr3);
    masm.mulpd(temp3, temp6);
    masm.cmpl(gpr1, 624);
    masm.jcc(ConditionFlag.AboveEqual, bb5);
    masm.xorpd(temp6, temp6);
    masm.movl(gpr4, 17080);
    masm.pinsrw(temp6, gpr4, 3);
    masm.movdqu(temp2, temp1);
    masm.pand(temp4, temp1);
    masm.subsd(temp1, temp4);
    masm.mulsd(temp4, temp5);
    masm.addsd(dest, temp7);
    masm.mulsd(temp1, temp5);
    masm.movdqu(temp7, temp6);
    masm.addsd(temp6, temp4);
    masm.leaq(gpr7, externalAddress(tExpPtr));
    masm.addpd(temp3, dest);
    masm.movdl(gpr4, temp6);
    masm.movl(gpr3, gpr4);
    masm.andl(gpr4, 255);
    masm.addl(gpr4, gpr4);
    masm.movdqu(temp5, new AMD64Address(gpr7, gpr4, Scale.Times8, 0));
    masm.subsd(temp6, temp7);
    masm.pshufd(dest, temp3, 0xEE);
    masm.subsd(temp4, temp6);
    masm.addsd(dest, temp3);
    masm.addsd(temp4, temp1);
    masm.mulsd(temp2, dest);
    masm.leaq(gpr8, externalAddress(eCoeffPtr));
    // 0xe78a6731,
    masm.movdqu(temp7, new AMD64Address(gpr8, 0));
    // 0x3f55d87f,
    // 0xd704a0c0,
    // 0x3fac6b08
    // 0x6fba4e77,
    masm.movdqu(temp3, new AMD64Address(gpr8, 16));
    // 0x3f83b2ab,
    // 0xff82c58f,
    // 0x3fcebfbd
    masm.shll(gpr3, 12);
    masm.xorl(gpr3, gpr5);
    masm.andl(gpr3, -1048576);
    masm.movdq(temp6, gpr3);
    masm.addsd(temp2, temp4);
    masm.movq(gpr2, 0x3fe62e42fefa39efL);
    masm.movdq(temp1, gpr2);
    masm.pshufd(dest, temp2, 0x44);
    masm.pshufd(temp4, temp2, 0x44);
    masm.mulsd(temp1, temp2);
    masm.pshufd(temp6, temp6, 0x11);
    masm.mulpd(dest, dest);
    masm.mulpd(temp7, temp4);
    masm.paddd(temp5, temp6);
    masm.mulsd(temp1, temp5);
    masm.pshufd(temp6, temp5, 0xEE);
    masm.mulsd(dest, dest);
    masm.addpd(temp3, temp7);
    masm.addsd(temp1, temp6);
    masm.mulpd(dest, temp3);
    masm.pshufd(temp3, dest, 0xEE);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp3, temp5);
    masm.addsd(dest, temp1);
    masm.addsd(dest, temp3);
    masm.addsd(dest, temp5);
    masm.jmp(bb56);
    masm.bind(bb0);
    masm.addl(gpr1, 16);
    masm.movl(gpr4, 32752);
    masm.andl(gpr4, gpr1);
    masm.cmpl(gpr4, 32752);
    masm.jcc(ConditionFlag.Equal, bb6);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb7);
    masm.bind(bb8);
    masm.movdqu(dest, temp10);
    masm.movdqu(temp3, temp10);
    masm.movdl(gpr4, temp3);
    masm.psrlq(temp3, 32);
    masm.movdl(gpr3, temp3);
    masm.orl(gpr4, gpr3);
    masm.cmpl(gpr4, 0);
    masm.jcc(ConditionFlag.Equal, bb9);
    masm.xorpd(temp3, temp3);
    masm.movl(gpr1, 18416);
    masm.pinsrw(temp3, gpr1, 3);
    masm.mulsd(dest, temp3);
    masm.xorpd(temp2, temp2);
    masm.movl(gpr1, 16368);
    masm.pinsrw(temp2, gpr1, 3);
    masm.movdqu(temp3, dest);
    masm.pextrw(gpr1, dest, 3);
    masm.por(dest, temp2);
    masm.movl(gpr3, 18416);
    masm.psrlq(dest, 27);
    masm.psrld(dest, 2);
    masm.rcpps(dest, dest);
    masm.psllq(temp3, 12);
    // 0x00000000,
    masm.movdqu(temp6, externalAddress(highSigMaskPtr));
    // 0xfffff800,
    // 0x00000000,
    // 0xfffff800
    masm.psrlq(temp3, 12);
    masm.mulss(dest, temp7);
    masm.movl(gpr4, -1024);
    masm.movdl(temp5, gpr4);
    masm.por(temp3, temp1);
    masm.paddd(dest, temp4);
    masm.psllq(temp5, 32);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);
    masm.pand(temp5, temp3);
    masm.movl(gpr5, 0);
    masm.pand(dest, temp6);
    masm.subsd(temp3, temp5);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 18416);
    masm.sarl(gpr1, 4);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);
    masm.jmp(bb4);
    masm.bind(bb10);
    masm.movdqu(dest, temp10);
    masm.movdqu(temp3, temp10);
    masm.movdl(gpr4, temp3);
    masm.psrlq(temp3, 32);
    masm.movdl(gpr3, temp3);
    masm.orl(gpr4, gpr3);
    masm.cmpl(gpr4, 0);
    masm.jcc(ConditionFlag.Equal, bb9);
    masm.xorpd(temp3, temp3);
    masm.movl(gpr1, 18416);
    masm.pinsrw(temp3, gpr1, 3);
    masm.mulsd(dest, temp3);
    masm.xorpd(temp2, temp2);
    masm.movl(gpr1, 16368);
    masm.pinsrw(temp2, gpr1, 3);
    masm.movdqu(temp3, dest);
    masm.pextrw(gpr1, dest, 3);
    masm.por(dest, temp2);
    masm.movl(gpr3, 18416);
    masm.psrlq(dest, 27);
    masm.psrld(dest, 2);
    masm.rcpps(dest, dest);
    masm.psllq(temp3, 12);
    // 0x00000000,
    masm.movdqu(temp6, externalAddress(highSigMaskPtr));
    // 0xfffff800,
    // 0x00000000,
    // 0xfffff800
    masm.psrlq(temp3, 12);
    masm.mulss(dest, temp7);
    masm.movl(gpr4, -1024);
    masm.movdl(temp5, gpr4);
    masm.por(temp3, temp1);
    masm.paddd(dest, temp4);
    masm.psllq(temp5, 32);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);
    masm.pand(temp5, temp3);
    masm.movl(gpr5, Integer.MIN_VALUE);
    masm.pand(dest, temp6);
    masm.subsd(temp3, temp5);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 18416);
    masm.sarl(gpr1, 4);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);
    masm.jmp(bb4);
    masm.bind(bb5);
    masm.cmpl(gpr1, 0);
    masm.jcc(ConditionFlag.Less, bb11);
    masm.cmpl(gpr1, 752);
    masm.jcc(ConditionFlag.AboveEqual, bb12);
    masm.addsd(dest, temp7);
    // 0xf8000000,
    masm.movq(temp4, externalAddress(halfmaskPtr));
    // 0xffffffff
    masm.addpd(temp3, dest);
    masm.xorpd(temp6, temp6);
    masm.movl(gpr1, 17080);
    masm.pinsrw(temp6, gpr1, 3);
    masm.pshufd(dest, temp3, 0xEE);
    masm.addsd(dest, temp3);
    masm.movdqu(temp3, temp5);
    masm.addsd(temp5, dest);
    masm.subsd(temp3, temp5);
    masm.movdqu(temp7, temp5);
    masm.pand(temp5, temp4);
    masm.movdqu(temp2, temp1);
    masm.pand(temp4, temp1);
    masm.subsd(temp7, temp5);
    masm.addsd(dest, temp3);
    masm.subsd(temp1, temp4);
    masm.mulsd(temp4, temp5);
    masm.addsd(dest, temp7);
    masm.mulsd(temp2, dest);
    masm.movdqu(temp7, temp6);
    masm.mulsd(temp1, temp5);
    masm.addsd(temp6, temp4);
    masm.movdl(gpr1, temp6);
    masm.subsd(temp6, temp7);
    masm.leaq(gpr7, externalAddress(tExpPtr));
    masm.movl(gpr3, gpr1);
    masm.andl(gpr1, 255);
    masm.addl(gpr1, gpr1);
    masm.movdqu(temp5, new AMD64Address(gpr7, gpr1, Scale.Times8, 0));
    masm.addsd(temp2, temp1);
    masm.leaq(gpr8, externalAddress(eCoeffPtr));
    // 0xe78a6731,
    masm.movdqu(temp7, new AMD64Address(gpr8, 0));
    // 0x3f55d87f,
    // 0xd704a0c0,
    // 0x3fac6b08
    // 0x6fba4e77,
    masm.movdqu(temp3, new AMD64Address(gpr8, 16));
    // 0x3f83b2ab,
    // 0xff82c58f,
    // 0x3fcebfbd
    masm.subsd(temp4, temp6);
    masm.pextrw(gpr4, temp6, 3);
    masm.addsd(temp2, temp4);
    masm.sarl(gpr3, 8);
    masm.movl(gpr1, gpr3);
    masm.sarl(gpr3, 1);
    masm.subl(gpr1, gpr3);
    masm.shll(gpr3, 20);
    masm.xorl(gpr3, gpr5);
    masm.movdl(temp6, gpr3);
    // 0xfefa39ef,
    masm.movq(temp1, new AMD64Address(gpr8, 32));
    // 0x3fe62e42
    masm.andl(gpr4, 32767);
    masm.cmpl(gpr4, 16529);
    masm.jcc(ConditionFlag.Above, bb12);
    masm.pshufd(dest, temp2, 0x44);
    masm.pshufd(temp4, temp2, 0x44);
    masm.mulpd(dest, dest);
    masm.mulpd(temp7, temp4);
    masm.pshufd(temp6, temp6, 0x11);
    masm.mulsd(temp1, temp2);
    masm.mulsd(dest, dest);
    masm.paddd(temp5, temp6);
    masm.addpd(temp3, temp7);
    masm.mulsd(temp1, temp5);
    masm.pshufd(temp6, temp5, 0xEE);
    masm.mulpd(dest, temp3);
    masm.addsd(temp1, temp6);
    masm.pshufd(temp3, dest, 0xEE);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp3, temp5);
    masm.shll(gpr1, 4);
    masm.xorpd(temp4, temp4);
    masm.addl(gpr1, 16368);
    masm.pinsrw(temp4, gpr1, 3);
    masm.addsd(dest, temp1);
    masm.addsd(dest, temp3);
    masm.movdqu(temp1, dest);
    masm.addsd(dest, temp5);
    masm.mulsd(dest, temp4);
    masm.pextrw(gpr1, dest, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb13);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb14);
    masm.jmp(bb56);
    masm.bind(bb6);
    masm.movdqu(temp1, temp8);
    masm.movdqu(dest, temp10);
    masm.movdqu(temp2, dest);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 20);
    masm.movdl(gpr4, temp2);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.Equal, bb15);
    masm.movdl(gpr1, temp1);
    masm.psrlq(temp1, 32);
    masm.movdl(gpr4, temp1);
    masm.movl(gpr3, gpr4);
    masm.addl(gpr4, gpr4);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.Equal, bb16);
    masm.addsd(dest, dest);
    masm.jmp(bb56);
    masm.bind(bb16);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb18);
    masm.addpd(dest, temp8);
    masm.jmp(bb56);
    masm.bind(bb15);
    masm.movdl(gpr1, temp1);
    masm.movdqu(temp2, temp1);
    masm.psrlq(temp1, 32);
    masm.movdl(gpr4, temp1);
    masm.movl(gpr3, gpr4);
    masm.addl(gpr4, gpr4);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.Equal, bb19);
    masm.pextrw(gpr1, temp2, 3);
    masm.andl(gpr1, 32752);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb20);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 20);
    masm.movdl(gpr4, temp2);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.NotEqual, bb18);
    masm.bind(bb20);
    masm.pextrw(gpr1, dest, 3);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb21);
    masm.testl(gpr3, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.NotZero, bb22);
    masm.jmp(bb56);
    masm.bind(bb23);
    masm.movdl(gpr1, temp8);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.NotEqual, bb24);
    masm.testl(gpr1, 2);
    masm.jcc(ConditionFlag.NotEqual, bb25);
    masm.jmp(bb24);
    masm.bind(bb21);
    masm.shrl(gpr3, 20);
    masm.andl(gpr3, 2047);
    masm.cmpl(gpr3, 1075);
    masm.jcc(ConditionFlag.Above, bb24);
    masm.jcc(ConditionFlag.Equal, bb26);
    masm.cmpl(gpr3, 1074);
    masm.jcc(ConditionFlag.Above, bb23);
    masm.cmpl(gpr3, 1023);
    masm.jcc(ConditionFlag.Below, bb24);
    masm.movdqu(temp1, temp8);
    masm.movl(gpr1, 17208);
    masm.xorpd(temp3, temp3);
    masm.pinsrw(temp3, gpr1, 3);
    masm.movdqu(temp4, temp3);
    masm.addsd(temp3, temp1);
    masm.subsd(temp4, temp3);
    masm.addsd(temp1, temp4);
    masm.pextrw(gpr1, temp1, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb24);
    masm.movdl(gpr1, temp3);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb24);
    masm.bind(bb25);
    masm.pextrw(gpr1, temp8, 3);
    masm.andl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb27);
    masm.jmp(bb56);
    masm.bind(bb27);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32768);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb24);
    masm.pextrw(gpr1, temp8, 3);
    masm.andl(gpr1, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb22);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32752);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb26);
    masm.movdl(gpr1, temp8);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb24);
    masm.jmp(bb25);
    masm.bind(bb28);
    masm.movdl(gpr1, temp1);
    masm.psrlq(temp1, 20);
    masm.movdl(gpr4, temp1);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.Equal, bb29);
    masm.addsd(dest, temp8);
    masm.jmp(bb56);
    masm.bind(bb29);
    masm.movdqu(dest, temp10);
    masm.pextrw(gpr1, dest, 3);
    masm.cmpl(gpr1, 49136);
    masm.jcc(ConditionFlag.NotEqual, bb30);
    masm.movdl(gpr3, dest);
    masm.psrlq(dest, 20);
    masm.movdl(gpr4, dest);
    masm.orl(gpr3, gpr4);
    masm.jcc(ConditionFlag.NotEqual, bb30);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32760);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb30);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 16368);
    masm.pextrw(gpr4, temp8, 3);
    masm.xorpd(dest, dest);
    masm.xorl(gpr1, gpr4);
    masm.andl(gpr1, 32768);
    masm.jcc(ConditionFlag.Equal, bb31);
    masm.jmp(bb56);
    masm.bind(bb31);
    masm.movl(gpr3, 32752);
    masm.pinsrw(dest, gpr3, 3);
    masm.jmp(bb56);
    masm.bind(bb32);
    masm.movdl(gpr1, temp1);
    masm.cmpl(gpr4, 17184);
    masm.jcc(ConditionFlag.Above, bb33);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.NotEqual, bb34);
    masm.testl(gpr1, 2);
    masm.jcc(ConditionFlag.Equal, bb35);
    masm.jmp(bb36);
    masm.bind(bb33);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb35);
    masm.jmp(bb36);
    masm.bind(bb7);
    masm.movdqu(temp2, temp10);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 31);
    masm.movdl(gpr3, temp2);
    masm.orl(gpr1, gpr3);
    masm.jcc(ConditionFlag.Equal, bb9);
    masm.pextrw(gpr4, temp8, 3);
    masm.movdl(gpr1, temp8);
    masm.movdqu(temp2, temp8);
    masm.psrlq(temp2, 32);
    masm.movdl(gpr3, temp2);
    masm.addl(gpr3, gpr3);
    masm.orl(gpr3, gpr1);
    masm.jcc(ConditionFlag.Equal, bb37);
    masm.andl(gpr4, 32752);
    masm.cmpl(gpr4, 32752);
    masm.jcc(ConditionFlag.Equal, bb28);
    masm.cmpl(gpr4, 17200);
    masm.jcc(ConditionFlag.Above, bb35);
    masm.cmpl(gpr4, 17184);
    masm.jcc(ConditionFlag.AboveEqual, bb32);
    masm.cmpl(gpr4, 16368);
    masm.jcc(ConditionFlag.Below, bb34);
    masm.movl(gpr1, 17208);
    masm.xorpd(temp2, temp2);
    masm.pinsrw(temp2, gpr1, 3);
    masm.movdqu(temp4, temp2);
    masm.addsd(temp2, temp1);
    masm.subsd(temp4, temp2);
    masm.addsd(temp1, temp4);
    masm.pextrw(gpr1, temp1, 3);
    masm.andl(gpr1, 32767);
    masm.jcc(ConditionFlag.NotEqual, bb34);
    masm.movdl(gpr1, temp2);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb35);
    masm.bind(bb36);
    masm.xorpd(temp1, temp1);
    masm.movl(gpr4, 30704);
    masm.pinsrw(temp1, gpr4, 3);
    masm.pextrw(gpr1, temp10, 3);
    masm.movl(gpr4, 8192);
    masm.movdl(temp4, gpr4);
    masm.andl(gpr1, 32767);
    masm.subl(gpr1, 16);
    masm.jcc(ConditionFlag.Less, bb10);
    masm.movl(gpr4, gpr1);
    masm.andl(gpr4, 32752);
    masm.subl(gpr4, 16368);
    masm.movl(gpr3, gpr4);
    masm.sarl(gpr4, 31);
    masm.addl(gpr3, gpr4);
    masm.xorl(gpr3, gpr4);
    masm.addl(gpr3, 16);
    masm.bsrl(gpr3, gpr3);
    masm.movl(gpr5, Integer.MIN_VALUE);
    masm.jmp(bb1);
    masm.bind(bb34);
    masm.xorpd(temp1, temp1);
    masm.movl(gpr1, 32752);
    masm.pinsrw(temp1, gpr1, 3);
    masm.xorpd(dest, dest);
    masm.mulsd(dest, temp1);
    masm.jmp(bb56);
    masm.bind(bb35);
    masm.xorpd(temp1, temp1);
    masm.movl(gpr4, 30704);
    masm.pinsrw(temp1, gpr4, 3);
    masm.pextrw(gpr1, temp10, 3);
    masm.movl(gpr4, 8192);
    masm.movdl(temp4, gpr4);
    masm.andl(gpr1, 32767);
    masm.subl(gpr1, 16);
    masm.jcc(ConditionFlag.Less, bb8);
    masm.movl(gpr4, gpr1);
    masm.andl(gpr4, 32752);
    masm.subl(gpr4, 16368);
    masm.movl(gpr3, gpr4);
    masm.sarl(gpr4, 31);
    masm.addl(gpr3, gpr4);
    masm.xorl(gpr3, gpr4);
    masm.addl(gpr3, 16);
    masm.bsrl(gpr3, gpr3);
    masm.movl(gpr5, 0);
    masm.jmp(bb1);
    masm.bind(bb19);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb22);
    masm.xorpd(dest, dest);
    masm.jmp(bb56);
    masm.bind(bb11);
    masm.addl(gpr1, 384);
    masm.cmpl(gpr1, 0);
    masm.jcc(ConditionFlag.Less, bb38);
    masm.mulsd(temp5, temp1);
    masm.addsd(dest, temp7);
    masm.shrl(gpr5, 31);
    masm.addpd(temp3, dest);
    masm.pshufd(dest, temp3, 0xEE);
    masm.addsd(temp3, dest);
    // 0xfefa39ef,
    masm.leaq(gpr7, externalAddress(logTwoPowPtr));
    // 0x3fe62e42,
    // 0xfefa39ef,
    // 0xbfe62e42
    masm.movq(temp4, new AMD64Address(gpr7, gpr5, Scale.Times8, 0));
    masm.mulsd(temp1, temp3);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.shll(gpr5, 15);
    masm.orl(gpr1, gpr5);
    masm.pinsrw(dest, gpr1, 3);
    masm.addsd(temp5, temp1);
    masm.mulsd(temp5, temp4);
    masm.addsd(dest, temp5);
    masm.jmp(bb56);
    masm.bind(bb38);
    masm.bind(bb37);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb39);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 16368);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb9);
    masm.movdqu(temp2, temp8);
    masm.pextrw(gpr1, temp8, 3);
    masm.andl(gpr1, 32752);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb40);
    masm.movdl(gpr1, temp2);
    masm.psrlq(temp2, 20);
    masm.movdl(gpr4, temp2);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.NotEqual, bb18);
    masm.bind(bb40);
    masm.movdl(gpr1, temp1);
    masm.psrlq(temp1, 32);
    masm.movdl(gpr4, temp1);
    masm.movl(gpr3, gpr4);
    masm.addl(gpr4, gpr4);
    masm.orl(gpr1, gpr4);
    masm.jcc(ConditionFlag.Equal, bb39);
    masm.shrl(gpr4, 21);
    masm.cmpl(gpr4, 1075);
    masm.jcc(ConditionFlag.Above, bb41);
    masm.jcc(ConditionFlag.Equal, bb42);
    masm.cmpl(gpr4, 1023);
    masm.jcc(ConditionFlag.Below, bb41);
    masm.movdqu(temp1, temp8);
    masm.movl(gpr1, 17208);
    masm.xorpd(temp3, temp3);
    masm.pinsrw(temp3, gpr1, 3);
    masm.movdqu(temp4, temp3);
    masm.addsd(temp3, temp1);
    masm.subsd(temp4, temp3);
    masm.addsd(temp1, temp4);
    masm.pextrw(gpr1, temp1, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.NotEqual, bb41);
    masm.movdl(gpr1, temp3);
    masm.andl(gpr1, 1);
    masm.jcc(ConditionFlag.Equal, bb41);
    masm.bind(bb43);
    masm.movdqu(dest, temp10);
    masm.testl(gpr3, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.NotEqual, bb44);
    masm.jmp(bb56);
    masm.bind(bb42);
    masm.movdl(gpr1, temp8);
    masm.testl(gpr1, 1);
    masm.jcc(ConditionFlag.NotEqual, bb43);
    masm.bind(bb41);
    masm.testl(gpr3, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.Equal, bb22);
    masm.xorpd(dest, dest);
    masm.bind(bb44);
    masm.movl(gpr1, 16368);
    masm.xorpd(temp1, temp1);
    masm.pinsrw(temp1, gpr1, 3);
    masm.divsd(temp1, dest);
    masm.movdqu(dest, temp1);
    masm.jmp(bb56);
    masm.bind(bb12);
    masm.pextrw(gpr1, temp10, 3);
    masm.pextrw(gpr4, temp8, 3);
    masm.movl(gpr3, 32752);
    masm.andl(gpr3, gpr4);
    masm.cmpl(gpr3, 32752);
    masm.jcc(ConditionFlag.Equal, bb45);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 16368);
    masm.xorl(gpr4, gpr1);
    masm.testl(gpr4, 32768);
    masm.jcc(ConditionFlag.NotEqual, bb46);
    masm.bind(bb47);
    masm.movl(gpr1, 32736);
    masm.pinsrw(dest, gpr1, 3);
    masm.shrl(gpr5, 16);
    masm.orl(gpr1, gpr5);
    masm.pinsrw(temp1, gpr1, 3);
    masm.mulsd(dest, temp1);
    masm.bind(bb14);
    masm.jmp(bb56);
    masm.bind(bb46);
    masm.movl(gpr1, 16);
    masm.pinsrw(dest, gpr1, 3);
    masm.mulsd(dest, dest);
    masm.testl(gpr3, Integer.MIN_VALUE);
    masm.jcc(ConditionFlag.Equal, bb48);
    masm.movq(gpr2, 0x8000000000000000L);
    masm.movdq(temp2, gpr2);
    masm.xorpd(dest, temp2);
    masm.bind(bb48);
    masm.jmp(bb56);
    masm.bind(bb13);
    masm.pextrw(gpr3, temp5, 3);
    masm.pextrw(gpr4, temp4, 3);
    masm.movl(gpr1, -1);
    masm.andl(gpr3, 32752);
    masm.subl(gpr3, 16368);
    masm.andl(gpr4, 32752);
    masm.addl(gpr4, gpr3);
    masm.movl(gpr3, -31);
    masm.sarl(gpr4, 4);
    masm.subl(gpr3, gpr4);
    masm.jcc(ConditionFlag.LessEqual, bb49);
    masm.cmpl(gpr3, 20);
    masm.jcc(ConditionFlag.Above, bb50);
    masm.shll(gpr1);
    masm.bind(bb49);
    masm.movdl(dest, gpr1);
    masm.psllq(dest, 32);
    masm.pand(dest, temp5);
    masm.subsd(temp5, dest);
    masm.addsd(temp5, temp1);
    masm.mulsd(dest, temp4);
    masm.mulsd(temp5, temp4);
    masm.addsd(dest, temp5);
    masm.bind(bb50);
    masm.jmp(bb48);
    masm.bind(bb2);
    masm.pextrw(gpr3, temp8, 3);
    masm.movl(gpr4, Integer.MIN_VALUE);
    masm.movdl(temp1, gpr4);
    masm.xorpd(temp7, temp7);
    masm.paddd(dest, temp4);
    masm.movdl(gpr4, dest);
    masm.psllq(dest, 29);
    masm.paddq(temp1, temp3);
    masm.pand(temp5, temp1);
    masm.andl(gpr3, 32752);
    masm.cmpl(gpr3, 16560);
    masm.jcc(ConditionFlag.Less, bb3);
    masm.leaq(gpr7, externalAddress(lTblPowPtr));
    masm.leaq(gpr8, externalAddress(coeffHPtr));
    // 0x00000000,
    masm.movdqu(temp4, new AMD64Address(gpr8, 0));
    // 0xbfd61a00,
    // 0x00000000,
    // 0xbf5dabe1
    masm.pand(dest, temp6);
    masm.subsd(temp3, temp5);
    masm.addl(gpr1, 16351);
    masm.shrl(gpr1, 4);
    masm.subl(gpr1, 1022);
    masm.cvtsi2sdl(temp7, gpr1);
    masm.mulpd(temp5, dest);
    masm.mulsd(temp3, dest);
    masm.subsd(temp5, temp9);
    masm.pshufd(temp1, temp4, 0xE);
    masm.pshufd(temp2, temp3, 0x44);
    masm.unpcklpd(temp5, temp3);
    masm.addsd(temp3, temp5);
    masm.andl(gpr4, 16760832);
    masm.shrl(gpr4, 10);
    masm.addpd(temp7, new AMD64Address(gpr7, gpr4, Scale.Times1, -3648));
    masm.movdqu(temp6, temp4);
    masm.mulsd(temp4, temp5);
    masm.movdqu(dest, temp1);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp6, temp2);
    masm.mulsd(temp1, temp2);
    masm.movdqu(temp2, temp5);
    masm.mulsd(temp4, temp5);
    masm.addsd(temp5, dest);
    masm.movdqu(dest, temp7);
    masm.addsd(temp2, temp3);
    masm.addsd(temp7, temp5);
    masm.mulsd(temp6, temp2);
    masm.subsd(dest, temp7);
    masm.movdqu(temp2, temp7);
    masm.addsd(temp7, temp4);
    masm.addsd(dest, temp5);
    masm.subsd(temp2, temp7);
    masm.addsd(temp4, temp2);
    masm.pshufd(temp2, temp5, 0xEE);
    masm.movdqu(temp5, temp7);
    masm.addsd(temp7, temp2);
    masm.addsd(temp4, dest);
    masm.leaq(gpr8, externalAddress(coeffPowPtr));
    // 0x6dc96112,
    masm.movdqu(dest, new AMD64Address(gpr8, 0));
    // 0xbf836578,
    // 0xee241472,
    // 0xbf9b0301
    masm.subsd(temp5, temp7);
    masm.addsd(temp6, temp4);
    masm.movdqu(temp4, temp7);
    masm.addsd(temp5, temp2);
    masm.addsd(temp7, temp1);
    // 0x486ececc,
    masm.movdqu(temp2, new AMD64Address(gpr8, 64));
    // 0x3fc4635e,
    // 0x161bb241,
    // 0xbf5dabe1
    masm.subsd(temp4, temp7);
    masm.addsd(temp6, temp5);
    masm.addsd(temp4, temp1);
    masm.pshufd(temp5, temp7, 0xEE);
    masm.movapd(temp1, temp7);
    masm.addsd(temp7, temp5);
    masm.subsd(temp1, temp7);
    masm.addsd(temp1, temp5);
    // 0x9f95985a,
    masm.movdqu(temp5, new AMD64Address(gpr8, 80));
    // 0xbfb528db,
    // 0xf8b5787d,
    // 0x3ef2531e
    masm.pshufd(temp3, temp3, 0x44);
    masm.addsd(temp6, temp4);
    masm.addsd(temp6, temp1);
    // 0x9f95985a,
    masm.movdqu(temp1, new AMD64Address(gpr8, 32));
    // 0xbfb528db,
    // 0xb3841d2a,
    // 0xbfd619b6
    masm.mulpd(dest, temp3);
    masm.mulpd(temp2, temp3);
    masm.pshufd(temp4, temp3, 0x44);
    masm.mulpd(temp3, temp3);
    masm.addpd(dest, temp1);
    masm.addpd(temp5, temp2);
    masm.mulsd(temp4, temp3);
    // 0xf8000000,
    masm.movq(temp2, externalAddress(highmaskLogXPtr));
    // 0xffffffff
    masm.mulpd(temp3, temp3);
    masm.movdqu(temp1, temp8);
    masm.pextrw(gpr3, temp8, 3);
    masm.mulpd(dest, temp4);
    masm.pextrw(gpr1, temp7, 3);
    masm.mulpd(temp5, temp4);
    masm.mulpd(dest, temp3);
    masm.leaq(gpr8, externalAddress(highmaskYPtr));
    // 0x00000000,
    masm.movq(temp4, new AMD64Address(gpr8, 8));
    // 0xffffffff
    masm.pand(temp2, temp7);
    masm.addsd(temp5, temp6);
    masm.subsd(temp7, temp2);
    masm.addpd(temp5, dest);
    masm.andl(gpr1, 32752);
    masm.subl(gpr1, 16368);
    masm.andl(gpr3, 32752);
    masm.cmpl(gpr3, 32752);
    masm.jcc(ConditionFlag.Equal, bb45);
    masm.addl(gpr3, gpr1);
    masm.cmpl(gpr3, 16576);
    masm.jcc(ConditionFlag.AboveEqual, bb51);
    masm.pshufd(dest, temp5, 0xEE);
    masm.pand(temp4, temp1);
    masm.movdqu(temp3, temp1);
    masm.addsd(temp5, dest);
    masm.subsd(temp1, temp4);
    masm.xorpd(temp6, temp6);
    masm.movl(gpr4, 17080);
    masm.pinsrw(temp6, gpr4, 3);
    masm.addsd(temp7, temp5);
    masm.mulsd(temp4, temp2);
    masm.mulsd(temp1, temp2);
    masm.movdqu(temp5, temp6);
    masm.mulsd(temp3, temp7);
    masm.addsd(temp6, temp4);
    masm.addsd(temp1, temp3);
    masm.leaq(gpr8, externalAddress(eCoeffPtr));
    // 0xe78a6731,
    masm.movdqu(temp7, new AMD64Address(gpr8, 0));
    // 0x3f55d87f,
    // 0xd704a0c0,
    // 0x3fac6b08
    masm.movdl(gpr4, temp6);
    masm.subsd(temp6, temp5);
    masm.leaq(gpr7, externalAddress(tExpPtr));
    masm.movl(gpr3, gpr4);
    masm.andl(gpr4, 255);
    masm.addl(gpr4, gpr4);
    masm.movdqu(temp5, new AMD64Address(gpr7, gpr4, Scale.Times8, 0));
    // 0x6fba4e77,
    masm.movdqu(temp3, new AMD64Address(gpr8, 16));
    // 0x3f83b2ab,
    // 0xff82c58f,
    // 0x3fcebfbd
    // 0xfefa39ef,
    masm.movq(temp2, new AMD64Address(gpr8, 32));
    // 0x3fe62e42
    masm.subsd(temp4, temp6);
    masm.addsd(temp4, temp1);
    masm.pextrw(gpr4, temp6, 3);
    masm.shrl(gpr3, 8);
    masm.movl(gpr1, gpr3);
    masm.shrl(gpr3, 1);
    masm.subl(gpr1, gpr3);
    masm.shll(gpr3, 20);
    masm.movdl(temp6, gpr3);
    masm.pshufd(dest, temp4, 0x44);
    masm.pshufd(temp1, temp4, 0x44);
    masm.mulpd(dest, dest);
    masm.mulpd(temp7, temp1);
    masm.pshufd(temp6, temp6, 0x11);
    masm.mulsd(temp2, temp4);
    masm.andl(gpr4, 32767);
    masm.cmpl(gpr4, 16529);
    masm.jcc(ConditionFlag.Above, bb12);
    masm.mulsd(dest, dest);
    masm.paddd(temp5, temp6);
    masm.addpd(temp3, temp7);
    masm.mulsd(temp2, temp5);
    masm.pshufd(temp6, temp5, 0xEE);
    masm.mulpd(dest, temp3);
    masm.addsd(temp2, temp6);
    masm.pshufd(temp3, dest, 0xEE);
    masm.addl(gpr1, 1023);
    masm.shll(gpr1, 20);
    masm.orl(gpr1, gpr5);
    masm.movdl(temp4, gpr1);
    masm.mulsd(dest, temp5);
    masm.mulsd(temp3, temp5);
    masm.addsd(dest, temp2);
    masm.psllq(temp4, 32);
    masm.addsd(dest, temp3);
    masm.movdqu(temp1, dest);
    masm.addsd(dest, temp5);
    masm.mulsd(dest, temp4);
    masm.pextrw(gpr1, dest, 3);
    masm.andl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb13);
    masm.cmpl(gpr1, 32752);
    masm.jcc(ConditionFlag.Equal, bb14);
    masm.jmp(bb56);
    masm.bind(bb45);
    masm.movdqu(dest, temp10);
    masm.xorpd(temp2, temp2);
    masm.movl(gpr1, 49136);
    masm.pinsrw(temp2, gpr1, 3);
    masm.addsd(temp2, dest);
    masm.pextrw(gpr1, temp2, 3);
    masm.cmpl(gpr1, 0);
    masm.jcc(ConditionFlag.NotEqual, bb53);
    masm.xorpd(dest, dest);
    masm.movl(gpr1, 32760);
    masm.pinsrw(dest, gpr1, 3);
    masm.jmp(bb56);
    masm.bind(bb53);
    masm.movdqu(temp1, temp8);
    masm.movdl(gpr4, temp1);
    masm.movdqu(temp3, temp1);
    masm.psrlq(temp3, 20);
    masm.movdl(gpr3, temp3);
    masm.orl(gpr3, gpr4);
    masm.jcc(ConditionFlag.Equal, bb54);
    masm.addsd(temp1, temp1);
    masm.movdqu(dest, temp1);
    masm.jmp(bb56);
    masm.bind(bb51);
    masm.pextrw(gpr1, temp1, 3);
    masm.pextrw(gpr3, temp2, 3);
    masm.xorl(gpr1, gpr3);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.Equal, bb47);
    masm.jmp(bb46);
    masm.bind(bb54);
    masm.pextrw(gpr1, dest, 3);
    masm.andl(gpr1, 32752);
    masm.pextrw(gpr4, temp1, 3);
    masm.xorpd(dest, dest);
    masm.subl(gpr1, 16368);
    masm.xorl(gpr1, gpr4);
    masm.testl(gpr1, 32768);
    masm.jcc(ConditionFlag.Equal, bb55);
    masm.jmp(bb56);
    masm.bind(bb55);
    masm.movl(gpr4, 32752);
    masm.pinsrw(dest, gpr4, 3);
    masm.jmp(bb56);
    // Common exit: the paths above finish by jumping here.
    masm.bind(bb56);
}
Also used : Register(jdk.vm.ci.code.Register) ValueUtil.asRegister(jdk.vm.ci.code.ValueUtil.asRegister) ArrayDataPointerConstant(org.graalvm.compiler.lir.asm.ArrayDataPointerConstant) Label(org.graalvm.compiler.asm.Label) AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Example 32 with AMD64MacroAssembler

use of org.graalvm.compiler.asm.amd64.AMD64MacroAssembler in project graal by oracle.

the class AMD64ArrayCompareToOp method loadNextElements.

/**
 * Emits the pair of zero-extending loads that fetch the element at {@code index} from each of
 * the two strings, choosing the load width from the {@code kind1}/{@code kind2} fields.
 *
 * @param masm assembler the loads are emitted into
 * @param elem1 destination register for the element read from {@code str1}
 * @param elem2 destination register for the element read from {@code str2}
 * @param str1 base register of the first string
 * @param str2 base register of the second string
 * @param scale index scale used for both strings when both kinds are the same
 * @param scale1 index scale used for {@code str1} in the mixed case
 * @param scale2 index scale used for {@code str2} in the mixed case
 * @param index index register shared by both addresses
 */
private void loadNextElements(AMD64MacroAssembler masm, Register elem1, Register elem2, Register str1, Register str2, AMD64Address.Scale scale, AMD64Address.Scale scale1, AMD64Address.Scale scale2, Register index) {
    final boolean bothBytes = kind1 == JavaKind.Byte && kind2 == JavaKind.Byte;
    final boolean bothChars = kind1 == JavaKind.Char && kind2 == JavaKind.Char;

    if (bothBytes) {
        // Mirrors the HotSpot case "ae == StrIntrinsicNode::LL": byte loads on both sides.
        AMD64Address first = new AMD64Address(str1, index, scale, 0);
        masm.movzbl(elem1, first);
        AMD64Address second = new AMD64Address(str2, index, scale, 0);
        masm.movzbl(elem2, second);
        return;
    }

    if (bothChars) {
        // Mirrors the HotSpot case "ae == StrIntrinsicNode::UU": 16-bit loads on both sides.
        AMD64Address first = new AMD64Address(str1, index, scale, 0);
        masm.movzwl(elem1, first);
        AMD64Address second = new AMD64Address(str2, index, scale, 0);
        masm.movzwl(elem2, second);
        return;
    }

    // Every other kind combination: byte load from str1, 16-bit load from str2, each with its
    // own scale.
    AMD64Address narrow = new AMD64Address(str1, index, scale1, 0);
    masm.movzbl(elem1, narrow);
    AMD64Address wide = new AMD64Address(str2, index, scale2, 0);
    masm.movzwl(elem2, wide);
}
Also used : AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Example 33 with AMD64MacroAssembler

use of org.graalvm.compiler.asm.amd64.AMD64MacroAssembler in project graal by oracle.

the class AMD64ArrayEqualsOp method emit8ByteCompare.

/**
 * Emits code that uses 8-byte vector compares.
 *
 * <p>
 * Emitted layout, in order: split the byte count into a vector part ({@code length}) and a tail
 * part ({@code result}); a main loop comparing {@code VECTOR_SIZE} bytes per iteration; an
 * optional out-of-line NaN check for float kinds; one final compare of the last
 * {@code VECTOR_SIZE} bytes ending at the end of the arrays; and finally the {@code compareTail}
 * label, where {@code length} is set to the tail count and control falls through to whatever the
 * caller emits next.
 *
 * @param crb compilation result builder; only {@code crb.target.wordSize} is read here, for loop
 *            alignment
 * @param masm assembler the code is emitted into
 * @param result register that is masked down to the tail byte count; presumably holds the total
 *            byte count on entry (TODO confirm against caller)
 * @param array1 base register of the first array; advanced past the vector part when the loop is
 *            entered
 * @param array2 base register of the second array; advanced the same way as {@code array1}
 * @param length register that is masked down to the vector byte count, then negated and used as
 *            the loop index
 * @param trueLabel jump target when the compared bytes are found equal
 * @param falseLabel jump target when a mismatch is found
 */
private void emit8ByteCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
    Label loop = new Label();
    Label compareTail = new Label();
    // For float kinds a raw 8-byte mismatch is not routed straight to falseLabel; it goes through
    // the nanCheck path below first.
    boolean requiresNaNCheck = kind.isNumericFloat();
    Label loopCheck = new Label();
    Label nanCheck = new Label();
    Register temp = asRegister(temp4);
    // tail count (in bytes)
    masm.andl(result, VECTOR_SIZE - 1);
    // vector count (in bytes)
    masm.andl(length, ~(VECTOR_SIZE - 1));
    // No full vector to compare: skip the loop and the end-aligned compare entirely.
    masm.jcc(ConditionFlag.Zero, compareTail);
    // Point both arrays at the end of the vector part and negate length, so the loop can index
    // with a negative offset that counts up towards zero.
    masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
    masm.negq(length);
    // Align the main loop
    masm.align(crb.target.wordSize * 2);
    masm.bind(loop);
    masm.movq(temp, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.cmpq(temp, new AMD64Address(array2, length, Scale.Times1, 0));
    masm.jcc(ConditionFlag.NotEqual, requiresNaNCheck ? nanCheck : falseLabel);
    // Re-entry point for the NaN check once it has accepted the current vector.
    masm.bind(loopCheck);
    // The add sets ZF once the negative index reaches zero, which terminates the loop.
    masm.addq(length, VECTOR_SIZE);
    masm.jccb(ConditionFlag.NotZero, loop);
    // All full vectors matched; with no tail bytes left the arrays are equal.
    masm.testl(result, result);
    masm.jcc(ConditionFlag.Zero, trueLabel);
    if (requiresNaNCheck) {
        // NaN check is slow path and hence placed outside of the main loop.
        Label unalignedCheck = new Label();
        // The fall-through path from the loop must jump over the out-of-line NaN check.
        masm.jmpb(unalignedCheck);
        masm.bind(nanCheck);
        // At most two iterations, unroll in the emitted code.
        for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
            // NOTE(review): emitFloatCompare presumably jumps to falseLabel on a real mismatch
            // and falls through otherwise - confirm against its definition.
            emitFloatCompare(masm, array1, array2, length, offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
        }
        masm.jmpb(loopCheck);
        masm.bind(unalignedCheck);
    }
    /*
     * Compare the remaining bytes with an unaligned memory load aligned to the end of the
     * array.
     */
    masm.movq(temp, new AMD64Address(array1, result, Scale.Times1, -VECTOR_SIZE));
    masm.cmpq(temp, new AMD64Address(array2, result, Scale.Times1, -VECTOR_SIZE));
    if (requiresNaNCheck) {
        masm.jcc(ConditionFlag.Equal, trueLabel);
        // At most two iterations, unroll in the emitted code.
        for (int offset = 0; offset < VECTOR_SIZE; offset += kind.getByteCount()) {
            emitFloatCompare(masm, array1, array2, result, -VECTOR_SIZE + offset, falseLabel, kind.getByteCount() == VECTOR_SIZE);
        }
    } else {
        masm.jccb(ConditionFlag.NotEqual, falseLabel);
    }
    masm.jmpb(trueLabel);
    // Reached only when there was no full vector: hand the tail byte count back in length and
    // fall through to the code emitted after this method.
    masm.bind(compareTail);
    masm.movl(length, result);
}
Also used : Register(jdk.vm.ci.code.Register) ValueUtil.asRegister(jdk.vm.ci.code.ValueUtil.asRegister) Label(org.graalvm.compiler.asm.Label) AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Example 34 with AMD64MacroAssembler

use of org.graalvm.compiler.asm.amd64.AMD64MacroAssembler in project graal by oracle.

the class AMD64ArrayEqualsOp method emitAVXCompare.

/**
 * Emits code that compares the arrays {@code AVX_VECTOR_SIZE} bytes at a time using AVX vector
 * loads. Asserts that the target supports AVX2.
 *
 * <p>
 * Emitted layout, in order: split the byte count into a vector part ({@code length}) and a tail
 * part ({@code result}); a main loop that XORs one vector from each array and tests the result
 * for zero; an optional out-of-line NaN check for float kinds; one final vector compare ending at
 * the end of the arrays; and finally the {@code compareTail} label, where {@code length} is set
 * to the tail count and control falls through to whatever the caller emits next.
 *
 * @param crb compilation result builder; provides the target for the AVX2 assertion and loop
 *            alignment, and is passed on to the float-compare helper
 * @param masm assembler the code is emitted into
 * @param result register that is masked down to the tail byte count; presumably holds the total
 *            byte count on entry (TODO confirm against caller)
 * @param array1 base register of the first array; advanced past the vector part when the loop is
 *            entered
 * @param array2 base register of the second array; advanced the same way as {@code array1}
 * @param length register that is masked down to the vector byte count, then negated and used as
 *            the loop index
 * @param trueLabel jump target when the compared bytes are found equal
 * @param falseLabel jump target when a mismatch is found
 */
private void emitAVXCompare(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register result, Register array1, Register array2, Register length, Label trueLabel, Label falseLabel) {
    assert supportsAVX2(crb.target);
    Register vector1 = asRegister(vectorTemp1, AMD64Kind.DOUBLE);
    Register vector2 = asRegister(vectorTemp2, AMD64Kind.DOUBLE);
    Label loop = new Label();
    Label compareTail = new Label();
    // For float kinds a raw bitwise mismatch is not routed straight to falseLabel; it goes
    // through the nanCheck path below first.
    boolean requiresNaNCheck = kind.isNumericFloat();
    Label loopCheck = new Label();
    Label nanCheck = new Label();
    // Compare AVX_VECTOR_SIZE-byte vectors
    // tail count (in bytes)
    masm.andl(result, AVX_VECTOR_SIZE - 1);
    // vector count (in bytes)
    masm.andl(length, ~(AVX_VECTOR_SIZE - 1));
    // No full vector to compare: skip the loop and the end-aligned compare entirely.
    masm.jcc(ConditionFlag.Zero, compareTail);
    // Point both arrays at the end of the vector part and negate length, so the loop can index
    // with a negative offset that counts up towards zero.
    masm.leaq(array1, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.leaq(array2, new AMD64Address(array2, length, Scale.Times1, 0));
    masm.negq(length);
    // Align the main loop
    masm.align(crb.target.wordSize * 2);
    masm.bind(loop);
    masm.vmovdqu(vector1, new AMD64Address(array1, length, Scale.Times1, 0));
    masm.vmovdqu(vector2, new AMD64Address(array2, length, Scale.Times1, 0));
    // XOR leaves a non-zero bit wherever the two vectors differ; vptest of the result against
    // itself then sets ZF only when every bit is zero, i.e. the vectors are identical.
    masm.vpxor(vector1, vector1, vector2);
    masm.vptest(vector1, vector1);
    masm.jcc(ConditionFlag.NotZero, requiresNaNCheck ? nanCheck : falseLabel);
    // Re-entry point for the NaN check once it has accepted the current vector.
    masm.bind(loopCheck);
    // The add sets ZF once the negative index reaches zero, which terminates the loop.
    masm.addq(length, AVX_VECTOR_SIZE);
    masm.jcc(ConditionFlag.NotZero, loop);
    // All full vectors matched; with no tail bytes left the arrays are equal.
    masm.testl(result, result);
    masm.jcc(ConditionFlag.Zero, trueLabel);
    if (requiresNaNCheck) {
        // The NaN check sits outside the main loop, so the fall-through path must jump over it.
        Label unalignedCheck = new Label();
        masm.jmpb(unalignedCheck);
        masm.bind(nanCheck);
        // NOTE(review): emitFloatCompareWithinRange presumably jumps to falseLabel on a real
        // mismatch and falls through otherwise - confirm against its definition.
        emitFloatCompareWithinRange(crb, masm, array1, array2, length, 0, falseLabel, AVX_VECTOR_SIZE);
        masm.jmpb(loopCheck);
        masm.bind(unalignedCheck);
    }
    /*
     * Compare the remaining bytes with an unaligned memory load aligned to the end of the
     * array.
     */
    masm.vmovdqu(vector1, new AMD64Address(array1, result, Scale.Times1, -AVX_VECTOR_SIZE));
    masm.vmovdqu(vector2, new AMD64Address(array2, result, Scale.Times1, -AVX_VECTOR_SIZE));
    masm.vpxor(vector1, vector1, vector2);
    masm.vptest(vector1, vector1);
    if (requiresNaNCheck) {
        masm.jcc(ConditionFlag.Zero, trueLabel);
        emitFloatCompareWithinRange(crb, masm, array1, array2, result, -AVX_VECTOR_SIZE, falseLabel, AVX_VECTOR_SIZE);
    } else {
        masm.jcc(ConditionFlag.NotZero, falseLabel);
    }
    masm.jmp(trueLabel);
    // Reached only when there was no full vector: hand the tail byte count back in length and
    // fall through to the code emitted after this method.
    masm.bind(compareTail);
    masm.movl(length, result);
}
Also used : Register(jdk.vm.ci.code.Register) ValueUtil.asRegister(jdk.vm.ci.code.ValueUtil.asRegister) Label(org.graalvm.compiler.asm.Label) AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Example 35 with AMD64MacroAssembler

use of org.graalvm.compiler.asm.amd64.AMD64MacroAssembler in project graal by oracle.

the class AMD64DecrementingSafepointCheckOp method emitCode.

/**
 * Emits a single instruction that decrements, by one, the 32-bit memory slot located at the
 * thread register plus the thread-local "safepoint requested" offset.
 *
 * @param crb compilation result builder; its code cache supplies the register configuration
 * @param masm assembler the decrement is emitted into
 */
@Override
public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
    assert SubstrateOptions.MultiThreaded.getValue();

    final SubstrateRegisterConfig registerConfig = (SubstrateRegisterConfig) crb.codeCache.getRegisterConfig();
    // toIntExact guards against an offset that does not fit the address displacement.
    final AMD64Address safepointRequestedSlot = new AMD64Address(
                    registerConfig.getThreadRegister(),
                    Math.toIntExact(Safepoint.getThreadLocalSafepointRequestedOffset()));

    masm.decrementl(safepointRequestedSlot, 1);
}
Also used : SubstrateRegisterConfig(com.oracle.svm.core.graal.meta.SubstrateRegisterConfig) AMD64Address(org.graalvm.compiler.asm.amd64.AMD64Address)

Aggregations

AMD64Address (org.graalvm.compiler.asm.amd64.AMD64Address) 32; Register (jdk.vm.ci.code.Register) 23; ValueUtil.asRegister (jdk.vm.ci.code.ValueUtil.asRegister) 19; Label (org.graalvm.compiler.asm.Label) 16; ArrayDataPointerConstant (org.graalvm.compiler.lir.asm.ArrayDataPointerConstant) 7; AMD64MacroAssembler (org.graalvm.compiler.asm.amd64.AMD64MacroAssembler) 6; CallingConvention (jdk.vm.ci.code.CallingConvention) 4; RegisterConfig (jdk.vm.ci.code.RegisterConfig) 4; TargetDescription (jdk.vm.ci.code.TargetDescription) 3; AMD64Kind (jdk.vm.ci.amd64.AMD64Kind) 2; DataSectionReference (jdk.vm.ci.code.site.DataSectionReference) 2; AssemblerTest (org.graalvm.compiler.asm.test.AssemblerTest) 2; CompilationResult (org.graalvm.compiler.code.CompilationResult) 2; Data (org.graalvm.compiler.code.DataSection.Data) 2; RawData (org.graalvm.compiler.code.DataSection.RawData) 2; SerializableData (org.graalvm.compiler.code.DataSection.SerializableData) 2; FrameMap (org.graalvm.compiler.lir.framemap.FrameMap) 2; Test (org.junit.Test) 2; CGlobalDataReference (com.oracle.svm.core.graal.code.CGlobalDataReference) 1; SubstrateRegisterConfig (com.oracle.svm.core.graal.meta.SubstrateRegisterConfig) 1