Index: vm/port/src/encoder/ia32_em64t/enc_base.h
===================================================================
--- vm/port/src/encoder/ia32_em64t/enc_base.h (revision 660156)
+++ vm/port/src/encoder/ia32_em64t/enc_base.h (working copy)
@@ -281,7 +281,7 @@
      * The value was increased from '5155' to '8192' to make it aligned
      * for faster access in EncoderBase::lookup().
      */
-    static const unsigned int HASH_MAX = 8192; //5155;
+    static const unsigned int HASH_MAX = 16384; // was 8192, before that 5155. NOTE(review): the block comment above still says '8192'; presumably raised to make room for the opcode rows added by this patch - confirm
     /**
      * @brief Empty value, used in hash-to-opcode map to show an empty slot.
      */
Index: vm/port/src/encoder/ia32_em64t/enc_tabl.cpp
===================================================================
--- vm/port/src/encoder/ia32_em64t/enc_tabl.cpp (revision 660156)
+++ vm/port/src/encoder/ia32_em64t/enc_tabl.cpp (working copy)
@@ -43,7 +43,6 @@
 #endif
 #if !defined(_HAVE_MMX_)
-    #define Mnemonic_PADDQ Mnemonic_Null
     #define Mnemonic_PAND Mnemonic_Null
     #define Mnemonic_POR Mnemonic_Null
     #define Mnemonic_PSUBQ Mnemonic_Null
@@ -1154,10 +1153,21 @@
 BEGIN_MNEMONIC(MOVD, MF_NONE, D_U )
 BEGIN_OPCODES()
     {OpcodeInfo::all, {0x66, 0x0F, 0x6E, _r}, {xmm32, r_m32}, D_U },
+    {OpcodeInfo::all, {0x66, 0x0F, 0x6E, _r}, {xmm64, r_m32}, D_U }, // same 66 0F 6E encoding as the row above, accepted for an xmm64 destination operand
     {OpcodeInfo::all, {0x66, 0x0F, 0x7E, _r}, {r_m32, xmm32}, D_U },
 END_OPCODES()
 END_MNEMONIC()
+BEGIN_MNEMONIC(PADDQ, MF_NONE, DU_U) // moved here from the MMX-only section below: the MMX row stays under _HAVE_MMX_, the 66-prefixed XMM row is unconditional
+BEGIN_OPCODES()
+#ifdef _HAVE_MMX_
+    {OpcodeInfo::all, {0x0F, 0xD4, _r}, {mm64, mm_m64}, DU_U },
+#endif
+    {OpcodeInfo::all, {0x66, 0x0F, 0xD4, _r}, {xmm64, xmm_m64}, DU_U },
+END_OPCODES()
+END_MNEMONIC()
+
+
 //
 // A bunch of MMX instructions
 //
@@ -1169,12 +1179,6 @@
 END_OPCODES()
 END_MNEMONIC()
-BEGIN_MNEMONIC(PADDQ, MF_NONE, DU_U)
-BEGIN_OPCODES()
-    {OpcodeInfo::all, {0x0F, 0xD4, _r}, {mm64, mm_m64}, DU_U },
-END_OPCODES()
-END_MNEMONIC()
-
 BEGIN_MNEMONIC(PAND, MF_NONE, DU_U)
 BEGIN_OPCODES()
     {OpcodeInfo::all, {0x0F, 0xDB, _r}, {mm64, mm_m64}, DU_U },
@@ -1195,7 +1199,7 @@
 #endif // ~_HAVE_MMX_
-BEGIN_MNEMONIC(PXOR, MF_NONE, DU_U)
+BEGIN_MNEMONIC(PXOR, MF_SAME_ARG_NO_USE, DU_U) // NOTE(review): presumably tells the IR that "pxor x, x" does not read x - confirm against the MF_SAME_ARG_NO_USE definition
 BEGIN_OPCODES()
 #ifdef _HAVE_MMX_
     {OpcodeInfo::all, {0x0F, 0xEF, _r}, {mm64, mm_m64}, DU_U },
@@ -1525,6 +1529,59 @@
 END_OPCODES()
 END_MNEMONIC()
+
+///////////////////////////////
+
+BEGIN_MNEMONIC(MOVDQA, MF_NONE, D_U )
+BEGIN_OPCODES()
+    //Note: they're actually 128 bits
+    {OpcodeInfo::all, {0x66, 0x0F, 0x6F, _r}, {xmm64, xmm_m64}, D_U }, // load form: xmm <- xmm/mem
+    {OpcodeInfo::all, {0x66, 0x0F, 0x7F, _r}, {xmm_m64, xmm64}, D_U }, // store form: xmm/mem <- xmm
+END_OPCODES()
+END_MNEMONIC()
+
+BEGIN_MNEMONIC(PSHUFD, MF_NONE, D_U_U )
+BEGIN_OPCODES()
+    //Note: they're actually 128 bits
+    {OpcodeInfo::all, {0x66, 0x0F, 0x70, _r, ib}, {xmm64, xmm_m64, imm8}, D_U_U },
+END_OPCODES()
+END_MNEMONIC()
+
+BEGIN_MNEMONIC(PMULUDQ, MF_NONE, DU_U )
+BEGIN_OPCODES()
+    //Note: they're actually 128 bits
+    {OpcodeInfo::all, {0x66, 0x0F, 0xF4, _r}, {xmm64, xmm_m64}, DU_U },
+END_OPCODES()
+END_MNEMONIC()
+
+BEGIN_MNEMONIC(PSRLQ, MF_NONE, DU_U )
+BEGIN_OPCODES()
+    {OpcodeInfo::all, {0x66, 0x0F, 0x73, _2, ib}, {xmm64, imm8}, DU_U }, // only the immediate-count form is described
+END_OPCODES()
+END_MNEMONIC()
+
+BEGIN_MNEMONIC(PSLLQ, MF_NONE, DU_U )
+BEGIN_OPCODES()
+    {OpcodeInfo::all, {0x66, 0x0F, 0x73, _6, ib}, {xmm64, imm8}, DU_U }, // only the immediate-count form is described
+END_OPCODES()
+END_MNEMONIC()
+
+BEGIN_MNEMONIC(PUNPCKLQDQ, MF_NONE, DU_U )
+BEGIN_OPCODES()
+    {OpcodeInfo::all, {0x66, 0x0F, 0x6C, _r}, {xmm64, xmm_m64}, DU_U },
+END_OPCODES()
+END_MNEMONIC()
+
+BEGIN_MNEMONIC(PUNPCKLDQ, MF_NONE, DU_U )
+BEGIN_OPCODES()
+    {OpcodeInfo::all, {0x66, 0x0F, 0x62, _r}, {xmm64, xmm_m64}, DU_U },
+END_OPCODES()
+END_MNEMONIC()
+
+
+///////////////////////////////
+
+
 //
 // String operations
 //
Index: vm/port/src/encoder/ia32_em64t/enc_defs.h
===================================================================
--- vm/port/src/encoder/ia32_em64t/enc_defs.h (revision 660156)
+++ vm/port/src/encoder/ia32_em64t/enc_defs.h (working copy)
@@ -21,7 +21,6 @@
 #ifndef _ENCODER_DEFS_H_
 #define _ENCODER_DEFS_H_
-
 // Used to isolate experimental or being tuned encoder into a separate
 // namespace so it can coexist with a stable one in the same bundle.
 #ifdef ENCODER_ISOLATE
@@ -571,8 +570,9 @@
 Mnemonic_OR, // Logical Inclusive OR
 Mnemonic_PREFETCH, // prefetch
+Mnemonic_PADDQ, // Add Packed Quadword Integers
+
 #ifdef _HAVE_MMX_
-    Mnemonic_PADDQ, // Add Packed Quadword Integers
     Mnemonic_PAND, // Logical AND
     Mnemonic_POR, // Bitwise Logical OR
     Mnemonic_PSUBQ, // Subtract Packed Quadword Integers
@@ -635,6 +635,16 @@
 Mnemonic_CVTDQ2PS, // Convert Packed Doubleword Integers to Packed Single-Precision Floating-Point Values
 Mnemonic_CVTTPS2DQ, // Convert with Truncation Packed Single-Precision Floating-Point Values to Packed Doubleword Integers
+
+Mnemonic_MOVDQA, // Move Aligned Double Quadword
+Mnemonic_PSHUFD, // Shuffle Packed Doublewords
+Mnemonic_PMULUDQ, // Multiply Packed Unsigned Doubleword Integers
+Mnemonic_PSRLQ, // Shift Packed Data Right Logical (quadwords)
+Mnemonic_PSLLQ, // Shift Packed Data Left Logical (quadwords)
+Mnemonic_PUNPCKLQDQ, // Unpack Low Data (quadwords)
+Mnemonic_PUNPCKLDQ, // Unpack Low Data (doublewords)
+
+
 //
 // String operations
 //
Index: vm/jitrino/src/codegenerator/ia32/Ia32APIMagics.cpp
===================================================================
--- vm/jitrino/src/codegenerator/ia32/Ia32APIMagics.cpp (revision 660156)
+++ vm/jitrino/src/codegenerator/ia32/Ia32APIMagics.cpp (working copy)
@@ -41,6 +41,7 @@
     U_32 idx = defs.begin();
     return callInst->getOpnd(idx);
 }
+
 static Opnd* getCallSrc(CallInst* callInst, U_32 n) {
     Inst::Opnds uses(callInst, Inst::OpndRole_InstLevel | Inst::OpndRole_Use | Inst::OpndRole_Explicit);
     U_32 idx = uses.begin(); //the first use is call addr
@@ -109,6 +110,7 @@
 DECLARE_HELPER_INLINER(String_indexOf_Handler_x_String_x_I_x_I);
 DECLARE_HELPER_INLINER(Float_floatToRawIntBits_x_F_x_I);
 DECLARE_HELPER_INLINER(Float_intBitsToFloat_x_I_x_F);
+DECLARE_HELPER_INLINER(Multiplication_multPAP_Handler);
 void APIMagicsHandlerSession::runImpl() {
     CompilationContext* cc = getCompilationContext();
@@ -220,6 +222,8 @@
             } else if( strcmp((char*)ri->getValue(0),"String_indexOf")==0 ) {
                 if(getBoolArg("String_indexOf_as_magic", true))
                     handlers.push_back(new (tmpMM) String_indexOf_Handler_x_String_x_I_x_I(irm, callInst, NULL));
+            } else if(
strcmp((char*)ri->getValue(0),"multPAP")==0 ) {
+                handlers.push_back(new (tmpMM) Multiplication_multPAP_Handler(irm, callInst, NULL));
             }
         }
     }
@@ -237,7 +241,167 @@
     }
 }
+// Replaces the "multPAP" helper call with an inline SSE2 multiply-accumulate
+// loop nest over the call's five arguments (num1, num2, res, size_a, size_b).
+// The main nest consumes two 32-bit elements of num1 per outer iteration; if
+// size_a is odd, a second nest handles the remaining element.
+// On _EM64T_ builds the handler returns without touching the call.
+// NOTE(review): the main nest and its inner loop run their first iteration
+// before any bound check, so size_a >= 2 and size_b >= 2 look like preconditions - confirm.
+void Multiplication_multPAP_Handler::run() {
+#ifdef _EM64T_
+    return; // NOTE(review): no expansion here, the helper call is left in the IR as is
+#else
+    Constraint regConstr(OpndKind_GPReg, OpndSize_32);
+    ControlFlowGraph* cfg = irm->getFlowGraph();
+
+    Type* int64Type = irm->getTypeManager().getInt64Type();
+    Type* int32Type = irm->getTypeManager().getInt32Type();
+    Type* int8Type = irm->getTypeManager().getInt8Type();
+    Opnd* xmm0 = irm->newOpnd(int64Type, RegName_XMM0D);
+    Opnd* xmm1 = irm->newOpnd(int64Type, RegName_XMM1D);
+    Opnd* xmm2 = irm->newOpnd(int64Type, RegName_XMM2D);
+    Opnd* xmm3 = irm->newRegOpnd(int64Type, RegName_XMM3D); // NOTE(review): the only operand created with newRegOpnd; the other four use newOpnd - confirm this is intended
+    Opnd* xmm4 = irm->newOpnd(int64Type, RegName_XMM4D);
+    Opnd * const1 = irm->newImmOpnd(int32Type, 1);
+    Opnd * const3 = irm->newImmOpnd(int32Type, 3);
+    Opnd * const4 = irm->newImmOpnd(int32Type, 4);
+    Opnd * const8 = irm->newImmOpnd(int32Type, 8);
+    Opnd * const14 = irm->newImmOpnd(int32Type, 14);
+    Opnd * const32 = irm->newImmOpnd(int32Type, 32);
+    Opnd* zero = irm->newImmOpnd(int32Type,0);
+
+    Node* callInstNode = callInst->getNode();
+    callInst->unlink(); // the call's operands are still read from the unlinked instruction below
+
+    Opnd* num1 = getCallSrc(callInst, 0);
+    Opnd* num2 = getCallSrc(callInst, 1);
+    Opnd* res = getCallSrc(callInst, 2);
+    Opnd* size_a = getCallSrc(callInst, 3);
+    Opnd* size_b = getCallSrc(callInst, 4);
+
+    Node* nextNode = callInstNode->getUnconditionalEdgeTarget();
+    assert(nextNode!=NULL);
+    cfg->removeEdge(callInstNode->getUnconditionalEdge());
+
+    Opnd* num1_count = irm->newOpnd(int32Type, regConstr);
+    Opnd* num2_count = irm->newOpnd(int32Type, regConstr);
+    Opnd* res_base = irm->newOpnd(irm->getTypeManager().getManagedPtrType(int32Type), regConstr);
+
+    Opnd* size_min_1 = irm->newOpnd(int32Type); // inner-loop bound of the main nest: size_b - 1
+    callInstNode->appendInst(irm->newCopyPseudoInst(Mnemonic_MOV, size_min_1, size_b));
+    callInstNode->appendInst(irm->newInst(Mnemonic_SUB, size_min_1, const1));
+
+    Opnd* size_outer_odd = irm->newOpnd(int32Type); // outer-loop bound of the main nest: size_a rounded down to even
+    callInstNode->appendInst(irm->newCopyPseudoInst(Mnemonic_MOV, size_outer_odd, size_a));
+    callInstNode->appendInst(irm->newInst(Mnemonic_AND, size_outer_odd, irm->newImmOpnd(int32Type, ~1))); // clear bit 0 only; the former 0xFFFE mask also dropped bits 16-31 and truncated the bound for size_a >= 65536
+
+    callInstNode->appendInst(irm->newInst(Mnemonic_MOV, num1_count, zero));
+    Opnd* num1Addr = addElemIndexWithLEA(num1, zero, RegName_Null, callInstNode);
+    Opnd* num2Addr = addElemIndexWithLEA(num2, zero, RegName_Null, callInstNode);
+    Opnd* resAddr = addElemIndexWithLEA(res, zero, RegName_Null, callInstNode);
+
+    Node* outerLoop = cfg->createBlockNode();
+    Node* outerLoop_2 = cfg->createBlockNode();
+    Node* innerLoop = cfg->createBlockNode();
+
+    Node* outerLoop_unroll = cfg->createBlockNode();
+    Node* outerLoop_unroll_2 = cfg->createBlockNode();
+    Node* outerLoop_unroll_3 = cfg->createBlockNode();
+    Node* innerLoop_2 = cfg->createBlockNode();
+
+    cfg->addEdge(callInstNode, outerLoop);
+    cfg->addEdge(outerLoop_2, outerLoop_unroll, 0.25);
+    cfg->addEdge(outerLoop_2, outerLoop, 0.75);
+
+    cfg->addEdge(outerLoop_unroll, outerLoop_unroll_2, 0.25);
+    cfg->addEdge(outerLoop_unroll, nextNode, 0.75);
+    cfg->addEdge(outerLoop_unroll_2, innerLoop_2);
+    cfg->addEdge(innerLoop_2, innerLoop_2, 0.75);
+    cfg->addEdge(innerLoop_2, outerLoop_unroll_3, 0.25);
+    cfg->addEdge(outerLoop_unroll_3, nextNode);
+
+    cfg->addEdge(outerLoop, innerLoop);
+    cfg->addEdge(innerLoop, innerLoop, 0.75);
+    cfg->addEdge(innerLoop, outerLoop_2, 0.25);
+
+    outerLoop->appendInst(irm->newInst(Mnemonic_PXOR, xmm3, xmm3));
+    outerLoop->appendInst(irm->newInst(Mnemonic_MOVD, xmm0, irm->newMemOpnd(int32Type, num1Addr, num1_count, const4, 0)));//
+    outerLoop->appendInst(irm->newInst(Mnemonic_MOVD, xmm4, irm->newMemOpnd(int32Type, num1Addr, num1_count, const4, const4)));
+    outerLoop->appendInst(irm->newInst(Mnemonic_PUNPCKLQDQ, xmm0, xmm4));
+    outerLoop->appendInst(irm->newInst(Mnemonic_LEA, res_base, irm->newMemOpnd(int32Type, resAddr, num1_count, irm->newImmOpnd(int32Type, 4), 0)));
+    outerLoop->appendInst(irm->newInst(Mnemonic_MOVD, xmm2, irm->newMemOpnd(int32Type, res_base, 0, 0, 0)));
+    outerLoop->appendInst(irm->newInst(Mnemonic_MOV, num2_count, zero));
+
+    innerLoop->appendInst(irm->newInst(Mnemonic_MOVD, xmm1, irm->newMemOpnd(int32Type, num2Addr, num2_count, const4, 0)));
+    innerLoop->appendInst(irm->newInst(Mnemonic_PSHUFD, xmm1, xmm1, irm->newImmOpnd(int32Type, 68))); // 68 = 0x44: replicate the low quadword into the high one
+    innerLoop->appendInst(irm->newInst(Mnemonic_PMULUDQ, xmm1, xmm0));
+    innerLoop->appendInst(irm->newInst(Mnemonic_MOVD, xmm4, irm->newMemOpnd(int32Type, res_base, num2_count, const4, const4)));
+    innerLoop->appendInst(irm->newInst(Mnemonic_PUNPCKLQDQ, xmm2, xmm4));
+    innerLoop->appendInst(irm->newInst(Mnemonic_PADDQ, xmm1, xmm3));
+    innerLoop->appendInst(irm->newInst(Mnemonic_PADDQ, xmm1, xmm2));
+    innerLoop->appendInst(irm->newInst(Mnemonic_MOVD, irm->newMemOpnd(int32Type, res_base, num2_count, const4, 0), xmm1));
+    innerLoop->appendInst(irm->newInst(Mnemonic_MOVDQA, xmm3, xmm1));
+    innerLoop->appendInst(irm->newInst(Mnemonic_PSRLQ, xmm3, const32));
+    innerLoop->appendInst(irm->newInst(Mnemonic_PSLLQ, xmm1, const32));
+    innerLoop->appendInst(irm->newInst(Mnemonic_PSHUFD, xmm2, xmm1, const3));
+    innerLoop->appendInst(irm->newInst(Mnemonic_ADD, num2_count, irm->newImmOpnd(int8Type, 1)));
+    innerLoop->appendInst(irm->newInst(Mnemonic_CMP, num2_count, size_min_1));
+    innerLoop->appendInst(irm->newBranchInst(Mnemonic_JGE, outerLoop_2, innerLoop));
+
+    // last iteration unrolling
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_MOVD, xmm1, irm->newMemOpnd(int32Type, num2Addr, num2_count, const4, 0)));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PSHUFD, xmm1, xmm1, irm->newImmOpnd(int32Type, 204)));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PMULUDQ, xmm1, xmm0));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PADDQ, xmm1, xmm3));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PADDQ, xmm1, xmm2));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_MOVD, irm->newMemOpnd(int32Type, res_base, num2_count, const4, 0), xmm1));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_MOVDQA, xmm2, xmm1));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PXOR, xmm3, xmm3));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PUNPCKLDQ, xmm2, xmm3));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PADDQ, xmm1, xmm2));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PSHUFD, xmm1, xmm1, const14));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_MOVD, irm->newMemOpnd(int32Type, res_base, num2_count, const4, const4), xmm1));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_PSHUFD, xmm1, xmm1, const1));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_MOVD, irm->newMemOpnd(int32Type, res_base, num2_count, const4, const8), xmm1));
+
+    // inc counter, cmp, jump
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_ADD, num1_count, irm->newImmOpnd(int8Type, 2)));
+    outerLoop_2->appendInst(irm->newInst(Mnemonic_CMP, num1_count, size_outer_odd));
+    outerLoop_2->appendInst(irm->newBranchInst(Mnemonic_JGE, outerLoop_unroll, outerLoop));
+
+    // unroll outer loop if it has odd elements
+    outerLoop_unroll->appendInst(irm->newCopyPseudoInst(Mnemonic_MOV, size_outer_odd, size_a));
+    outerLoop_unroll->appendInst(irm->newInst(Mnemonic_TEST, size_outer_odd, const1));
+    outerLoop_unroll->appendInst(irm->newBranchInst(Mnemonic_JE, nextNode, outerLoop_unroll_2));
+
+    outerLoop_unroll_2->appendInst(irm->newInst(Mnemonic_MOV, num2_count, zero));
+    outerLoop_unroll_2->appendInst(irm->newInst(Mnemonic_PXOR, xmm3, xmm3));
+    outerLoop_unroll_2->appendInst(irm->newInst(Mnemonic_MOVD, xmm0, irm->newMemOpnd(int32Type, num1Addr, num1_count, const4, 0)));
+    outerLoop_unroll_2->appendInst(irm->newInst(Mnemonic_LEA, res_base, irm->newMemOpnd(int32Type, resAddr, num1_count, irm->newImmOpnd(int32Type, 4), 0)));
+    outerLoop_unroll_2->appendInst(irm->newInst(Mnemonic_MOVD, xmm2, irm->newMemOpnd(int32Type, res_base, 0, 0, 0)));
+
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_MOVD, xmm1, irm->newMemOpnd(int32Type, num2Addr, num2_count, const4, 0)));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_PMULUDQ, xmm1, xmm0));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_MOVD, xmm4, irm->newMemOpnd(int32Type, res_base, num2_count, const4, const4)));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_PUNPCKLQDQ, xmm2, xmm4));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_PADDQ, xmm1, xmm3));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_PADDQ, xmm1, xmm2));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_MOVD, irm->newMemOpnd(int32Type, res_base, num2_count, const4, 0), xmm1));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_MOVDQA, xmm3, xmm1));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_PSRLQ, xmm3, const32));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_PSLLQ, xmm1, const32));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_PSHUFD, xmm2, xmm1, const3));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_ADD, num2_count, irm->newImmOpnd(int8Type, 1)));
+    innerLoop_2->appendInst(irm->newInst(Mnemonic_CMP, num2_count, size_b));
+    innerLoop_2->appendInst(irm->newBranchInst(Mnemonic_JGE, outerLoop_unroll_3, innerLoop_2));
+
+    outerLoop_unroll_3->appendInst(irm->newInst(Mnemonic_MOVD, irm->newMemOpnd(int32Type, res_base, num2_count, const4, 0), xmm3));
+#endif
+}
+
+
 void Integer_numberOfLeadingZeros_Handler_x_I_x_I::run() {
     //mov r2,-1
     //bsr r1,arg
Index: vm/jitrino/src/codegenerator/ia32/Ia32InstCodeSelector.cpp
===================================================================
--- vm/jitrino/src/codegenerator/ia32/Ia32InstCodeSelector.cpp (revision 660156)
+++ vm/jitrino/src/codegenerator/ia32/Ia32InstCodeSelector.cpp (working copy)
@@ -248,6 +248,7 @@
     irManager.registerInternalHelperInfo("String_compareTo", IRManager::InternalHelperInfo(NULL,&CallingConvention_STDCALL));
     irManager.registerInternalHelperInfo("String_regionMatches",
IRManager::InternalHelperInfo(NULL,&CallingConvention_STDCALL));
     irManager.registerInternalHelperInfo("String_indexOf", IRManager::InternalHelperInfo(NULL,&CallingConvention_STDCALL));
+    irManager.registerInternalHelperInfo("multPAP", IRManager::InternalHelperInfo(NULL,&CallingConvention_STDCALL)); // registered like the String_* helpers above: NULL address, STDCALL convention
 }
 //_______________________________________________________________________________________________________________
@@ -2882,6 +2883,14 @@
         break;
     }
+    case multPAP:
+    {
+        assert(numArgs == 5);
+        Opnd * newArgs[5] = {(Opnd *)args[0], (Opnd *)args[1], (Opnd *)args[2], (Opnd *)args[3], (Opnd *)args[4]};
+        appendInsts(irManager.newInternalRuntimeHelperCallInst("multPAP", numArgs, newArgs, dstOpnd)); // all five arguments are forwarded unchanged to the "multPAP" internal helper call
+        break;
+    }
+
     default:
     {
         assert(0);
Index: vm/jitrino/src/codegenerator/CodeGenIntfc.h
===================================================================
--- vm/jitrino/src/codegenerator/CodeGenIntfc.h (revision 660156)
+++ vm/jitrino/src/codegenerator/CodeGenIntfc.h (working copy)
@@ -167,7 +167,8 @@
         ArrayCopyReverse,
         StringCompareTo,
         StringRegionMatches,
-        StringIndexOf
+        StringIndexOf,
+        multPAP
     };
 };
Index: vm/jitrino/src/optimizer/HLOAPIMagics.cpp
===================================================================
--- vm/jitrino/src/optimizer/HLOAPIMagics.cpp (revision 660156)
+++ vm/jitrino/src/optimizer/HLOAPIMagics.cpp (working copy)
@@ -99,7 +99,24 @@
     return isOptimizable;
 }
+void
+multPAP_HLO_Handler::run()
+{
+    IRManager* irm = builder->getIRManager(); // NOTE(review): irm is not used anywhere else in this function
+    InstFactory& instFactory = builder->getInstFactory();
+    Opnd* dst = callInst->getDst();
+    Opnd* a = callInst->getSrc(2); // arguments are read from src index 2 onward; NOTE(review): presumably srcs 0-1 are non-argument operands of the call - confirm
+    Opnd* b = callInst->getSrc(3);
+    Opnd* t = callInst->getSrc(4);
+    Opnd* size_a = callInst->getSrc(5);
+    Opnd* size_b = callInst->getSrc(6);
+    Opnd* opnds[] = {a, b, t, size_a, size_b};
+    // This helper call will be processed in Ia32ApiMagics pass
+    instFactory.makeJitHelperCall(dst, multPAP, NULL, NULL, 5, opnds)->insertAfter(callInst);
+    callInst->unlink(); // the original call is removed once the helper call has been inserted after it
+}
+
 void
 System_arraycopy_HLO_Handler::run()
 {
Index: vm/jitrino/src/optimizer/HLOAPIMagics.h
===================================================================
--- vm/jitrino/src/optimizer/HLOAPIMagics.h (revision 660156)
+++ vm/jitrino/src/optimizer/HLOAPIMagics.h (working copy)
@@ -128,6 +128,7 @@
 DECLARE_HLO_MAGIC_INLINER(String_compareTo_HLO_Handler);
 DECLARE_HLO_MAGIC_INLINER(String_regionMatches_HLO_Handler);
 DECLARE_HLO_MAGIC_INLINER(String_indexOf_HLO_Handler);
+DECLARE_HLO_MAGIC_INLINER(multPAP_HLO_Handler);
 DEFINE_SESSION_ACTION(HLOAPIMagicSession, hlo_api_magic, "APIMagics HLO Pass")
@@ -160,6 +161,13 @@
                         handlers.push_back(new (mm) System_arraycopy_HLO_Handler(callInst));
                     }
                 }
+                if (!strcmp(className, "java/math/Multiplication")) {
+                    if (!strcmp(methodName, "multPAP") && !strcmp(signature, "([I[I[III)V")) { // matched by exact name and signature
+                        if (getBoolArg("multPAP_on_SSE", true)) { // NOTE(review): no platform check is visible here, while the code generator handler returns early under _EM64T_ - confirm this path is not reached on EM64T
+                            handlers.push_back(new (mm) multPAP_HLO_Handler(callInst));
+                        }
+                    }
+                }
                 if (!strcmp(className, "java/lang/String")) {
                     if (!strcmp(methodName, "compareTo") && !strcmp(signature, "(Ljava/lang/String;)I")) {
                         if(getBoolArg("String_compareTo_as_magic", true))
Index: vm/jitrino/src/optimizer/escanalyzer.cpp
===================================================================
--- vm/jitrino/src/optimizer/escanalyzer.cpp (revision 660156)
+++ vm/jitrino/src/optimizer/escanalyzer.cpp (working copy)
@@ -425,6 +425,7 @@
         case StringCompareTo:
         case StringIndexOf:
         case StringRegionMatches:
+        case multPAP: // shares the handling of the neighbouring String helper ids
         case ClassIsArray:
         case ClassGetAllocationHandle:
         case ClassGetTypeSize:
Index: vm/jitrino/src/optimizer/Inst.cpp
===================================================================
--- vm/jitrino/src/optimizer/Inst.cpp (revision 660156)
+++ vm/jitrino/src/optimizer/Inst.cpp (working copy)
@@ -470,6 +470,8 @@
         os << "StringIndexOf"; break;
     case StringRegionMatches:
         os << "StringRegionMatches"; break;
+    case multPAP:
+        os << "multPAP"; break;
     case ClassIsArray:
         os << "ClassIsArray"; break;
     case ClassGetAllocationHandle:
Index: vm/jitrino/src/optimizer/CodeSelectors.cpp
===================================================================
--- vm/jitrino/src/optimizer/CodeSelectors.cpp (revision 660156)
+++ vm/jitrino/src/optimizer/CodeSelectors.cpp (working copy)
@@ -403,6 +403,7 @@
     case StringCompareTo: return JitHelperCallOp::StringCompareTo;
     case StringRegionMatches: return JitHelperCallOp::StringRegionMatches;
     case StringIndexOf: return JitHelperCallOp::StringIndexOf;
+    case multPAP: return JitHelperCallOp::multPAP;
     default: break;
     }
     crash("\n JIT helper in not supported in LIR : %d\n", callId);
Index: vm/jitrino/src/optimizer/inliner.cpp
===================================================================
--- vm/jitrino/src/optimizer/inliner.cpp (revision 660156)
+++ vm/jitrino/src/optimizer/inliner.cpp (working copy)
@@ -147,6 +147,7 @@
     _inlineSkipMethodTable->add_method_record("java/lang/Math", "log", "(D)D", des, false);
     _inlineSkipMethodTable->add_method_record("java/lang/Math", "log10", "(D)D", des, false);
     _inlineSkipMethodTable->add_method_record("java/lang/Math", "log1p", "(D)D", des, false);
+    _inlineSkipMethodTable->add_method_record("java/math/Multiplication", "multPAP", "([I[I[III)V", des, false); // NOTE(review): added inside the conditional section closed by the #endif below, presumably so the call is not inlined before the HLO magic pass can match it - confirm the guard is the intended one
 #endif
     if(argSource->getBoolArg("System_arraycopy_as_magic",true)) {
         _inlineSkipMethodTable->add_method_record("java/lang/System", "arraycopy", "(Ljava/lang/Object;ILjava/lang/Object;II)V", des, false);
Index: vm/jitrino/src/optimizer/Opcode.h
===================================================================
--- vm/jitrino/src/optimizer/Opcode.h (revision 660156)
+++ vm/jitrino/src/optimizer/Opcode.h (working copy)
@@ -277,6 +277,7 @@
     StringCompareTo,
     StringRegionMatches,
     StringIndexOf,
+    multPAP,
     ClassIsArray,
     ClassGetAllocationHandle,
     ClassGetTypeSize,
Index: vm/jitrino/src/optimizer/memoryopt.cpp
===================================================================
--- vm/jitrino/src/optimizer/memoryopt.cpp (revision 660156)
+++ vm/jitrino/src/optimizer/memoryopt.cpp (working copy)
@@ -638,6 +638,7 @@
     case StringIndexOf:
     case StringRegionMatches:
     case FillArrayWithConst:
+    case multPAP: // shares the handling of the neighbouring helper ids
     case ClassIsArray:
     case ClassGetAllocationHandle:
     case ClassGetTypeSize: