bn_mul.h 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072
  1. /**
  2. * \file bn_mul.h
  3. *
  4. * \brief Multi-precision integer library
  5. */
  6. /*
  7. * Copyright The Mbed TLS Contributors
  8. * SPDX-License-Identifier: Apache-2.0
  9. *
  10. * Licensed under the Apache License, Version 2.0 (the "License"); you may
  11. * not use this file except in compliance with the License.
  12. * You may obtain a copy of the License at
  13. *
  14. * http://www.apache.org/licenses/LICENSE-2.0
  15. *
  16. * Unless required by applicable law or agreed to in writing, software
  17. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  18. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19. * See the License for the specific language governing permissions and
  20. * limitations under the License.
  21. */
  /*
   * Multiply source vector [s] with b, add result
   * to destination vector [d] and set carry c.
   *
   * Currently supports:
   *
   *   . IA-32 (386+)         . AMD64 / EM64T
   *   . IA-32 (SSE2)         . Motorola 68000
   *   . PowerPC, 32-bit      . MicroBlaze
   *   . PowerPC, 64-bit      . TriCore
   *   . SPARC v8 (disabled)  . ARM v3+
   *   . Alpha                . MIPS32
   *   . AArch64
   *   . C, longlong          . C, generic
   */
  36. #ifndef MBEDTLS_BN_MUL_H
  37. #define MBEDTLS_BN_MUL_H
  38. #include "mbedtls/build_info.h"
  39. #include "mbedtls/bignum.h"
  /*
   * Conversion macros for embedded constants:
   * build lists of mbedtls_mpi_uint's from lists of unsigned char's grouped
   * by 8, 4 or 2.
   *
   * Bytes are passed least-significant first: argument (a) lands in bits 0..7,
   * (b) in bits 8..15, and so on.
   *
   * Every macro that expands to a single limb is fully parenthesized so that
   * it stays correct inside a larger expression. With 32-bit limbs,
   * MBEDTLS_BYTES_TO_T_UINT_8 expands to TWO comma-separated limbs and is
   * therefore only usable inside an initializer list.
   */
  #if defined(MBEDTLS_HAVE_INT32)

  #define MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d) \
      (((mbedtls_mpi_uint) (a) <<  0) | \
       ((mbedtls_mpi_uint) (b) <<  8) | \
       ((mbedtls_mpi_uint) (c) << 16) | \
       ((mbedtls_mpi_uint) (d) << 24))

  #define MBEDTLS_BYTES_TO_T_UINT_2(a, b) \
      MBEDTLS_BYTES_TO_T_UINT_4(a, b, 0, 0)

  /* Two limbs: low half first, then high half. */
  #define MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, e, f, g, h) \
      MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d), \
      MBEDTLS_BYTES_TO_T_UINT_4(e, f, g, h)

  #else /* 64-bits */

  #define MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, e, f, g, h) \
      (((mbedtls_mpi_uint) (a) <<  0) | \
       ((mbedtls_mpi_uint) (b) <<  8) | \
       ((mbedtls_mpi_uint) (c) << 16) | \
       ((mbedtls_mpi_uint) (d) << 24) | \
       ((mbedtls_mpi_uint) (e) << 32) | \
       ((mbedtls_mpi_uint) (f) << 40) | \
       ((mbedtls_mpi_uint) (g) << 48) | \
       ((mbedtls_mpi_uint) (h) << 56))

  #define MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d) \
      MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, 0, 0, 0, 0)

  #define MBEDTLS_BYTES_TO_T_UINT_2(a, b) \
      MBEDTLS_BYTES_TO_T_UINT_8(a, b, 0, 0, 0, 0, 0, 0)

  #endif /* bits in mbedtls_mpi_uint */
  70. /* *INDENT-OFF* */
  71. #if defined(MBEDTLS_HAVE_ASM)
  72. /* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
  73. #if defined(__GNUC__) && \
  74. ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
  /*
   * GCC < 5.0 treated the x86 ebx (which is used for the GOT) as a
   * fixed reserved register when building as PIC, leading to errors
   * like: bn_mul.h:46:13: error: PIC register clobbered by 'ebx' in 'asm'
   *
   * This is fixed by an improved register allocator in GCC 5+. From the
   * release notes:
   * Register allocation improvements: Reuse of the PIC hard register,
   * instead of using a fixed register, was implemented on x86/x86-64
   * targets. This improves generated PIC code performance as more hard
   * registers can be used.
   */
  #if defined(__GNUC__) && __GNUC__ < 5 && defined(__PIC__)
  #define MULADDC_CANNOT_USE_EBX
  #endif

  /*
   * Disable use of the i386 assembly code below if option -O0, to disable all
   * compiler optimisations, is passed, detected with __OPTIMIZE__
   * This is done as the number of registers used in the assembly code doesn't
   * work with the -O0 option.
   */
  #if defined(__i386__) && defined(__OPTIMIZE__) && !defined(MULADDC_CANNOT_USE_EBX)

  /*
   * IA-32, one limb per CORE.
   *
   * The three macros are string/operand fragments that only form a complete
   * statement when used as INIT CORE... STOP. The expansion site must provide
   * the variables s (source pointer), d (destination pointer), c (carry limb)
   * and b (multiplier).
   *
   * INIT saves ebx in the local t and loads s -> esi, d -> edi, c -> ecx,
   * b -> ebx. Each CORE performs *d += *s * b + carry, leaves the new carry
   * in ecx and advances both pointers (lodsl/stosl). STOP restores ebx and
   * writes the carry and both pointers back to c, d and s.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      { mbedtls_mpi_uint t; \
      __asm__( \
          "movl %%ebx, %0 \n\t" \
          "movl %5, %%esi \n\t" \
          "movl %6, %%edi \n\t" \
          "movl %7, %%ecx \n\t" \
          "movl %8, %%ebx \n\t"

  #define MULADDC_X1_CORE \
          "lodsl \n\t" \
          "mull %%ebx \n\t" \
          "addl %%ecx, %%eax \n\t" \
          "adcl $0, %%edx \n\t" \
          "addl (%%edi), %%eax \n\t" \
          "adcl $0, %%edx \n\t" \
          "movl %%edx, %%ecx \n\t" \
          "stosl \n\t"

  #define MULADDC_X1_STOP \
          "movl %4, %%ebx \n\t" \
          "movl %%ecx, %1 \n\t" \
          "movl %%edi, %2 \n\t" \
          "movl %%esi, %3 \n\t" \
          : "=m" (t), "=m" (c), "=m" (d), "=m" (s) \
          : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b) \
          : "eax", "ebx", "ecx", "edx", "esi", "edi" \
      ); }

  #if defined(MBEDTLS_HAVE_SSE2)

  /*
   * IA-32 with SSE2, eight limbs per CORE.
   *
   * Shares the X1 register set-up. The CORE uses pmuludq on the MMX registers
   * to process the limbs at byte offsets 0..28 of s and d, then advances both
   * pointers by 32 and moves the final carry back to ecx. STOP issues emms
   * before the usual write-back.
   */
  #define MULADDC_X8_INIT MULADDC_X1_INIT

  #define MULADDC_X8_CORE \
          "movd %%ecx, %%mm1 \n\t" \
          "movd %%ebx, %%mm0 \n\t" \
          "movd (%%edi), %%mm3 \n\t" \
          "paddq %%mm3, %%mm1 \n\t" \
          "movd (%%esi), %%mm2 \n\t" \
          "pmuludq %%mm0, %%mm2 \n\t" \
          "movd 4(%%esi), %%mm4 \n\t" \
          "pmuludq %%mm0, %%mm4 \n\t" \
          "movd 8(%%esi), %%mm6 \n\t" \
          "pmuludq %%mm0, %%mm6 \n\t" \
          "movd 12(%%esi), %%mm7 \n\t" \
          "pmuludq %%mm0, %%mm7 \n\t" \
          "paddq %%mm2, %%mm1 \n\t" \
          "movd 4(%%edi), %%mm3 \n\t" \
          "paddq %%mm4, %%mm3 \n\t" \
          "movd 8(%%edi), %%mm5 \n\t" \
          "paddq %%mm6, %%mm5 \n\t" \
          "movd 12(%%edi), %%mm4 \n\t" \
          "paddq %%mm4, %%mm7 \n\t" \
          "movd %%mm1, (%%edi) \n\t" \
          "movd 16(%%esi), %%mm2 \n\t" \
          "pmuludq %%mm0, %%mm2 \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "movd 20(%%esi), %%mm4 \n\t" \
          "pmuludq %%mm0, %%mm4 \n\t" \
          "paddq %%mm3, %%mm1 \n\t" \
          "movd 24(%%esi), %%mm6 \n\t" \
          "pmuludq %%mm0, %%mm6 \n\t" \
          "movd %%mm1, 4(%%edi) \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "movd 28(%%esi), %%mm3 \n\t" \
          "pmuludq %%mm0, %%mm3 \n\t" \
          "paddq %%mm5, %%mm1 \n\t" \
          "movd 16(%%edi), %%mm5 \n\t" \
          "paddq %%mm5, %%mm2 \n\t" \
          "movd %%mm1, 8(%%edi) \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "paddq %%mm7, %%mm1 \n\t" \
          "movd 20(%%edi), %%mm5 \n\t" \
          "paddq %%mm5, %%mm4 \n\t" \
          "movd %%mm1, 12(%%edi) \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "paddq %%mm2, %%mm1 \n\t" \
          "movd 24(%%edi), %%mm5 \n\t" \
          "paddq %%mm5, %%mm6 \n\t" \
          "movd %%mm1, 16(%%edi) \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "paddq %%mm4, %%mm1 \n\t" \
          "movd 28(%%edi), %%mm5 \n\t" \
          "paddq %%mm5, %%mm3 \n\t" \
          "movd %%mm1, 20(%%edi) \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "paddq %%mm6, %%mm1 \n\t" \
          "movd %%mm1, 24(%%edi) \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "paddq %%mm3, %%mm1 \n\t" \
          "movd %%mm1, 28(%%edi) \n\t" \
          "addl $32, %%edi \n\t" \
          "addl $32, %%esi \n\t" \
          "psrlq $32, %%mm1 \n\t" \
          "movd %%mm1, %%ecx \n\t"

  /*
   * NOTE: the last line of this macro must NOT end in a backslash, otherwise
   * the following #endif is spliced into the macro body and the conditional
   * nesting of the whole header becomes unbalanced.
   */
  #define MULADDC_X8_STOP \
          "emms \n\t" \
          "movl %4, %%ebx \n\t" \
          "movl %%ecx, %1 \n\t" \
          "movl %%edi, %2 \n\t" \
          "movl %%esi, %3 \n\t" \
          : "=m" (t), "=m" (c), "=m" (d), "=m" (s) \
          : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b) \
          : "eax", "ebx", "ecx", "edx", "esi", "edi" \
      ); }

  #endif /* SSE2 */
  #endif /* i386 */
  #if defined(__amd64__) || defined (__x86_64__)

  /*
   * AMD64 / x86-64, one limb per CORE.
   *
   * Use as INIT CORE... STOP with s (source pointer), d (destination pointer),
   * c (carry limb) and b (multiplier) in scope. The operand constraints pin
   * c to rcx, d to rdi, s to rsi and b to rbx; r8 is zeroed in INIT and used
   * as a constant 0.
   *
   * Each CORE performs *d += *s * b + c, leaves the new carry in c and
   * advances s and d by one 64-bit limb. The "m" operands describe the memory
   * behind s and d as arrays of 16 limbs so the compiler knows it is accessed.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "xorq %%r8, %%r8\n"

  #define MULADDC_X1_CORE \
          "movq (%%rsi), %%rax\n" \
          "mulq %%rbx\n" \
          "addq $8, %%rsi\n" \
          "addq %%rcx, %%rax\n" \
          "movq %%r8, %%rcx\n" \
          "adcq $0, %%rdx\n" \
          "nop \n" \
          "addq %%rax, (%%rdi)\n" \
          "adcq %%rdx, %%rcx\n" \
          "addq $8, %%rdi\n"

  #define MULADDC_X1_STOP \
          : "+c" (c), "+D" (d), "+S" (s), "+m" (*(uint64_t (*)[16]) d) \
          : "b" (b), "m" (*(const uint64_t (*)[16]) s) \
          : "rax", "rdx", "r8" \
      );

  #endif /* AMD64 */
  #if defined(__aarch64__)

  /*
   * AArch64, one limb per CORE.
   *
   * Use as INIT CORE... STOP with s (source pointer), d (destination pointer),
   * c (carry limb) and b (multiplier) in scope. Operands: %0 = c, %1 = d,
   * %2 = s, %4 = b; x4..x7 are scratch.
   *
   * Each CORE loads a limb from s (post-incrementing s), forms the 128-bit
   * product with b (mul/umulh), adds *d and the carry, stores the low limb
   * to d (post-incrementing d) and leaves the high limb in c.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      __asm__(

  #define MULADDC_X1_CORE \
          "ldr x4, [%2], #8 \n\t" \
          "ldr x5, [%1] \n\t" \
          "mul x6, x4, %4 \n\t" \
          "umulh x7, x4, %4 \n\t" \
          "adds x5, x5, x6 \n\t" \
          "adc x7, x7, xzr \n\t" \
          "adds x5, x5, %0 \n\t" \
          "adc %0, x7, xzr \n\t" \
          "str x5, [%1], #8 \n\t"

  #define MULADDC_X1_STOP \
          : "+r" (c), "+r" (d), "+r" (s), "+m" (*(uint64_t (*)[16]) d) \
          : "r" (b), "m" (*(const uint64_t (*)[16]) s) \
          : "x4", "x5", "x6", "x7", "cc" \
      );

  #endif /* Aarch64 */
  #if defined(__mc68020__) || defined(__mcpu32__)

  /*
   * Motorola 68020 / CPU32, one limb per CORE.
   *
   * Use as INIT CORE... STOP with s, d (limb pointers), c (carry limb) and
   * b (multiplier) in scope. INIT loads s -> a2, d -> a3, c -> d3, b -> d2
   * and clears d0, which is then used as a constant 0 for addxl. STOP writes
   * d3, a3 and a2 back to c, d and s.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "movl %3, %%a2 \n\t" \
          "movl %4, %%a3 \n\t" \
          "movl %5, %%d3 \n\t" \
          "movl %6, %%d2 \n\t" \
          "moveq #0, %%d0 \n\t"

  #define MULADDC_X1_CORE \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d4:%%d1 \n\t" \
          "addl %%d3, %%d1 \n\t" \
          "addxl %%d0, %%d4 \n\t" \
          "moveq #0, %%d3 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "addxl %%d4, %%d3 \n\t"

  #define MULADDC_X1_STOP \
          "movl %%d3, %0 \n\t" \
          "movl %%a3, %1 \n\t" \
          "movl %%a2, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "d0", "d1", "d2", "d3", "d4", "a2", "a3" \
      );

  /*
   * Eight limbs per CORE. Same register set-up and write-back as X1; the
   * high product word alternates between d4 and d3 from one limb to the next
   * and the carry is propagated with addxl.
   */
  #define MULADDC_X8_INIT MULADDC_X1_INIT

  #define MULADDC_X8_CORE \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d4:%%d1 \n\t" \
          "addxl %%d3, %%d1 \n\t" \
          "addxl %%d0, %%d4 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d3:%%d1 \n\t" \
          "addxl %%d4, %%d1 \n\t" \
          "addxl %%d0, %%d3 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d4:%%d1 \n\t" \
          "addxl %%d3, %%d1 \n\t" \
          "addxl %%d0, %%d4 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d3:%%d1 \n\t" \
          "addxl %%d4, %%d1 \n\t" \
          "addxl %%d0, %%d3 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d4:%%d1 \n\t" \
          "addxl %%d3, %%d1 \n\t" \
          "addxl %%d0, %%d4 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d3:%%d1 \n\t" \
          "addxl %%d4, %%d1 \n\t" \
          "addxl %%d0, %%d3 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d4:%%d1 \n\t" \
          "addxl %%d3, %%d1 \n\t" \
          "addxl %%d0, %%d4 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "movel %%a2@+, %%d1 \n\t" \
          "mulul %%d2, %%d3:%%d1 \n\t" \
          "addxl %%d4, %%d1 \n\t" \
          "addxl %%d0, %%d3 \n\t" \
          "addl %%d1, %%a3@+ \n\t" \
          "addxl %%d0, %%d3 \n\t"

  #define MULADDC_X8_STOP MULADDC_X1_STOP

  #endif /* MC68000 */
  /*
   * PowerPC, 64-bit and 32-bit, one limb per CORE.
   *
   * Use as INIT CORE... STOP with s, d (limb pointers), c (carry limb) and
   * b (multiplier) in scope. All four variants follow the same scheme:
   * INIT loads s -> r3, d -> r4, c -> r5, b -> r6 and pre-decrements both
   * pointers by one limb because CORE uses the update forms of load/store
   * (ldu/stdu, lwzu/stwu). STOP folds the pending carry into r5, re-adjusts
   * the pointers by one limb and writes r5, r4, r3 back to c, d, s.
   *
   * The Apple (Mach-O) variants spell registers "r3"; the others spell them
   * "%%r3". The 64-bit variants use ld/std/mulld/mulhdu with an 8-byte
   * stride, the 32-bit ones lwz/stw/mullw/mulhwu with a 4-byte stride.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #if defined(__powerpc64__) || defined(__ppc64__)

  #if defined(__MACH__) && defined(__APPLE__)

  #define MULADDC_X1_INIT \
      __asm__( \
          "ld r3, %3 \n\t" \
          "ld r4, %4 \n\t" \
          "ld r5, %5 \n\t" \
          "ld r6, %6 \n\t" \
          "addi r3, r3, -8 \n\t" \
          "addi r4, r4, -8 \n\t" \
          "addic r5, r5, 0 \n\t"

  #define MULADDC_X1_CORE \
          "ldu r7, 8(r3) \n\t" \
          "mulld r8, r7, r6 \n\t" \
          "mulhdu r9, r7, r6 \n\t" \
          "adde r8, r8, r5 \n\t" \
          "ld r7, 8(r4) \n\t" \
          "addze r5, r9 \n\t" \
          "addc r8, r8, r7 \n\t" \
          "stdu r8, 8(r4) \n\t"

  #define MULADDC_X1_STOP \
          "addze r5, r5 \n\t" \
          "addi r4, r4, 8 \n\t" \
          "addi r3, r3, 8 \n\t" \
          "std r5, %0 \n\t" \
          "std r4, %1 \n\t" \
          "std r3, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
      );

  #else /* __MACH__ && __APPLE__ */

  #define MULADDC_X1_INIT \
      __asm__( \
          "ld %%r3, %3 \n\t" \
          "ld %%r4, %4 \n\t" \
          "ld %%r5, %5 \n\t" \
          "ld %%r6, %6 \n\t" \
          "addi %%r3, %%r3, -8 \n\t" \
          "addi %%r4, %%r4, -8 \n\t" \
          "addic %%r5, %%r5, 0 \n\t"

  #define MULADDC_X1_CORE \
          "ldu %%r7, 8(%%r3) \n\t" \
          "mulld %%r8, %%r7, %%r6 \n\t" \
          "mulhdu %%r9, %%r7, %%r6 \n\t" \
          "adde %%r8, %%r8, %%r5 \n\t" \
          "ld %%r7, 8(%%r4) \n\t" \
          "addze %%r5, %%r9 \n\t" \
          "addc %%r8, %%r8, %%r7 \n\t" \
          "stdu %%r8, 8(%%r4) \n\t"

  #define MULADDC_X1_STOP \
          "addze %%r5, %%r5 \n\t" \
          "addi %%r4, %%r4, 8 \n\t" \
          "addi %%r3, %%r3, 8 \n\t" \
          "std %%r5, %0 \n\t" \
          "std %%r4, %1 \n\t" \
          "std %%r3, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
      );

  #endif /* __MACH__ && __APPLE__ */

  #elif defined(__powerpc__) || defined(__ppc__) /* end PPC64/begin PPC32 */

  #if defined(__MACH__) && defined(__APPLE__)

  #define MULADDC_X1_INIT \
      __asm__( \
          "lwz r3, %3 \n\t" \
          "lwz r4, %4 \n\t" \
          "lwz r5, %5 \n\t" \
          "lwz r6, %6 \n\t" \
          "addi r3, r3, -4 \n\t" \
          "addi r4, r4, -4 \n\t" \
          "addic r5, r5, 0 \n\t"

  #define MULADDC_X1_CORE \
          "lwzu r7, 4(r3) \n\t" \
          "mullw r8, r7, r6 \n\t" \
          "mulhwu r9, r7, r6 \n\t" \
          "adde r8, r8, r5 \n\t" \
          "lwz r7, 4(r4) \n\t" \
          "addze r5, r9 \n\t" \
          "addc r8, r8, r7 \n\t" \
          "stwu r8, 4(r4) \n\t"

  #define MULADDC_X1_STOP \
          "addze r5, r5 \n\t" \
          "addi r4, r4, 4 \n\t" \
          "addi r3, r3, 4 \n\t" \
          "stw r5, %0 \n\t" \
          "stw r4, %1 \n\t" \
          "stw r3, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
      );

  #else /* __MACH__ && __APPLE__ */

  #define MULADDC_X1_INIT \
      __asm__( \
          "lwz %%r3, %3 \n\t" \
          "lwz %%r4, %4 \n\t" \
          "lwz %%r5, %5 \n\t" \
          "lwz %%r6, %6 \n\t" \
          "addi %%r3, %%r3, -4 \n\t" \
          "addi %%r4, %%r4, -4 \n\t" \
          "addic %%r5, %%r5, 0 \n\t"

  #define MULADDC_X1_CORE \
          "lwzu %%r7, 4(%%r3) \n\t" \
          "mullw %%r8, %%r7, %%r6 \n\t" \
          "mulhwu %%r9, %%r7, %%r6 \n\t" \
          "adde %%r8, %%r8, %%r5 \n\t" \
          "lwz %%r7, 4(%%r4) \n\t" \
          "addze %%r5, %%r9 \n\t" \
          "addc %%r8, %%r8, %%r7 \n\t" \
          "stwu %%r8, 4(%%r4) \n\t"

  #define MULADDC_X1_STOP \
          "addze %%r5, %%r5 \n\t" \
          "addi %%r4, %%r4, 4 \n\t" \
          "addi %%r3, %%r3, 4 \n\t" \
          "stw %%r5, %0 \n\t" \
          "stw %%r4, %1 \n\t" \
          "stw %%r3, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
      );

  #endif /* __MACH__ && __APPLE__ */

  #endif /* PPC32 */
  /*
   * The Sparc(64) assembly is reported to be broken.
   * Disable it for now, until we're able to fix it.
   *
   * Everything inside this "#if 0" group is dead code kept for reference.
   * Both variants work on 32-bit limbs (ld/st/umul, 4-byte stride); the
   * sparc64 variant only differs in using ldx/stx for the two pointers.
   */
  #if 0 && defined(__sparc__)

  #if defined(__sparc64__)

  #define MULADDC_X1_INIT \
      __asm__( \
          "ldx %3, %%o0 \n\t" \
          "ldx %4, %%o1 \n\t" \
          "ld %5, %%o2 \n\t" \
          "ld %6, %%o3 \n\t"

  #define MULADDC_X1_CORE \
          "ld [%%o0], %%o4 \n\t" \
          "inc 4, %%o0 \n\t" \
          "ld [%%o1], %%o5 \n\t" \
          "umul %%o3, %%o4, %%o4 \n\t" \
          "addcc %%o4, %%o2, %%o4 \n\t" \
          "rd %%y, %%g1 \n\t" \
          "addx %%g1, 0, %%g1 \n\t" \
          "addcc %%o4, %%o5, %%o4 \n\t" \
          "st %%o4, [%%o1] \n\t" \
          "addx %%g1, 0, %%o2 \n\t" \
          "inc 4, %%o1 \n\t"

  #define MULADDC_X1_STOP \
          "st %%o2, %0 \n\t" \
          "stx %%o1, %1 \n\t" \
          "stx %%o0, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "g1", "o0", "o1", "o2", "o3", "o4", \
            "o5" \
      );

  #else /* __sparc64__ */

  #define MULADDC_X1_INIT \
      __asm__( \
          "ld %3, %%o0 \n\t" \
          "ld %4, %%o1 \n\t" \
          "ld %5, %%o2 \n\t" \
          "ld %6, %%o3 \n\t"

  #define MULADDC_X1_CORE \
          "ld [%%o0], %%o4 \n\t" \
          "inc 4, %%o0 \n\t" \
          "ld [%%o1], %%o5 \n\t" \
          "umul %%o3, %%o4, %%o4 \n\t" \
          "addcc %%o4, %%o2, %%o4 \n\t" \
          "rd %%y, %%g1 \n\t" \
          "addx %%g1, 0, %%g1 \n\t" \
          "addcc %%o4, %%o5, %%o4 \n\t" \
          "st %%o4, [%%o1] \n\t" \
          "addx %%g1, 0, %%o2 \n\t" \
          "inc 4, %%o1 \n\t"

  #define MULADDC_X1_STOP \
          "st %%o2, %0 \n\t" \
          "st %%o1, %1 \n\t" \
          "st %%o0, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "g1", "o0", "o1", "o2", "o3", "o4", \
            "o5" \
      );

  #endif /* __sparc64__ */

  #endif /* __sparc__ */
  #if defined(__microblaze__) || defined(microblaze)

  /*
   * MicroBlaze, one 32-bit limb per CORE, built from 16x16-bit multiplies.
   *
   * Use as INIT CORE... STOP with s, d (limb pointers), c (carry limb) and
   * b (multiplier) in scope. INIT loads s -> r3, d -> r4, c -> r5, b -> r6
   * and then splits b: r7 = low 16 bits, r6 = high 16 bits. STOP writes
   * r5, r4, r3 back to c, d, s.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "lwi r3, %3 \n\t" \
          "lwi r4, %4 \n\t" \
          "lwi r5, %5 \n\t" \
          "lwi r6, %6 \n\t" \
          "andi r7, r6, 0xffff \n\t" \
          "bsrli r6, r6, 16 \n\t"

  /*
   * Load the current source limb as two halfwords. Which halfword comes
   * first in memory depends on the byte order, so the target registers
   * r8/r9 are swapped between the two variants.
   */
  #if(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  #define MULADDC_LHUI \
          "lhui r9, r3, 0 \n\t" \
          "addi r3, r3, 2 \n\t" \
          "lhui r8, r3, 0 \n\t"
  #else
  #define MULADDC_LHUI \
          "lhui r8, r3, 0 \n\t" \
          "addi r3, r3, 2 \n\t" \
          "lhui r9, r3, 0 \n\t"
  #endif

  /*
   * Four partial products (r10..r13) are combined into a 64-bit result in
   * r13:r12, then *d and the carry r5 are added; the low word is stored to
   * d and the high word becomes the new carry in r5.
   */
  #define MULADDC_X1_CORE \
          MULADDC_LHUI \
          "addi r3, r3, 2 \n\t" \
          "mul r10, r9, r6 \n\t" \
          "mul r11, r8, r7 \n\t" \
          "mul r12, r9, r7 \n\t" \
          "mul r13, r8, r6 \n\t" \
          "bsrli r8, r10, 16 \n\t" \
          "bsrli r9, r11, 16 \n\t" \
          "add r13, r13, r8 \n\t" \
          "add r13, r13, r9 \n\t" \
          "bslli r10, r10, 16 \n\t" \
          "bslli r11, r11, 16 \n\t" \
          "add r12, r12, r10 \n\t" \
          "addc r13, r13, r0 \n\t" \
          "add r12, r12, r11 \n\t" \
          "addc r13, r13, r0 \n\t" \
          "lwi r10, r4, 0 \n\t" \
          "add r12, r12, r10 \n\t" \
          "addc r13, r13, r0 \n\t" \
          "add r12, r12, r5 \n\t" \
          "addc r5, r13, r0 \n\t" \
          "swi r12, r4, 0 \n\t" \
          "addi r4, r4, 4 \n\t"

  #define MULADDC_X1_STOP \
          "swi r5, %0 \n\t" \
          "swi r4, %1 \n\t" \
          "swi r3, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "r3", "r4", "r5", "r6", "r7", "r8", \
            "r9", "r10", "r11", "r12", "r13" \
      );

  #endif /* MicroBlaze */
  #if defined(__tricore__)

  /*
   * TriCore, one limb per CORE.
   *
   * Use as INIT CORE... STOP with s, d (limb pointers), c (carry limb) and
   * b (multiplier) in scope. INIT loads s -> a2, d -> a3, c -> d4, b -> d1
   * and clears d5. CORE multiplies the source limb by d1 and accumulates
   * with madd.u into e2, adds *d, stores the low word to d and moves the
   * high word into d4 as the new carry. STOP writes d4, a3, a2 back to
   * c, d, s.
   *
   * d5 is written by INIT, so it is listed as clobbered.
   * NOTE(review): assumes the TriCore back end accepts "d5" as a clobber
   * name in the same way as "d4" - confirm with the target toolchain.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "ld.a %%a2, %3 \n\t" \
          "ld.a %%a3, %4 \n\t" \
          "ld.w %%d4, %5 \n\t" \
          "ld.w %%d1, %6 \n\t" \
          "xor %%d5, %%d5 \n\t"

  #define MULADDC_X1_CORE \
          "ld.w %%d0, [%%a2+] \n\t" \
          "madd.u %%e2, %%e4, %%d0, %%d1 \n\t" \
          "ld.w %%d0, [%%a3] \n\t" \
          "addx %%d2, %%d2, %%d0 \n\t" \
          "addc %%d3, %%d3, 0 \n\t" \
          "mov %%d4, %%d3 \n\t" \
          "st.w [%%a3+], %%d2 \n\t"

  #define MULADDC_X1_STOP \
          "st.w %0, %%d4 \n\t" \
          "st.a %1, %%a3 \n\t" \
          "st.a %2, %%a2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "d0", "d1", "e2", "d4", "d5", "a2", "a3" \
      );

  #endif /* TriCore */
  /*
   * Note, gcc -O0 by default uses r7 for the frame pointer, so it complains about
   * our use of r7 below, unless -fomit-frame-pointer is passed.
   *
   * On the other hand, -fomit-frame-pointer is implied by any -Ox options with
   * x !=0, which we can detect using __OPTIMIZE__ (which is also defined by
   * clang and armcc5 under the same conditions).
   *
   * So, only use the optimized assembly below for optimized build, which avoids
   * the build error and is pretty reasonable anyway.
   */
  #if defined(__GNUC__) && !defined(__OPTIMIZE__)
  #define MULADDC_CANNOT_USE_R7
  #endif

  /*
   * 32-bit ARM. Three mutually exclusive variants follow:
   *   - Thumb-1 (no Thumb-2): 16x16-bit multiplies;
   *   - ARMv6+ with the DSP extension: umaal, with a native two-limb X2;
   *   - everything else: umlal.
   * All are used as INIT CORE... STOP with s, d (limb pointers), c (carry
   * limb) and b (multiplier) in scope.
   *
   * __asm__ is used rather than asm so that the expansions also compile in
   * strict ISO C mode.
   */
  #if defined(__arm__) && !defined(MULADDC_CANNOT_USE_R7)

  #if defined(__thumb__) && !defined(__thumb2__)

  /*
   * Thumb-1: INIT loads s -> r0, d -> r1, c -> r2, b -> r3 and parks the
   * high and low 16-bit halves of b in r9 and r8. CORE builds the product
   * from four 16x16 multiplies, adds the carry and *d, stores the low word
   * to d and leaves the new carry in r2.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "ldr r0, %3 \n\t" \
          "ldr r1, %4 \n\t" \
          "ldr r2, %5 \n\t" \
          "ldr r3, %6 \n\t" \
          "lsr r7, r3, #16 \n\t" \
          "mov r9, r7 \n\t" \
          "lsl r7, r3, #16 \n\t" \
          "lsr r7, r7, #16 \n\t" \
          "mov r8, r7 \n\t"

  #define MULADDC_X1_CORE \
          "ldmia r0!, {r6} \n\t" \
          "lsr r7, r6, #16 \n\t" \
          "lsl r6, r6, #16 \n\t" \
          "lsr r6, r6, #16 \n\t" \
          "mov r4, r8 \n\t" \
          "mul r4, r6 \n\t" \
          "mov r3, r9 \n\t" \
          "mul r6, r3 \n\t" \
          "mov r5, r9 \n\t" \
          "mul r5, r7 \n\t" \
          "mov r3, r8 \n\t" \
          "mul r7, r3 \n\t" \
          "lsr r3, r6, #16 \n\t" \
          "add r5, r5, r3 \n\t" \
          "lsr r3, r7, #16 \n\t" \
          "add r5, r5, r3 \n\t" \
          "add r4, r4, r2 \n\t" \
          "mov r2, #0 \n\t" \
          "adc r5, r2 \n\t" \
          "lsl r3, r6, #16 \n\t" \
          "add r4, r4, r3 \n\t" \
          "adc r5, r2 \n\t" \
          "lsl r3, r7, #16 \n\t" \
          "add r4, r4, r3 \n\t" \
          "adc r5, r2 \n\t" \
          "ldr r3, [r1] \n\t" \
          "add r4, r4, r3 \n\t" \
          "adc r2, r5 \n\t" \
          "stmia r1!, {r4} \n\t"

  #define MULADDC_X1_STOP \
          "str r2, %0 \n\t" \
          "str r1, %1 \n\t" \
          "str r0, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "r0", "r1", "r2", "r3", "r4", "r5", \
            "r6", "r7", "r8", "r9", "cc" \
      );

  #elif (__ARM_ARCH >= 6) && \
      defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)

  /*
   * ARMv6+ with DSP: a single umaal computes b * limb + *d + carry. Operands
   * are named and register-allocated by the compiler; tmp_a/tmp_b are
   * early-clobber scratch outputs, and a "memory" clobber covers the stores
   * through d.
   */
  #define MULADDC_X1_INIT \
      { \
          mbedtls_mpi_uint tmp_a, tmp_b; \
          __asm__ volatile (

  #define MULADDC_X1_CORE \
              ".p2align 2 \n\t" \
              "ldr %[a], [%[in]], #4 \n\t" \
              "ldr %[b], [%[acc]] \n\t" \
              "umaal %[b], %[carry], %[scalar], %[a] \n\t" \
              "str %[b], [%[acc]], #4 \n\t"

  #define MULADDC_X1_STOP \
              : [a] "=&r" (tmp_a), \
                [b] "=&r" (tmp_b), \
                [in] "+r" (s), \
                [acc] "+r" (d), \
                [carry] "+l" (c) \
              : [scalar] "r" (b) \
              : "memory" \
          ); \
      }

  #define MULADDC_X2_INIT \
      { \
          mbedtls_mpi_uint tmp_a0, tmp_b0; \
          mbedtls_mpi_uint tmp_a1, tmp_b1; \
          __asm__ volatile (

  /* - Make sure loop is 4-byte aligned to avoid stalls
   *   upon repeated non-word aligned instructions in
   *   some microarchitectures.
   * - Don't use ldm with post-increment or back-to-back
   *   loads with post-increment and same address register
   *   to avoid stalls on some microarchitectures.
   * - Bunch loads and stores to reduce latency on some
   *   microarchitectures. E.g., on Cortex-M4, the first
   *   in a series of load/store operations has latency
   *   2 cycles, while subsequent loads/stores are single-cycle. */
  #define MULADDC_X2_CORE \
              ".p2align 2 \n\t" \
              "ldr %[a0], [%[in]], #+8 \n\t" \
              "ldr %[b0], [%[acc]], #+8 \n\t" \
              "ldr %[a1], [%[in], #-4] \n\t" \
              "ldr %[b1], [%[acc], #-4] \n\t" \
              "umaal %[b0], %[carry], %[scalar], %[a0] \n\t" \
              "umaal %[b1], %[carry], %[scalar], %[a1] \n\t" \
              "str %[b0], [%[acc], #-8] \n\t" \
              "str %[b1], [%[acc], #-4] \n\t"

  #define MULADDC_X2_STOP \
              : [a0] "=&r" (tmp_a0), \
                [b0] "=&r" (tmp_b0), \
                [a1] "=&r" (tmp_a1), \
                [b1] "=&r" (tmp_b1), \
                [in] "+r" (s), \
                [acc] "+r" (d), \
                [carry] "+l" (c) \
              : [scalar] "r" (b) \
              : "memory" \
          ); \
      }

  #else

  /*
   * Generic ARM: INIT loads s -> r0, d -> r1, c -> r2, b -> r3. CORE uses
   * umlal to add b * limb to the 64-bit pair r5:r2 (r5 cleared first), adds
   * *d, stores the low word to d and leaves the new carry in r2.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "ldr r0, %3 \n\t" \
          "ldr r1, %4 \n\t" \
          "ldr r2, %5 \n\t" \
          "ldr r3, %6 \n\t"

  #define MULADDC_X1_CORE \
          "ldr r4, [r0], #4 \n\t" \
          "mov r5, #0 \n\t" \
          "ldr r6, [r1] \n\t" \
          "umlal r2, r5, r3, r4 \n\t" \
          "adds r7, r6, r2 \n\t" \
          "adc r2, r5, #0 \n\t" \
          "str r7, [r1], #4 \n\t"

  #define MULADDC_X1_STOP \
          "str r2, %0 \n\t" \
          "str r1, %1 \n\t" \
          "str r0, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "r0", "r1", "r2", "r3", "r4", "r5", \
            "r6", "r7", "cc" \
      );

  #endif /* Thumb */

  #endif /* ARMv3 */
  #if defined(__alpha__)

  /*
   * Alpha, one 64-bit limb per CORE.
   *
   * Use as INIT CORE... STOP with s, d (limb pointers), c (carry limb) and
   * b (multiplier) in scope. INIT loads s -> $1, d -> $2, c -> $3, b -> $4.
   * CORE forms the 128-bit product with mulq/umulh, adds the carry and *d,
   * detecting each overflow with cmpult, stores the low limb to d and
   * accumulates the new carry in $3. STOP writes $3, $2, $1 back to c, d, s.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "ldq $1, %3 \n\t" \
          "ldq $2, %4 \n\t" \
          "ldq $3, %5 \n\t" \
          "ldq $4, %6 \n\t"

  #define MULADDC_X1_CORE \
          "ldq $6, 0($1) \n\t" \
          "addq $1, 8, $1 \n\t" \
          "mulq $6, $4, $7 \n\t" \
          "umulh $6, $4, $6 \n\t" \
          "addq $7, $3, $7 \n\t" \
          "cmpult $7, $3, $3 \n\t" \
          "ldq $5, 0($2) \n\t" \
          "addq $7, $5, $7 \n\t" \
          "cmpult $7, $5, $5 \n\t" \
          "stq $7, 0($2) \n\t" \
          "addq $2, 8, $2 \n\t" \
          "addq $6, $3, $3 \n\t" \
          "addq $5, $3, $3 \n\t"

  #define MULADDC_X1_STOP \
          "stq $3, %0 \n\t" \
          "stq $2, %1 \n\t" \
          "stq $1, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "$1", "$2", "$3", "$4", "$5", "$6", "$7" \
      );

  #endif /* Alpha */
  #if defined(__mips__) && !defined(__mips64)

  /*
   * MIPS32, one 32-bit limb per CORE.
   *
   * Use as INIT CORE... STOP with s, d (limb pointers), c (carry limb) and
   * b (multiplier) in scope. INIT loads s -> $10, d -> $11, c -> $12,
   * b -> $13. CORE multiplies with multu (result in hi/lo), adds the carry
   * and *d, detecting each overflow with sltu, stores the low word to d and
   * accumulates the new carry in $12. STOP writes $12, $11, $10 back to
   * c, d, s.
   *
   * __asm__ is used rather than asm so that the expansion also compiles in
   * strict ISO C mode.
   */
  #define MULADDC_X1_INIT \
      __asm__( \
          "lw $10, %3 \n\t" \
          "lw $11, %4 \n\t" \
          "lw $12, %5 \n\t" \
          "lw $13, %6 \n\t"

  #define MULADDC_X1_CORE \
          "lw $14, 0($10) \n\t" \
          "multu $13, $14 \n\t" \
          "addi $10, $10, 4 \n\t" \
          "mflo $14 \n\t" \
          "mfhi $9 \n\t" \
          "addu $14, $12, $14 \n\t" \
          "lw $15, 0($11) \n\t" \
          "sltu $12, $14, $12 \n\t" \
          "addu $15, $14, $15 \n\t" \
          "sltu $14, $15, $14 \n\t" \
          "addu $12, $12, $9 \n\t" \
          "sw $15, 0($11) \n\t" \
          "addu $12, $12, $14 \n\t" \
          "addi $11, $11, 4 \n\t"

  #define MULADDC_X1_STOP \
          "sw $12, %0 \n\t" \
          "sw $11, %1 \n\t" \
          "sw $10, %2 \n\t" \
          : "=m" (c), "=m" (d), "=m" (s) \
          : "m" (s), "m" (d), "m" (c), "m" (b) \
          : "$9", "$10", "$11", "$12", "$13", "$14", "$15", "lo", "hi" \
      );

  #endif /* MIPS */
  787. #endif /* GNUC */
  #if (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)

  /*
   * IA-32 for MSVC / Open Watcom, using __asm statements.
   *
   * Use as INIT CORE... STOP with s, d (limb pointers), c (carry limb) and
   * b (multiplier) in scope. INIT loads s -> esi, d -> edi, c -> ecx,
   * b -> ebx. Each CORE performs *d += *s * b + carry, leaves the new carry
   * in ecx and advances both pointers (lodsd/stosd). STOP writes ecx, edi,
   * esi back to c, d, s.
   */
  #define MULADDC_X1_INIT \
      __asm mov esi, s \
      __asm mov edi, d \
      __asm mov ecx, c \
      __asm mov ebx, b

  #define MULADDC_X1_CORE \
      __asm lodsd \
      __asm mul ebx \
      __asm add eax, ecx \
      __asm adc edx, 0 \
      __asm add eax, [edi] \
      __asm adc edx, 0 \
      __asm mov ecx, edx \
      __asm stosd

  #define MULADDC_X1_STOP \
      __asm mov c, ecx \
      __asm mov d, edi \
      __asm mov s, esi

  #if defined(MBEDTLS_HAVE_SSE2)

  /* Emit one raw opcode byte. */
  #define EMIT __asm _emit

  /*
   * SSE2 eight-limb variant, written as raw opcode bytes.
   * NOTE(review): the byte stream appears to be the hand-assembled form of
   * the GCC IA-32 SSE2 sequence (it starts with movd mm1, ecx and STOP
   * starts with emms, 0x0F 0x77) - verify against a disassembly before
   * editing any byte.
   */
  #define MULADDC_X8_INIT MULADDC_X1_INIT

  #define MULADDC_X8_CORE \
      EMIT 0x0F EMIT 0x6E EMIT 0xC9 \
      EMIT 0x0F EMIT 0x6E EMIT 0xC3 \
      EMIT 0x0F EMIT 0x6E EMIT 0x1F \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCB \
      EMIT 0x0F EMIT 0x6E EMIT 0x16 \
      EMIT 0x0F EMIT 0xF4 EMIT 0xD0 \
      EMIT 0x0F EMIT 0x6E EMIT 0x66 EMIT 0x04 \
      EMIT 0x0F EMIT 0xF4 EMIT 0xE0 \
      EMIT 0x0F EMIT 0x6E EMIT 0x76 EMIT 0x08 \
      EMIT 0x0F EMIT 0xF4 EMIT 0xF0 \
      EMIT 0x0F EMIT 0x6E EMIT 0x7E EMIT 0x0C \
      EMIT 0x0F EMIT 0xF4 EMIT 0xF8 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCA \
      EMIT 0x0F EMIT 0x6E EMIT 0x5F EMIT 0x04 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xDC \
      EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x08 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xEE \
      EMIT 0x0F EMIT 0x6E EMIT 0x67 EMIT 0x0C \
      EMIT 0x0F EMIT 0xD4 EMIT 0xFC \
      EMIT 0x0F EMIT 0x7E EMIT 0x0F \
      EMIT 0x0F EMIT 0x6E EMIT 0x56 EMIT 0x10 \
      EMIT 0x0F EMIT 0xF4 EMIT 0xD0 \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0x6E EMIT 0x66 EMIT 0x14 \
      EMIT 0x0F EMIT 0xF4 EMIT 0xE0 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCB \
      EMIT 0x0F EMIT 0x6E EMIT 0x76 EMIT 0x18 \
      EMIT 0x0F EMIT 0xF4 EMIT 0xF0 \
      EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x04 \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0x6E EMIT 0x5E EMIT 0x1C \
      EMIT 0x0F EMIT 0xF4 EMIT 0xD8 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCD \
      EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x10 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xD5 \
      EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x08 \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCF \
      EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x14 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xE5 \
      EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x0C \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCA \
      EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x18 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xF5 \
      EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x10 \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCC \
      EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x1C \
      EMIT 0x0F EMIT 0xD4 EMIT 0xDD \
      EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x14 \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCE \
      EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x18 \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0xD4 EMIT 0xCB \
      EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x1C \
      EMIT 0x83 EMIT 0xC7 EMIT 0x20 \
      EMIT 0x83 EMIT 0xC6 EMIT 0x20 \
      EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
      EMIT 0x0F EMIT 0x7E EMIT 0xC9

  #define MULADDC_X8_STOP \
      EMIT 0x0F EMIT 0x77 \
      __asm mov c, ecx \
      __asm mov d, edi \
      __asm mov s, esi

  #endif /* SSE2 */

  #endif /* MSVC */
  879. #endif /* MBEDTLS_HAVE_ASM */
  /*
   * Portable C fallback, used when no assembly variant defined
   * MULADDC_X1_CORE above.
   *
   * Use as INIT CORE... STOP with the following in scope:
   *   s  pointer to the current source limb (advanced by one per CORE)
   *   d  pointer to the current destination limb (advanced by one per CORE)
   *   c  carry limb (read and updated by every CORE)
   *   b  multiplier limb (read only)
   * Each CORE computes *s * b + c + *d, stores the low limb to *d and leaves
   * the high limb in c. INIT opens a brace-delimited scope holding the
   * temporaries and STOP closes it.
   */
  #if !defined(MULADDC_X1_CORE)

  #if defined(MBEDTLS_HAVE_UDBL)

  /* Variant using the double-width type mbedtls_t_udbl; also needs biL. */
  #define MULADDC_X1_INIT \
      { \
          mbedtls_t_udbl r; \
          mbedtls_mpi_uint r0, r1;

  #define MULADDC_X1_CORE \
          r = *(s++) * (mbedtls_t_udbl) b; \
          r0 = (mbedtls_mpi_uint) r; \
          r1 = (mbedtls_mpi_uint)( r >> biL ); \
          r0 += c; r1 += (r0 < c); \
          r0 += *d; r1 += (r0 < *d); \
          c = r1; *(d++) = r0;

  #define MULADDC_X1_STOP \
      }

  #else /* MBEDTLS_HAVE_UDBL */

  /*
   * Variant using only limb-sized arithmetic: both operands are split into
   * half-limbs of biH bits and the four partial products are recombined,
   * with every "(r0 < x)" test picking up the carry of the preceding add.
   */
  #define MULADDC_X1_INIT \
      { \
          mbedtls_mpi_uint s0, s1, b0, b1; \
          mbedtls_mpi_uint r0, r1, rx, ry; \
          b0 = ( b << biH ) >> biH; \
          b1 = ( b >> biH );

  #define MULADDC_X1_CORE \
          s0 = ( *s << biH ) >> biH; \
          s1 = ( *s >> biH ); s++; \
          rx = s0 * b1; r0 = s0 * b0; \
          ry = s1 * b0; r1 = s1 * b1; \
          r1 += ( rx >> biH ); \
          r1 += ( ry >> biH ); \
          rx <<= biH; ry <<= biH; \
          r0 += rx; r1 += (r0 < rx); \
          r0 += ry; r1 += (r0 < ry); \
          r0 += c; r1 += (r0 < c); \
          r0 += *d; r1 += (r0 < *d); \
          c = r1; *(d++) = r0;

  #define MULADDC_X1_STOP \
      }

  #endif /* C (longlong) */

  #endif /* C (generic) */
  /*
   * Unrolled variants. Unless a platform section above already supplied a
   * native MULADDC_Xn_CORE, each width is built by repeating the CORE of the
   * next smaller width twice and reusing its INIT and STOP, so X2, X4 and X8
   * process 2, 4 and 8 limbs per invocation respectively.
   */
  #if !defined(MULADDC_X2_CORE)
  #define MULADDC_X2_INIT MULADDC_X1_INIT
  #define MULADDC_X2_STOP MULADDC_X1_STOP
  #define MULADDC_X2_CORE MULADDC_X1_CORE MULADDC_X1_CORE
  #endif /* MULADDC_X2_CORE */

  #if !defined(MULADDC_X4_CORE)
  #define MULADDC_X4_INIT MULADDC_X2_INIT
  #define MULADDC_X4_STOP MULADDC_X2_STOP
  #define MULADDC_X4_CORE MULADDC_X2_CORE MULADDC_X2_CORE
  #endif /* MULADDC_X4_CORE */

  #if !defined(MULADDC_X8_CORE)
  #define MULADDC_X8_INIT MULADDC_X4_INIT
  #define MULADDC_X8_STOP MULADDC_X4_STOP
  #define MULADDC_X8_CORE MULADDC_X4_CORE MULADDC_X4_CORE
  #endif /* MULADDC_X8_CORE */
  934. /* *INDENT-ON* */
  935. #endif /* bn_mul.h */