Line data Source code
1 : #line 2 "../src/kernel/none/gcdll.c"
2 : /* Copyright (C) 2000 The PARI group.
3 :
4 : This file is part of the PARI/GP package.
5 :
6 : PARI/GP is free software; you can redistribute it and/or modify it under the
7 : terms of the GNU General Public License as published by the Free Software
8 : Foundation; either version 2 of the License, or (at your option) any later
9 : version. It is distributed in the hope that it will be useful, but WITHOUT
10 : ANY WARRANTY WHATSOEVER.
11 :
12 : Check the License for details. You should have received a copy of it, along
13 : with the package; see the file 'COPYING'. If not, write to the Free Software
14 : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
15 :
16 : /***********************************************************************/
17 : /** **/
18 : /** GCD **/
19 : /** **/
20 : /***********************************************************************/
21 : /* Fast ulong gcd. Called with y odd; x can be arbitrary (but will most of
22 : * the time be smaller than y). */
23 :
24 : /* Gotos are Harmful, and Programming is a Science. E.W.Dijkstra. */
25 : INLINE ulong
26 293628500 : gcduodd(ulong x, ulong y) /* assume y&1==1, y > 1 */
27 : {
28 293628500 : if (!x) return y;
29 : /* fix up x */
30 687873548 : while (!(x&1)) x>>=1;
31 293628500 : if (x==1) return 1;
32 228948855 : if (x==y) return y;
33 220321148 : else if (x>y) goto xislarger;/* will be rare, given how we'll use this */
34 : /* loop invariants: x,y odd and distinct. */
35 176256024 : yislarger:
36 812737595 : if ((x^y)&2) /* ...01, ...11 or vice versa */
37 425699860 : y=(x>>2)+(y>>2)+1; /* ==(x+y)>>2 except it can't overflow */
38 : else /* ...01,...01 or ...11,...11 */
39 387037735 : y=(y-x)>>2; /* now y!=0 in either case */
40 1632479346 : while (!(y&1)) y>>=1; /* kill any windfall-gained powers of 2 */
41 812737595 : if (y==1) return 1; /* comparand == return value... */
42 723482254 : if (x==y) return y; /* this and the next is just one comparison */
43 689100817 : else if (x<y) goto yislarger;/* else fall through to xislarger */
44 :
45 450837236 : xislarger: /* same as above, seen through a mirror */
46 703003426 : if ((x^y)&2)
47 363546203 : x=(x>>2)+(y>>2)+1;
48 : else
49 339457223 : x=(x-y)>>2; /* x!=0 */
50 1362279946 : while (!(x&1)) x>>=1;
51 703003426 : if (x==1) return 1;
52 632487068 : if (x==y) return y;
53 606319056 : else if (x>y) goto xislarger;
54 :
55 398217990 : goto yislarger;
56 : }
57 : /* Gotos are useful, and Programming is an Art. D.E.Knuth. */
58 : /* PS: Of course written with Dijkstra's lessons firmly in mind... --GN */
59 :
60 : /* at least one of a or b is odd, return gcd(a,b) */
61 : INLINE ulong
62 379029220 : mygcduodd(ulong a, ulong b)
63 : {
64 : ulong c;
65 379029220 : if (b&1)
66 : {
67 227419159 : if (a==1 || b==1)
68 40665631 : c = 1;
69 : else
70 186753528 : c = gcduodd(a, b);
71 : }
72 : else
73 : {
74 151610061 : if (a==1)
75 51485952 : c = 1;
76 : else
77 100124109 : c = gcduodd(b, a);
78 : }
79 379048029 : return c;
80 : }
81 :
82 : /* modified right shift binary algorithm with at most one division */
83 : ulong
84 348016818 : ugcd(ulong a,ulong b)
85 : {
86 : long v;
87 348016818 : if (!b) return a;
88 337495757 : if (!a) return b;
89 322765736 : if (a>b) { a %= b; if (!a) return b; }
90 96794417 : else { b %= a; if (!b) return a; }
91 209312935 : v = vals(a|b);
92 209649850 : return mygcduodd(a>>v, b>>v) << v;
93 : }
94 : long
95 28660717 : cgcd(long a,long b) { return (long)ugcd(labs(a), labs(b)); }
96 :
97 : /* For gcdii(): assume a>b>0, return gcd(a,b) as a GEN */
98 : static GEN
99 540787515 : igcduu(ulong a, ulong b)
100 : {
101 : long v;
102 540787515 : a %= b; if (!a) return utoipos(b);
103 169321121 : v = vals(a|b);
104 169377092 : return utoipos( mygcduodd(a>>v, b>>v) << v );
105 : }
106 :
107 : /*Warning: overflows silently if lcm does not fit*/
108 : ulong
109 3010922 : ulcm(ulong a, ulong b)
110 : {
111 3010922 : ulong d = ugcd(a,b);
112 3010909 : if (!d) return 0;
113 3010909 : return d == 1? a*b: a*(b/d);
114 : }
115 : long
116 36952 : clcm(long a,long b) { return ulcm(labs(a), labs(b)); }
117 :
118 : /********************************************************************/
119 : /** **/
120 : /** INTEGER EXTENDED GCD (AND INVMOD) **/
121 : /** **/
122 : /********************************************************************/
123 : /* Two basic ideas - (1) avoid many integer divisions, especially when the
124 : * quotient is 1 which happens ~ 40% of the time. (2) Use Lehmer's trick as
125 : * modified by Jebelean of extracting a couple of words' worth of leading bits
126 : * from both operands, and compute partial quotients from them as long as we
127 : * can be sure of their values. Jebelean's modifications consist in
128 : * inequalities from which we can quickly decide whether to carry on or to
129 : * return to the outer loop, and in re-shifting after the first word's worth of
130 : * bits has been used up. All of this is described in R. Lercier's thesis
131 : * [pp148-153 & 163f.], except his outer loop isn't quite right: the catch-up
132 : * divisions needed when one partial quotient is larger than a word are missing.
133 : *
134 : * The API consists of invmod() and bezout() below; the single-word routines
135 : * xgcduu and xxgcduu may be called directly if desired; lgcdii() probably
136 : * doesn't make much sense out of context.
137 : *
138 : * The whole lot is a factor 6 .. 8 faster on word-sized operands, and asym-
139 : * ptotically about a factor 2.5 .. 3, depending on processor architecture,
140 : * than the naive continued-division code. Unfortunately, thanks to the
141 : * unrolled loops and all, the code is lengthy. */
142 :
143 : /*==================================
144 : * xgcduu(d,d1,f,v,v1,s)
145 : * xxgcduu(d,d1,f,u,u1,v,v1,s)
146 : * rgcduu(d,d1,vmax,u,u1,v,v1,s)
147 : *==================================*/
148 : /*
149 : * Fast `final' extended gcd algorithm, acting on two ulongs. Ideally this
150 : * should be replaced with assembler versions wherever possible. The present
151 : * code essentially does `subtract, compare, and possibly divide' at each step,
152 : * which is reasonable when hardware division (a) exists, (b) is a bit slowish
153 : * and (c) does not depend a lot on the operand values (as on i486). When
154 : * wordsize division is in fact an assembler routine based on subtraction,
155 : * this strategy may not be the most efficient one.
156 : *
157 : * xxgcduu() should be called with d > d1 > 0, returns gcd(d,d1), and assigns
158 : * the usual signless cont.frac. recurrence matrix to [u, u1; v, v1] (i.e.,
159 : * the product of all the [0, 1; 1 q_j] where the leftmost factor arises from
160 : * the quotient of the first division step), and the information about the
161 : * implied signs to s (-1 when an odd number of divisions has been done,
162 : * 1 otherwise). xgcduu() is exactly the same except that u,u1 are not com-
163 : * puted (and not returned, of course).
164 : *
165 : * The input flag f should be set to 1 if we know in advance that gcd(d,d1)==1
166 : * (so we can stop the chain division one step early: as soon as the remainder
167 : * equals 1). Use this when you intend to use only what would be v, or only
168 : * what would be u and v, after that final division step, but not u1 and v1.
169 : * With the flag in force and thus without that final step, the interesting
170 : * quantity/ies will still sit in [u1 and] v1, of course.
171 : *
172 : * For computing the inverse of a single-word INTMOD known to exist, pass f=1
173 : * to xgcduu(), and obtain the result from s and v1. (The routine does the
174 : * right thing when d1==1 already.) For finishing a multiword modinv known
175 : * to exist, pass f=1 to xxgcduu(), and multiply the returned matrix (with
176 : * properly adjusted signs) onto the values v' and v1' previously obtained
177 : * from the multiword division steps. Actually, just take the scalar product
178 : * of [v',v1'] with [u1,-v1], and change the sign if s==-1. (If the final
179 : * step had been carried out, it would be [-u,v], and s would also change.)
180 : * For reducing a rational number to lowest terms, pass f=0 to xgcduu().
181 : * Finally, f=0 with xxgcduu() is useful for Bezout computations.
182 : * (It is safe for invmod() to call xgcduu() with f=1, because f&1 doesn't
183 : * make a difference when gcd(d,d1)>1. The speedup is negligible.)
184 : *
185 : * In principle, when gcd(d,d1) is known to be 1, it is straightforward to
186 : * recover the final u,u1 given only v,v1 and s. However, it probably isn't
187 : * worthwhile, as it trades a few multiplications for a division.
188 : *
189 : * rgcduu() is a variant of xxgcduu() which does not have f (the effect is
190 : * that of f=0), but instead has a ulong vmax parameter, for use in rational
191 : * reconstruction below. It returns when v1 exceeds vmax; v will never
192 : * exceed vmax. (vmax=0 is taken as a synonym of ULONG_MAX i.e. unlimited,
193 : * in which case rgcduu behaves exactly like xxgcduu with f=0.) The return
194 : * value of rgcduu() is typically meaningless; the interesting part is the
195 : * matrix. */
196 :
197 : ulong
198 505471064 : xgcduu(ulong d, ulong d1, int f, ulong* v, ulong* v1, long *s)
199 : {
200 : ulong xv,xv1, xs, q,res;
201 : LOCAL_HIREMAINDER;
202 :
203 : /* The above blurb contained a lie. The main loop always stops when d1
204 : * has become equal to 1. If (d1 == 1 && !(f&1)) after the loop, we do
205 : * the final `division' of d by 1 `by hand' as it were.
206 : *
207 : * The loop has already been unrolled once. Aggressive optimization could
208 : * well lead to a totally unrolled assembler version.
209 : *
210 : * On modern x86 architectures, this loop is a pig anyway. The division
211 : * instruction always puts its result into the same pair of registers, and
212 : * we always want to use one of them straight away, so pipeline performance
213 : * will suck big time. An assembler version should probably do a first loop
214 : * computing and storing all the quotients -- their number is bounded in
215 : * advance -- and then assembling the matrix in a second pass. On other
216 : * architectures where we can cycle through four or so groups of registers
217 : * and exploit a fast ALU result-to-operand feedback path, this is much less
218 : * of an issue. */
219 505471064 : xs = res = 0;
220 505471064 : xv = 0UL; xv1 = 1UL;
221 3514023635 : while (d1 > 1UL)
222 : {
223 3268245575 : d -= d1; /* no need to use subll */
224 3268245575 : if (d >= d1)
225 : {
226 1826458218 : hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
227 1826458218 : xv += q * xv1;
228 : }
229 : else
230 1441787357 : xv += xv1;
231 3268245575 : if (d <= 1UL) { xs=1; break; } /* possible loop exit */
232 : /* repeat with inverted roles */
233 3008552571 : d1 -= d;
234 3008552571 : if (d1 >= d)
235 : {
236 1721241607 : hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
237 1721241607 : xv1 += q * xv;
238 : }
239 : else
240 1287310964 : xv1 += xv;
241 : }
242 :
243 505471064 : if (!(f&1))
244 : { /* division by 1 postprocessing if needed */
245 5744934 : if (xs && d==1)
246 1779215 : { xv1 += d1 * xv; xs = 0; res = 1UL; }
247 3965719 : else if (!xs && d1==1)
248 1256646 : { xv += d * xv1; xs = 1; res = 1UL; }
249 : }
250 :
251 505471064 : if (xs)
252 : {
253 259133601 : *s = -1; *v = xv1; *v1 = xv;
254 259133601 : return (res ? res : (d==1 ? 1UL : d1));
255 : }
256 : else
257 : {
258 246337463 : *s = 1; *v = xv; *v1 = xv1;
259 246337463 : return (res ? res : (d1==1 ? 1UL : d));
260 : }
261 : }
262 :
263 : ulong
264 126640437 : xxgcduu(ulong d, ulong d1, int f,
265 : ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)
266 : {
267 : ulong xu,xu1, xv,xv1, xs, q,res;
268 : LOCAL_HIREMAINDER;
269 :
270 126640437 : xs = res = 0;
271 126640437 : xu = xv1 = 1UL;
272 126640437 : xu1 = xv = 0UL;
273 270633994 : while (d1 > 1UL)
274 : {
275 : /* no need to use subll */
276 208102479 : d -= d1;
277 208102479 : if (d >= d1)
278 : {
279 133034948 : hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
280 133034948 : xv += q * xv1;
281 133034948 : xu += q * xu1;
282 : }
283 : else
284 75067531 : { xv += xv1; xu += xu1; }
285 208102479 : if (d <= 1UL) { xs=1; break; } /* possible loop exit */
286 : /* repeat with inverted roles */
287 143993557 : d1 -= d;
288 143993557 : if (d1 >= d)
289 : {
290 79367064 : hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
291 79367064 : xv1 += q * xv;
292 79367064 : xu1 += q * xu;
293 : }
294 : else
295 64626493 : { xv1 += xv; xu1 += xu; }
296 : }
297 :
298 126640437 : if (!(f&1))
299 : { /* division by 1 postprocessing if needed */
300 121425678 : if (xs && d==1)
301 : {
302 27325652 : xv1 += d1 * xv;
303 27325652 : xu1 += d1 * xu;
304 27325652 : xs = 0; res = 1UL;
305 : }
306 94100026 : else if (!xs && d1==1)
307 : {
308 52810331 : xv += d * xv1;
309 52810331 : xu += d * xu1;
310 52810331 : xs = 1; res = 1UL;
311 : }
312 : }
313 :
314 126640437 : if (xs)
315 : {
316 89595938 : *s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
317 89595938 : return (res ? res : (d==1 ? 1UL : d1));
318 : }
319 : else
320 : {
321 37044499 : *s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
322 37044499 : return (res ? res : (d1==1 ? 1UL : d));
323 : }
324 : }
325 :
326 : ulong
327 41878 : rgcduu(ulong d, ulong d1, ulong vmax,
328 : ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)
329 : {
330 41878 : ulong xu,xu1, xv,xv1, xs, q, res=0;
331 41878 : int f = 0;
332 : LOCAL_HIREMAINDER;
333 :
334 41878 : if (vmax == 0) vmax = ULONG_MAX;
335 41878 : xs = res = 0;
336 41878 : xu = xv1 = 1UL;
337 41878 : xu1 = xv = 0UL;
338 83800 : while (d1 > 1UL)
339 : {
340 82934 : d -= d1; /* no need to use subll */
341 82934 : if (d >= d1)
342 : {
343 38554 : hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
344 38554 : xv += q * xv1;
345 38554 : xu += q * xu1;
346 : }
347 : else
348 44380 : { xv += xv1; xu += xu1; }
349 : /* possible loop exit */
350 82934 : if (xv > vmax) { f=xs=1; break; }
351 69729 : if (d <= 1UL) { xs=1; break; }
352 : /* repeat with inverted roles */
353 62767 : d1 -= d;
354 62767 : if (d1 >= d)
355 : {
356 39740 : hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
357 39740 : xv1 += q * xv;
358 39740 : xu1 += q * xu;
359 : }
360 : else
361 23027 : { xv1 += xv; xu1 += xu; }
362 : /* possible loop exit */
363 62767 : if (xv1 > vmax) { f=1; break; }
364 : }
365 :
366 41878 : if (!(f&1))
367 : { /* division by 1 postprocessing if needed */
368 7828 : if (xs && d==1)
369 : {
370 6962 : xv1 += d1 * xv;
371 6962 : xu1 += d1 * xu;
372 6962 : xs = 0; res = 1UL;
373 : }
374 866 : else if (!xs && d1==1)
375 : {
376 866 : xv += d * xv1;
377 866 : xu += d * xu1;
378 866 : xs = 1; res = 1UL;
379 : }
380 : }
381 :
382 41878 : if (xs)
383 : {
384 14071 : *s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
385 14071 : return (res ? res : (d==1 ? 1UL : d1));
386 : }
387 : else
388 : {
389 27807 : *s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
390 27807 : return (res ? res : (d1==1 ? 1UL : d));
391 : }
392 : }
393 :
394 : /*==================================
395 : * cbezout(a,b,uu,vv)
396 : *==================================
397 : * Same as bezout() but for C longs.
398 : * Return g = gcd(a,b) >= 0, and assign longs u,v through pointers uu,vv
399 : * such that g = u*a + v*b.
400 : * Special cases:
401 : * a = b = 0 ==> pick u=1, v=0 (and return 1, surprisingly)
402 : * a != 0 = b ==> keep v=0
403 : * a = 0 != b ==> keep u=0
404 : * |a| = |b| != 0 ==> keep u=0, set v=+-1
405 : * Assignments through uu,vv happen unconditionally. */
406 : long
407 667160 : cbezout(long a,long b,long *uu,long *vv)
408 : {
409 : long s,*t;
410 667160 : ulong d = labs(a), d1 = labs(b);
411 : ulong r,u,u1,v,v1;
412 :
413 667160 : if (!b)
414 : {
415 2688 : *vv=0L;
416 2688 : if (!a) { *uu=1L; return 0L; }
417 2688 : *uu = a < 0 ? -1L : 1L;
418 2688 : return (long)d;
419 : }
420 664472 : else if (!a || (d == d1))
421 : {
422 56756 : *uu = 0L; *vv = b < 0 ? -1L : 1L;
423 56756 : return (long)d1;
424 : }
425 607716 : else if (d == 1) /* frequently used by nfinit */
426 : {
427 356391 : *uu = a; *vv = 0L;
428 356391 : return 1L;
429 : }
430 251325 : else if (d < d1)
431 : {
432 : /* bug in gcc-2.95.3:
433 : * s = a; a = b; b = s; produces wrong result a = b. This is OK: */
434 170737 : { long _x = a; a = b; b = _x; } /* in order to keep the right signs */
435 170737 : r = d; d = d1; d1 = r;
436 170737 : t = uu; uu = vv; vv = t;
437 : }
438 : /* d > d1 > 0 */
439 251325 : r = xxgcduu(d, d1, 0, &u, &u1, &v, &v1, &s);
440 251329 : if (s < 0)
441 : {
442 93821 : *uu = a < 0 ? (long)u : -(long)u;
443 93821 : *vv = b < 0 ? -(long)v : (long)v;
444 : }
445 : else
446 : {
447 157508 : *uu = a < 0 ? -(long)u : (long)u;
448 157508 : *vv = b < 0 ? (long)v : -(long)v;
449 : }
450 251329 : return (long)r;
451 : }
452 :
453 : /*==================================
454 : * lgcdii(d,d1,u,u1,v,v1,vmax)
455 : *==================================*/
456 : /* Lehmer's partial extended gcd algorithm, acting on two t_INT GENs.
457 : *
458 : * Tries to determine, using the leading 2*BITS_IN_LONG significant bits of d
459 : * and a quantity of bits from d1 obtained by a shift of the same displacement,
460 : * as many partial quotients of d/d1 as possible, and assigns to [u,u1;v,v1]
461 : * the product of all the [0,1; 1,qj] thus obtained, where the leftmost
462 : * factor arises from the quotient of the first division step.
463 : *
464 : * For use in rational reconstruction, vmax can be given a nonzero value.
465 : * In this case, we will return early as soon as v1 > vmax (i.e. v <= vmax)
466 : *
467 : * MUST be called with d > d1 > 0, and with d occupying more than one
468 : * significant word. Returns the number of reduction/swap steps carried out,
469 : * possibly zero, or under certain conditions minus that number. When the
470 : * return value is nonzero, the caller should use the returned recurrence
471 : * matrix to update its own copies of d,d1. When the return value is
472 : * nonpositive, and the latest remainder after updating turns out to be
473 : * nonzero, the caller should at once attempt a full division, rather than
474 : * trying lgcdii() again -- this typically happens when we are about to
475 : * encounter a quotient larger than half a word. (This is not detected
476 : * infallibly -- after a positive return value, it is possible that the next
477 : * stage will end up needing a full division. After a negative return value,
478 : * however, this is certain, and should be acted upon.)
479 : *
480 : * The sign information, for which xgcduu() has its return argument s, is now
481 : * implicit in the LSB of our return value, and the caller may take advantage
482 : * of the fact that a return value of +-1 implies u==0,u1==v==1 [only v1 pro-
483 : * vides interesting information in this case]. One might also use the fact
484 : * that if the return value is +-2, then u==1, but this is rather marginal.
485 : *
486 : * If it was not possible to determine even the first quotient, either because
487 : * we're too close to an integer quotient or because the quotient would be
488 : * larger than one word (if the `leading digit' of d1 after shifting is all
489 : * zeros), we return 0 and do not assign anything to the last four args.
490 : *
491 : * The division chain might even run to completion. It is up to the caller to
492 : * detect this case. This routine does not change d or d1; this is also up to
493 : * the caller */
494 : int
495 29147834 : lgcdii(ulong* d, ulong* d1, ulong* u, ulong* u1, ulong* v, ulong* v1,
496 : ulong vmax)
497 : {
498 : /* Strategy: (1) Extract/shift most significant bits. We assume that d
499 : * has at least two significant words, but we can cope with a one-word d1.
500 : * Let dd,dd1 be the most significant dividend word and matching part of the
501 : * divisor.
502 : * (2) Check for overflow on the first division. For our purposes, this
503 : * happens when the upper half of dd1 is zero. (Actually this is detected
504 : * during extraction.)
505 : * (3) Get a fix on the first quotient. We compute q = floor(dd/dd1), which
506 : * is an upper bound for floor(d/d1), and which gives the true value of the
507 : * latter if (and-almost-only-if) the remainder dd' = dd-q*dd1 is >= q.
508 : * (If it isn't, we give up. This is annoying because the subsequent full
509 : * division will repeat some work already done, but it happens infrequently.
510 : * Doing the extra-bit-fetch in this case would be awkward.)
511 : * (4) Finish initializations.
512 : *
513 : * The remainder of the action is comparatively boring... The main loop has
514 : * been unrolled once (so we don't swap things and we can apply Jebelean's
515 : * termination conditions which alternatingly take two different forms during
516 : * successive iterations). When we first run out of sufficient bits to form
517 : * a quotient, and have an extra word of each operand, we pull out two whole
518 : * word's worth of dividend bits, and divisor bits of matching significance;
519 : * to these we apply our partial matrix (disregarding overflow because the
520 : * result mod 2^(2*BITS_IN_LONG) will in fact give the correct values), and
521 : * re-extract one word's worth of the current dividend and a matching amount
522 : * of divisor bits. The affair will normally terminate with matrix entries
523 : * just short of a whole word. (We terminate the inner loop before these can
524 : * possibly overflow.) */
525 : ulong dd,dd1,ddlo,dd1lo, sh,shc; /* `digits', shift count */
526 : ulong xu,xu1, xv,xv1, q,res; /* recurrences, partial quotient, count */
527 : ulong tmp0,tmp1,tmp2,tmpd,tmpu,tmpv; /* temps */
528 : ulong dm1, d1m1;
529 : long ld, ld1, lz;
530 29147834 : int skip = 0;
531 : LOCAL_OVERFLOW;
532 : LOCAL_HIREMAINDER;
533 :
534 : /* following is just for convenience: vmax==0 means no bound */
535 29147834 : if (vmax == 0) vmax = ULONG_MAX;
536 29147834 : ld = lgefint(d); ld1 = lgefint(d1); lz = ld - ld1; /* >= 0 */
537 29147834 : if (lz > 1) return 0; /* rare */
538 :
539 28715703 : d = int_MSW(d); dm1 = *int_precW(d);
540 28715703 : d1 = int_MSW(d1);d1m1 = *int_precW(d1);
541 28715703 : dd1lo = 0; /* unless we find something better */
542 28715703 : sh = bfffo(*d);
543 :
544 28715703 : if (sh)
545 : { /* do the shifting */
546 28171261 : shc = BITS_IN_LONG - sh;
547 28171261 : if (lz)
548 : { /* dividend longer than divisor */
549 4313510 : dd1 = (*d1 >> shc);
550 4313510 : if (!(HIGHMASK & dd1)) return 0; /* overflow detected */
551 3725699 : if (ld1 > 3)
552 866128 : dd1lo = (*d1 << sh) + (d1m1 >> shc);
553 : else
554 2859571 : dd1lo = (*d1 << sh);
555 : }
556 : else
557 : { /* dividend and divisor have the same length */
558 23857751 : dd1 = (*d1 << sh);
559 23857751 : if (!(HIGHMASK & dd1)) return 0;
560 23531902 : if (ld1 > 3)
561 : {
562 23532863 : dd1 += (d1m1 >> shc);
563 23532863 : if (ld1 > 4)
564 20552398 : dd1lo = (d1m1 << sh) + (*int_precW(int_precW(d1)) >> shc);
565 : else
566 2980465 : dd1lo = (d1m1 << sh);
567 : }
568 : }
569 : /* following lines assume d to have 2 or more significant words */
570 27257601 : dd = (*d << sh) + (dm1 >> shc);
571 27257601 : if (ld > 4)
572 21418590 : ddlo = (dm1 << sh) + (*int_precW(int_precW(d)) >> shc);
573 : else
574 5839011 : ddlo = (dm1 << sh);
575 : }
576 : else
577 : { /* no shift needed */
578 544442 : if (lz) return 0; /* dividend longer than divisor: overflow */
579 540227 : dd1 = *d1;
580 540227 : if (!(HIGHMASK & dd1)) return 0;
581 537017 : if(ld1 > 3) dd1lo = d1m1;
582 : /* assume again that d has another significant word */
583 537017 : dd = *d; ddlo = dm1;
584 : }
585 : /* First subtraction/division stage. (If a subtraction initially suffices,
586 : * we don't divide at all.) If a Jebelean condition is violated, and we
587 : * can't fix it even by looking at the low-order bits in ddlo,dd1lo, we
588 : * give up and ask for a full division. Otherwise we commit the result,
589 : * possibly deciding to re-shift immediately afterwards. */
590 27794618 : dd -= dd1;
591 27794618 : if (dd < dd1)
592 : { /* first quotient known to be == 1 */
593 6279553 : xv1 = 1UL;
594 6279553 : if (!dd) /* !(Jebelean condition), extraspecial case */
595 : { /* This actually happens. Now q==1 is known, but we underflow already.
596 : * OTOH we've just shortened d by a whole word. Thus we are happy and
597 : * return. */
598 406639 : *u = 0; *v = *u1 = *v1 = 1UL;
599 406639 : return -1; /* Next step will be a full division. */
600 : }
601 : }
602 : else
603 : { /* division indicated */
604 21515065 : hiremainder = 0;
605 21515065 : xv1 = 1 + divll(dd, dd1); /* xv1: alternative spelling of `q', here ;) */
606 21515065 : dd = hiremainder;
607 21515065 : if (dd < xv1) /* !(Jebelean cond'), nonextra special case */
608 : { /* Attempt to complete the division using the less significant bits,
609 : * before skipping right past the 1st loop to the reshift stage. */
610 1030737 : ddlo = subll(ddlo, mulll(xv1, dd1lo));
611 1030737 : dd = subllx(dd, hiremainder);
612 :
613 : /* If we now have an overflow, q was too large. Thanks to our decision
614 : * not to get here unless the original dd1 had bits set in the upper half
615 : * of the word, we now know that the correct quotient is in fact q-1. */
616 1030737 : if (overflow)
617 : {
618 23902 : xv1--;
619 23902 : ddlo = addll(ddlo,dd1lo);
620 23902 : dd = addllx(dd,dd1); /* overflows again which cancels the borrow */
621 : /* ...and fall through to skip=1 below */
622 : }
623 : else
624 : /* Test Jebelean condition anew, at this point using _all_ the extracted
625 : * bits we have. This is clutching at straws; we have a more or less
626 : * even chance of succeeding this time. Note that if we fail, we really
627 : * do not know whether the correct quotient would have been q or some
628 : * smaller value. */
629 1006835 : if (!dd && ddlo < xv1) return 0;
630 :
631 : /* Otherwise, we now know that q is correct, but we cannot go into the
632 : * 1st loop. Raise a flag so we'll remember to skip past the loop.
633 : * Get here also after the q-1 adjustment case. */
634 50739 : skip = 1;
635 : } /* if !(Jebelean) then */
636 : }
637 26407981 : res = 1;
638 26407981 : if (xv1 > vmax)
639 : { /* gone past the bound already */
640 746 : *u = 0UL; *u1 = 1UL; *v = 1UL; *v1 = xv1;
641 746 : return res;
642 : }
643 26407235 : xu = 0UL; xv = xu1 = 1UL;
644 :
645 : /* Some invariants from here across the first loop:
646 : *
647 : * At this point, and again after we are finished with the first loop and
648 : * subsequent conditional, a division and the attached update of the
649 : * recurrence matrix have just been carried out completely. The matrix
650 : * xu,xu1;xv,xv1 has been initialized (or updated, possibly with permuted
651 : * columns), and the current remainder == next divisor (dd at the moment)
652 : * is nonzero (it might be zero here, but then skip will have been set).
653 : *
654 : * After the first loop, or when skip is set already, it will also be the
655 : * case that there aren't sufficiently many bits to continue without re-
656 : * shifting. If the divisor after reshifting is zero, or indeed if it
657 : * doesn't have more than half a word's worth of bits, we will have to
658 : * return at that point. Otherwise, we proceed into the second loop.
659 : *
660 : * Furthermore, when we reach the re-shift stage, dd:ddlo and dd1:dd1lo will
661 : * already reflect the result of applying the current matrix to the old
662 : * ddorig:ddlo and dd1orig:dd1lo. (For the first iteration above, this
663 : * was easy to achieve, and we didn't even need to peek into the (now
664 : * no longer existent!) saved words. After the loop, we'll stop for a
665 : * moment to merge in the ddlo,dd1lo contributions.)
666 : *
667 : * Note that after the 1st division, even an a priori quotient of 1 cannot be
668 : * trusted until we've checked Jebelean's condition: it might be too small */
669 26407235 : if (!skip)
670 : {
671 : for(;;)
672 : { /* First half of loop divides dd into dd1, and leaves the recurrence
673 : * matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer
674 : * entries) when successful. */
675 219764804 : tmpd = dd1 - dd;
676 219764804 : if (tmpd < dd)
677 : { /* quotient suspected to be 1 */
678 92955165 : tmpu = xu + xu1; /* cannot overflow -- everything bounded by
679 : * the original dd during first loop */
680 92955165 : tmpv = xv + xv1;
681 : }
682 : else
683 : { /* division indicated */
684 126809639 : hiremainder = 0;
685 126809639 : q = 1 + divll(tmpd, dd);
686 126809639 : tmpd = hiremainder;
687 126809639 : tmpu = xu + q*xu1; /* can't overflow, but may need to be undone */
688 126809639 : tmpv = xv + q*xv1;
689 : }
690 :
691 219764804 : tmp0 = addll(tmpv, xv1);
692 219764804 : if ((tmpd < tmpu) || overflow ||
693 214918948 : (dd - tmpd < tmp0)) /* !(Jebelean cond.) */
694 : break; /* skip ahead to reshift stage */
695 : else
696 : { /* commit dd1, xu, xv */
697 206553971 : res++;
698 206553971 : dd1 = tmpd; xu = tmpu; xv = tmpv;
699 206553971 : if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }
700 : }
701 :
702 : /* Second half of loop divides dd1 into dd, and the matrix returns to its
703 : * normal arrangement. */
704 206552077 : tmpd = dd - dd1;
705 206552077 : if (tmpd < dd1)
706 : { /* quotient suspected to be 1 */
707 82615500 : tmpu = xu1 + xu; /* cannot overflow */
708 82615500 : tmpv = xv1 + xv;
709 : }
710 : else
711 : { /* division indicated */
712 123936577 : hiremainder = 0;
713 123936577 : q = 1 + divll(tmpd, dd1);
714 123936577 : tmpd = hiremainder;
715 123936577 : tmpu = xu1 + q*xu;
716 123936577 : tmpv = xv1 + q*xv;
717 : }
718 :
719 206552077 : tmp0 = addll(tmpu, xu);
720 206552077 : if ((tmpd < tmpv) || overflow ||
721 194774279 : (dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */
722 : break; /* skip ahead to reshift stage */
723 : else
724 : { /* commit dd, xu1, xv1 */
725 193399524 : res++;
726 193399524 : dd = tmpd; xu1 = tmpu; xv1 = tmpv;
727 193399524 : if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }
728 : }
729 : } /* end of first loop */
730 :
731 : /* Intermezzo: update dd:ddlo, dd1:dd1lo. (But not if skip is set.) */
732 26363386 : if (res&1)
733 : { /* after failed division in 1st half of loop:
734 : * [dd1:dd1lo,dd:ddlo] = [ddorig:ddlo,dd1orig:dd1lo]
735 : * * [ -xu, xu1 ; xv, -xv1 ]
736 : * Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the
737 : * high-order remainders + overflows onto [dd1,dd] */
738 13205421 : tmp1 = mulll(ddlo, xu); tmp0 = hiremainder;
739 13205421 : tmp1 = subll(mulll(dd1lo,xv), tmp1);
740 13205421 : dd1 += subllx(hiremainder, tmp0);
741 13205421 : tmp2 = mulll(ddlo, xu1); tmp0 = hiremainder;
742 13205421 : ddlo = subll(tmp2, mulll(dd1lo,xv1));
743 13205421 : dd += subllx(tmp0, hiremainder);
744 13205421 : dd1lo = tmp1;
745 : }
746 : else
747 : { /* after failed division in 2nd half of loop:
748 : * [dd:ddlo,dd1:dd1lo] = [ddorig:ddlo,dd1orig:dd1lo]
749 : * * [ xu1, -xu ; -xv1, xv ]
750 : * Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the
751 : * high-order remainders + overflows onto [dd,dd1] */
752 13157965 : tmp1 = mulll(ddlo, xu1); tmp0 = hiremainder;
753 13157965 : tmp1 = subll(tmp1, mulll(dd1lo,xv1));
754 13157965 : dd += subllx(tmp0, hiremainder);
755 13157965 : tmp2 = mulll(ddlo, xu); tmp0 = hiremainder;
756 13157965 : dd1lo = subll(mulll(dd1lo,xv), tmp2);
757 13157965 : dd1 += subllx(hiremainder, tmp0);
758 13157965 : ddlo = tmp1;
759 : }
760 : } /* end of skip-pable section: get here also, with res==1, when there
761 : * was a problem immediately after the very first division. */
762 :
763 : /* Re-shift. Note: the shift count _can_ be zero, viz. under the following
764 : * precise conditions: The original dd1 had its topmost bit set, so the 1st
765 : * q was 1, and after subtraction, dd had its topmost bit unset. If now dd=0,
766 : * we'd have taken the return exit already, so we couldn't have got here.
767 : * If not, then it must have been the second division which has gone amiss
768 : * (because dd1 was very close to an exact multiple of the remainder dd value,
769 : * so this will be very rare). At this point, we'd have a fairly slim chance
770 : * of fixing things by re-examining dd1:dd1lo vs. dd:ddlo, but this is not
771 : * guaranteed to work. Instead of trying, we return at once and let caller
772 : * see to it that the initial subtraction is re-done usingall the bits of
773 : * both operands, which already helps, and the next round will either be a
774 : * full division (if dd occupied a halfword or less), or another llgcdii()
775 : * first step. In the latter case, since we try a little harder during our
776 : * first step, we may actually be able to fix the problem, and get here again
777 : * with improved low-order bits and with another step under our belt.
778 : * Otherwise we'll have given up above and forced a full division.
779 : *
780 : * If res is even, the shift count _cannot_ be zero. (The first step forces
781 : * a zero into the remainder's MSB, and all subsequent remainders will have
782 : * inherited it.)
783 : *
784 : * The re-shift stage exits if the next divisor has at most half a word's
785 : * worth of bits.
786 : *
787 : * For didactic reasons, the second loop will be arranged in the same way
788 : * as the first -- beginning with the division of dd into dd1, as if res
789 : * was odd. To cater for this, if res is actually even, we swap things
790 : * around during reshifting. (During the second loop, the parity of res
791 : * does not matter; we know in which half of the loop we are when we decide
792 : * to return.) */
793 26404297 : if (res&1)
794 : { /* after odd number of division(s) */
795 13256373 : if (dd1 && (sh = bfffo(dd1)))
796 : {
797 13100743 : shc = BITS_IN_LONG - sh;
798 13100743 : dd = (ddlo >> shc) + (dd << sh);
799 13100743 : if (!(HIGHMASK & dd))
800 : {
801 46003 : *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
802 46003 : return -res; /* full division asked for */
803 : }
804 13054740 : dd1 = (dd1lo >> shc) + (dd1 << sh);
805 : }
806 : else
807 : { /* time to return: <= 1 word left, or sh==0 */
808 155630 : *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
809 155630 : return res;
810 : }
811 : }
812 : else
813 : { /* after even number of divisions */
814 13147924 : if (dd)
815 : {
816 13156118 : sh = bfffo(dd); /* > 0 */
817 13156118 : shc = BITS_IN_LONG - sh;
818 : /* dd:ddlo will become the new dd1, and v.v. */
819 13156118 : tmpd = (ddlo >> shc) + (dd << sh);
820 13156118 : dd = (dd1lo >> shc) + (dd1 << sh);
821 13156118 : dd1 = tmpd;
822 : /* This completes the swap; now dd is again the current divisor */
823 13156118 : if (HIGHMASK & dd)
824 : { /* recurrence matrix is the wrong way round; swap it */
825 12996448 : tmp0 = xu; xu = xu1; xu1 = tmp0;
826 12996448 : tmp0 = xv; xv = xv1; xv1 = tmp0;
827 : }
828 : else
829 : { /* recurrence matrix is the wrong way round; fix this */
830 159670 : *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
831 159670 : return -res; /* full division asked for */
832 : }
833 : }
834 : else
835 : { /* time to return: <= 1 word left */
836 0 : *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
837 0 : return res;
838 : }
839 : } /* end reshift */
840 :
841 : /* The Second Loop. Rip-off of the first, but we now check for overflow
842 : * in the recurrences. Returns instead of breaking when we cannot fix the
843 : * quotient any longer. */
844 : for(;;)
845 : { /* First half of loop divides dd into dd1, and leaves the recurrence
846 : * matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer
847 : * entries) when successful */
848 123847972 : tmpd = dd1 - dd;
849 123847972 : if (tmpd < dd)
850 : { /* quotient suspected to be 1 */
851 42842352 : tmpu = xu + xu1;
852 42842352 : tmpv = addll(xv, xv1); /* xv,xv1 will overflow first */
853 42842352 : tmp1 = overflow;
854 : }
855 : else
856 : { /* division indicated */
857 81005620 : hiremainder = 0;
858 81005620 : q = 1 + divll(tmpd, dd);
859 81005620 : tmpd = hiremainder;
860 81005620 : tmpu = xu + q*xu1;
861 81005620 : tmpv = addll(xv, mulll(q,xv1));
862 81005620 : tmp1 = overflow | hiremainder;
863 : }
864 :
865 123847972 : tmp0 = addll(tmpv, xv1);
866 123847972 : if ((tmpd < tmpu) || overflow || tmp1 ||
867 118618375 : (dd - tmpd < tmp0)) /* !(Jebelean cond.) */
868 : { /* The recurrence matrix has not yet been warped... */
869 13220657 : *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
870 13220657 : break;
871 : }
872 : /* commit dd1, xu, xv */
873 110627315 : res++;
874 110627315 : dd1 = tmpd; xu = tmpu; xv = tmpv;
875 110627315 : if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }
876 :
877 : /* Second half of loop divides dd1 into dd, and the matrix returns to its
878 : * normal arrangement */
879 110598988 : tmpd = dd - dd1;
880 110598988 : if (tmpd < dd1)
881 : { /* quotient suspected to be 1 */
882 46686776 : tmpu = xu1 + xu;
883 46686776 : tmpv = addll(xv1, xv);
884 46686776 : tmp1 = overflow;
885 : }
886 : else
887 : { /* division indicated */
888 63912212 : hiremainder = 0;
889 63912212 : q = 1 + divll(tmpd, dd1);
890 63912212 : tmpd = hiremainder;
891 63912212 : tmpu = xu1 + q*xu;
892 63912212 : tmpv = addll(xv1, mulll(q, xv));
893 63912212 : tmp1 = overflow | hiremainder;
894 : }
895 :
896 110598988 : tmp0 = addll(tmpu, xu);
897 110598988 : if ((tmpd < tmpv) || overflow || tmp1 ||
898 99177206 : (dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */
899 : { /* The recurrence matrix has not yet been unwarped, so it is
900 : * the wrong way round; fix this. */
901 12777272 : *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
902 12777272 : break;
903 : }
904 :
905 97821716 : res++; /* commit dd, xu1, xv1 */
906 97821716 : dd = tmpd; xu1 = tmpu; xv1 = tmpv;
907 97821716 : if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }
908 : } /* end of second loop */
909 :
910 25997929 : return res;
911 : }
912 :
913 : /* 1 / Mod(x,p). Assume x < p */
914 : ulong
915 469418369 : Fl_invsafe(ulong x, ulong p)
916 : {
917 : long s;
918 469418369 : ulong xv, xv1, g = xgcduu(p, x, 1, &xv, &xv1, &s);
919 471308114 : if (g != 1UL) return 0UL;
920 470840781 : xv = xv1 % p; if (s < 0) xv = p - xv;
921 470840781 : return xv;
922 : }
923 :
924 : /* 1 / Mod(x,p). Assume x < p */
925 : ulong
926 456628958 : Fl_inv(ulong x, ulong p)
927 : {
928 456628958 : ulong xv = Fl_invsafe(x, p);
929 458086204 : if (!xv && p!=1UL) pari_err_INV("Fl_inv", mkintmod(utoi(x), utoi(p)));
930 457982852 : return xv;
931 : }
|