Skip to content

Commit f7bf043

Browse files
committed
select the best variants so far, drop the rest
1 parent 83da95a commit f7bf043

3 files changed

Lines changed: 12 additions & 292 deletions

File tree

src/main/java/de/tilman_neumann/jml/base/Uint128.java

Lines changed: 5 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -55,40 +55,7 @@ public long getLow() {
5555
* @param b
5656
* @return this + b
5757
*/
58-
public Uint128 add_v1(Uint128 b) {
59-
// We know for sure that low overflows if both low and b_lo are 64 bit. If only one of the input 'low's
60-
// is 64 bit, then we can recognize an overflow if r_lo is not 64 bit.
61-
final long b_lo = b.getLow();
62-
final long b_hi = b.getHigh();
63-
final long r_lo = low + b_lo;
64-
long r_hi = high + b_hi;
65-
if ((low<0 && b_lo<0) || ((low<0 || b_lo<0) && (r_lo >= 0))) r_hi++;
66-
return new Uint128(r_hi, r_lo);
67-
}
68-
69-
/**
70-
* Add two 128 bit integers.
71-
*
72-
* Simpler carry recognition thanks to Ben Buhrow,
73-
* see https://www.mersenneforum.org/showpost.php?p=524300&postcount=173.
74-
*
75-
* @param b
76-
* @return this + b
77-
*/
78-
public Uint128 add/*_v2*/(Uint128 b) {
79-
long r_lo = low + b.getLow();
80-
long r_hi = high + b.getHigh();
81-
if (r_lo+Long.MIN_VALUE < low+Long.MIN_VALUE) r_hi++;
82-
return new Uint128(r_hi, r_lo);
83-
}
84-
85-
/**
86-
* Add two 128 bit integers, AI-generated version.
87-
*
88-
* @param b
89-
* @return this + b
90-
*/
91-
public Uint128 add_v3(Uint128 b) {
58+
public Uint128 add(Uint128 b) {
9259
long r_lo = low + b.getLow();
9360
long carry = Long.compareUnsigned(r_lo, low) < 0 ? 1 : 0;
9461
long r_hi = high + b.getHigh() + carry;
@@ -102,8 +69,8 @@ public Uint128 add_v3(Uint128 b) {
10269
*/
10370
public long add_getHigh(Uint128 b) {
10471
long r_lo = low + b.getLow();
105-
long r_hi = high + b.getHigh();
106-
return r_lo+Long.MIN_VALUE < low+Long.MIN_VALUE ? r_hi + 1 : r_hi;
72+
long carry = Long.compareUnsigned(r_lo, low) < 0 ? 1 : 0;
73+
return high + b.getHigh() + carry;
10774
}
10875

10976
/**
@@ -112,21 +79,7 @@ public long add_getHigh(Uint128 b) {
11279
* @param b
11380
* @return this - b, may be negative
11481
*/
115-
public Uint128 subtract(Uint128 b) {
116-
long b_lo = b.getLow();
117-
long r_lo = low - b_lo;
118-
long r_hi = high - b.getHigh();
119-
if (Long.compareUnsigned(low, b_lo) < 0) --r_hi;
120-
return new Uint128(r_hi, r_lo);
121-
}
122-
123-
/**
124-
* Subtract two 128 bit integers. AI-generated version.
125-
*
126-
* @param b
127-
* @return this - b, may be negative
128-
*/
129-
public Uint128 subtract_v2(Uint128 b) {
82+
public Uint128 subtract(Uint128 b) {
13083
long b_lo = b.getLow();
13184
long r_lo = low - b_lo;
13285
long borrow = Long.compareUnsigned(low, b_lo) < 0 ? 1 : 0;
@@ -158,46 +111,14 @@ public static Uint128 mul63(long a, long b) {
158111
return new Uint128(r_hi, r_lo);
159112
}
160113

161-
/**
162-
* Multiplication of unsigned 64 bit integers,
163-
* following https://stackoverflow.com/questions/18859207/high-bits-of-long-multiplication-in-java.
164-
*
165-
* This method takes notice of overflows of the "middle term".
166-
* As such it works for 64 bit inputs but is slightly slower than mul63().
167-
*
168-
* @param a unsigned long
169-
* @param b unsigned long
170-
* @return a*b
171-
*/
172-
public static Uint128 mul64_v1(long a, long b) {
173-
final long a_hi = a >>> 32;
174-
final long b_hi = b >>> 32;
175-
final long a_lo = a & 0xFFFFFFFFL;
176-
final long b_lo = b & 0xFFFFFFFFL;
177-
178-
final long lo_prod = a_lo * b_lo;
179-
final long med_prod1 = a_hi * b_lo;
180-
final long med_prod2 = a_lo * b_hi;
181-
final long med_term = med_prod1 + med_prod2;
182-
final long hi_prod = a_hi * b_hi;
183-
184-
// the medium term could overflow
185-
long r_hi = (((lo_prod >>> 32) + med_term) >>> 32) + hi_prod;
186-
if ((med_prod1<0 && med_prod2<0) || ((med_prod1<0 || med_prod2<0) && med_term>=0)) r_hi += 1L<<32;
187-
final long r_lo = ((med_term & 0xFFFFFFFFL) << 32) + lo_prod;
188-
return new Uint128(r_hi, r_lo);
189-
}
190-
191114
/**
192115
* Multiplication of unsigned 64 bit integers with simplified carry recognition.
193116
*
194-
* Faster than v1 except for N>=52 bit in PollardRhoBrentMontgomery64 (strange)
195-
*
196117
* @param a unsigned long
197118
* @param b unsigned long
198119
* @return a*b
199120
*/
200-
public static Uint128 mul64/*_v2*/(long a, long b) {
121+
public static Uint128 mul64(long a, long b) {
201122
final long a_hi = a >>> 32;
202123
final long b_hi = b >>> 32;
203124
final long a_lo = a & 0xFFFFFFFFL;
@@ -216,38 +137,6 @@ public static Uint128 mul64_v1(long a, long b) {
216137

217138
return new Uint128(r_hi, r_lo);
218139
}
219-
220-
/**
221-
* Multiplication of unsigned 64 bit integers.
222-
*
223-
* <strong>Experimental version</strong>, pretty slow when used in TinyEcm.
224-
*
225-
* @param a unsigned long
226-
* @param b unsigned long
227-
* @return a*b
228-
*/
229-
public static Uint128 mul64_v3(long a, long b) { // derived from mul64Signed()
230-
final long a_hi = a >> 32;
231-
final long a_lo = a & 0xFFFFFFFFL;
232-
final long b_hi = b >> 32;
233-
final long b_lo = b & 0xFFFFFFFFL;
234-
235-
// use b_lo twice as first argument hoping that this optimizes register usage
236-
final long w0 = b_lo * a_lo;
237-
final long t = b_lo * a_hi + (w0 >>> 32);
238-
// same with t
239-
final long w2 = t >> 32;
240-
final long w1 = (t & 0xFFFFFFFFL) + a_lo * b_hi;
241-
242-
long r_hi = a_hi * b_hi + w2 + (w1 >> 32);
243-
// so far we computed the signed solution; now make it unsigned
244-
if (a<0) r_hi += b;
245-
if (b<0) r_hi += a;
246-
247-
final long r_lo = a * b;
248-
249-
return new Uint128(r_hi, r_lo);
250-
}
251140

252141
/**
253142
* Multiplication of two unsigned 64-bit integers using Math.multiplyHigh().

src/test/java/de/tilman_neumann/jml/base/Uint128PerformanceTest.java

Lines changed: 4 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ private static void testPerformance() {
9797
t0 = System.currentTimeMillis();
9898
for (int i=0; i<NCOUNT_ADD; i++) {
9999
for (int j=0; j<NCOUNT_ADD; j++) {
100-
a128_arr[i].add_v1(a128_arr[j]);
100+
a128_arr[i].add(a128_arr[j]);
101101
}
102102
}
103103
t1 = System.currentTimeMillis();
@@ -107,41 +107,7 @@ private static void testPerformance() {
107107
allDurations[r - WARMUPS] = duration;
108108
}
109109
}
110-
LOG.info("add_v1 took " + totalDuration + "ms " + Arrays.toString(allDurations));
111-
112-
totalDuration = 0;
113-
for (int r=0; r<WARMUPS+REPEATS; r++) {
114-
t0 = System.currentTimeMillis();
115-
for (int i=0; i<NCOUNT_ADD; i++) {
116-
for (int j=0; j<NCOUNT_ADD; j++) {
117-
a128_arr[i].add/*_v2*/(a128_arr[j]);
118-
}
119-
}
120-
t1 = System.currentTimeMillis();
121-
duration = t1-t0;
122-
if (r >= WARMUPS) {
123-
totalDuration += duration;
124-
allDurations[r - WARMUPS] = duration;
125-
}
126-
}
127-
LOG.info("add_v2 took " + totalDuration + "ms " + Arrays.toString(allDurations));
128-
129-
totalDuration = 0;
130-
for (int r=0; r<WARMUPS+REPEATS; r++) {
131-
t0 = System.currentTimeMillis();
132-
for (int i=0; i<NCOUNT_ADD; i++) {
133-
for (int j=0; j<NCOUNT_ADD; j++) {
134-
a128_arr[i].add_v3(a128_arr[j]);
135-
}
136-
}
137-
t1 = System.currentTimeMillis();
138-
duration = t1-t0;
139-
if (r >= WARMUPS) {
140-
totalDuration += duration;
141-
allDurations[r - WARMUPS] = duration;
142-
}
143-
}
144-
LOG.info("add_v3 took " + totalDuration + "ms " + Arrays.toString(allDurations));
110+
LOG.info("add took " + totalDuration + "ms " + Arrays.toString(allDurations));
145111

146112
totalDuration = 0;
147113
for (int r=0; r<WARMUPS+REPEATS; r++) {
@@ -160,23 +126,6 @@ private static void testPerformance() {
160126
}
161127
LOG.info("subtract took " + totalDuration + "ms " + Arrays.toString(allDurations));
162128

163-
totalDuration = 0;
164-
for (int r=0; r<WARMUPS+REPEATS; r++) {
165-
t0 = System.currentTimeMillis();
166-
for (int i=0; i<NCOUNT_ADD; i++) {
167-
for (int j=0; j<NCOUNT_ADD; j++) {
168-
a128_arr[i].subtract_v2(a128_arr[j]);
169-
}
170-
}
171-
t1 = System.currentTimeMillis();
172-
duration = t1-t0;
173-
if (r >= WARMUPS) {
174-
totalDuration += duration;
175-
allDurations[r - WARMUPS] = duration;
176-
}
177-
}
178-
LOG.info("subtract_v2 took " + totalDuration + "ms " + Arrays.toString(allDurations));
179-
180129
// Test performance of mul64 implementations:
181130
// Here we need to do something with the results to avoid the compiler optimizing the tests to nothing
182131

@@ -199,54 +148,14 @@ private static void testPerformance() {
199148
}
200149
}
201150
LOG.info("mul63 took " + totalDuration + "ms " + Arrays.toString(allDurations));
202-
203-
totalDuration = 0;
204-
for (int r=0; r<WARMUPS+REPEATS; r++) {
205-
long dummy = 0;
206-
t0 = System.currentTimeMillis();
207-
for (int i=0; i<NCOUNT_MUL; i++) {
208-
for (int j=0; j<NCOUNT_MUL; j++) {
209-
Uint128 result = Uint128.mul64_v1(a_arr[i], a_arr[j]);
210-
dummy += result.getHigh() + result.getLow();
211-
}
212-
}
213-
t1 = System.currentTimeMillis();
214-
LOG.trace("dummy = " + dummy);
215-
duration = t1-t0;
216-
if (r >= WARMUPS) {
217-
totalDuration += duration;
218-
allDurations[r - WARMUPS] = duration;
219-
}
220-
}
221-
LOG.info("mul64_v1 took " + totalDuration + "ms " + Arrays.toString(allDurations));
222151

223152
totalDuration = 0;
224153
for (int r=0; r<WARMUPS+REPEATS; r++) {
225154
long dummy = 0;
226155
t0 = System.currentTimeMillis();
227156
for (int i=0; i<NCOUNT_MUL; i++) {
228157
for (int j=0; j<NCOUNT_MUL; j++) {
229-
Uint128 result = Uint128.mul64/*_v2*/(a_arr[i], a_arr[j]);
230-
dummy += result.getHigh() + result.getLow();
231-
}
232-
}
233-
t1 = System.currentTimeMillis();
234-
LOG.trace("dummy = " + dummy);
235-
duration = t1-t0;
236-
if (r >= WARMUPS) {
237-
totalDuration += duration;
238-
allDurations[r - WARMUPS] = duration;
239-
}
240-
}
241-
LOG.info("mul64_v2 took " + totalDuration + "ms " + Arrays.toString(allDurations));
242-
243-
totalDuration = 0;
244-
for (int r=0; r<WARMUPS+REPEATS; r++) {
245-
long dummy = 0;
246-
t0 = System.currentTimeMillis();
247-
for (int i=0; i<NCOUNT_MUL; i++) {
248-
for (int j=0; j<NCOUNT_MUL; j++) {
249-
Uint128 result = Uint128.mul64_v3(a_arr[i], a_arr[j]);
158+
Uint128 result = Uint128.mul64(a_arr[i], a_arr[j]);
250159
dummy += result.getHigh() + result.getLow();
251160
}
252161
}
@@ -258,7 +167,7 @@ private static void testPerformance() {
258167
allDurations[r - WARMUPS] = duration;
259168
}
260169
}
261-
LOG.info("mul64_v3 took " + totalDuration + "ms " + Arrays.toString(allDurations));
170+
LOG.info("mul64 took " + totalDuration + "ms " + Arrays.toString(allDurations));
262171

263172
totalDuration = 0;
264173
for (int r=0; r<WARMUPS+REPEATS; r++) {

0 commit comments

Comments
 (0)