Download

Download TrialDivLP.zip - 56 KB

Contents

Background
Introduction
First Approach: Plain List of Prime Numbers
Second Approach: Compress the List of Prime Numbers by Encoding in a Bitfield Array
Encoding the List of Primes
Benchmark
1. 128 bits integers
Accelerating IsPrime with the compressed list of primes
Points of Interest
Links
History

Background

This article is part of a set of articles. Each article highlights an aspect around Integer Factorization.

Integer Factorization: Trial Division Algorithm[^] : focuses on variants of Trial Division algorithm and their respective efficiency.
Integer Factorization: Dreaded list of primes[^] : focuses on a method to handle a large list of Primes by compression.
Integer Factorization: Optimizing Small Factors Checking[^] : focuses on a method to check small factors faster than division.
Integer factorization: Reversing the Multiplication[^] : focuses on another approach to optimize the Trial Division algorithm.

Introduction

The Trial Division algorithm is about factorizing an integer by checking all possible factors, and all interesting factors are prime numbers. The problem is how to handle a large list of prime numbers.

First Approach: Plain List of Prime Numbers

When using a list of primes, the first approach is to use a simple array of integers containing the prime numbers.

Under 64K, a prime number fits in an int16 (2 bytes), beyond, a prime number rather fits in an int32 (4 bytes).

Figures for a simple list of primes:

Upper limit	Number of primes	Size of array
65,536	6,542	13,084
100,000	9,592	38,368
500,000	41,538	166,152
1,000,000	78,498	313,992
5,000,000	348,513	1,394,052
10,000,000	664,579	2,658,316
100,000,000	5,761,455	23,045,820,
2^32-1 4,294,967,295	203,280,221	813,120,884

As one can see, things get really ugly when the list is huge.

Sample code:

// Trial Division: Square Root + Prime list + Wheel
// Check all numbers until Square Root, start with a list of primes
// and Skip useless factors with a wheel
long long TD_SRPW(long long Cand) {
    long long SPrimes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43,
            47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113,
            127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191,
            193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263,
            269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347,
            349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421,
            431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499,
            503, 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593,
            599, 601, 0 };
    long long Wheel[] = { 6, 4, 2, 4, 2, 4, 6, 2, 0 };
    long long Div;

    Count = 0;
    long long Top = sqrt(Cand);
    // Check small primes
    for (long long Ind = 0; SPrimes[Ind] != 0; Ind++) {
        Div = SPrimes[Ind]; // for debug purpose
        if (SPrimes[Ind] > Top) {
            return Cand;
        }
        Count++;
        if (Cand % SPrimes[Ind] == 0) {
            return SPrimes[Ind];
        }
    }
    // Start the Wheel
    Div = 601;
    while (Div <= Top) {
        for (long long Ind = 0; Wheel[Ind] != 0; Ind++) {
            if (Div > Top) {
                return Cand;
            }
            Count++;
            if (Cand % Div == 0) {
                return Div;
            }
            Div += Wheel[Ind];
        }
    }
    return Cand;
}

TrialDivLP.cpp in TrialDivLP.zip.

Second Approach: Compress the List of Prime Numbers by Encoding in a Bitfield Array

Rather than a list of primes, the principle is to encode if a number is a prime or not, it is boolean information and fits in a single bit.

With this approach, the same optimizations than in the classic variants of Trial Division algorithm can apply and the saving is huge.

Bitfield of All Numbers

Very simple minded, 1 bit per number, or 8 numbers encoded in a byte.

The figures are as follows:

Upper limit	Number of primes	Size of array
65,536	6,542	8,192
100,000	9,592	12,500
500,000	41,538	62,500
1,000,000	78,498	125,000
5,000,000	348,513	625,000
10,000,000	664,579	1,250,000
100,000,000	5,761,455	12,500,000,
2^32-1 4,294,967,295	203,280,221	536,870,912

Bitfield of All Odd Numbers

A little more optimized, 1 bit per odd number, or 16 numbers (8 odd numbers) encoded in a byte.

The figures are as shown below:

Upper limit	Number of primes	Size of array
65,536	6,542	4,096
100,000	9,592	6,250
500,000	41,538	31,250
1,000,000	78,498	62,500
5,000,000	348,513	312,500
10,000,000	664,579	,625,000
100,000,000	5,761,455	6,250,000,
2^32-1 4,294,967,295	203,280,221	268,435,456

Bitfield of All Numbers Filtered With the Wheel 30

Then really more optimized, with the wheel, only 8 possible primes per slice of 30, or 30 numbers (8 possible primes) encoded in a byte.

The figures are:

Upper limit	Number of primes	Size of array
65,536	6,542	2,188
100,000	9,592	3,333
500,000	41,538	16,667
1,000,000	78,498	33,333
5,000,000	348,513	166,667
10,000,000	664,579	333,334
100,000,000	5,761,455	3,333,334,
2^32-1 4,294,967,295	203,280,221	143,165,577

Code exploiting the list of primes (stored in header file).

// Trial Division: Square Root + Long Prime list + Wheel
// Check all numbers until Square Root, start with a list of primes
// and Skip useless factors with a wheel
long long TD_SRLPW(long long Cand) {
	long long WPrimes[] = { 2, 3, 5, 0 };
	long long Wheel[] = { 6, 4, 2, 4, 2, 4, 6, 2, 0 };
	// unsigned char EPrimes[] = { 0xfe, 0xdf, 0xef, 0x7e, 0xb6, 0xdb, 0x3d, ...
	// Encoded list moved to header file

	long long Div;

	Count = 0;
	long long Top = sqrt(Cand);
	// Check small primes
	for (long long Ind = 0; WPrimes[Ind] != 0; Ind++) {
		Div = WPrimes[Ind]; // for debug purpose
		if (WPrimes[Ind] > Top) {
			return Cand;
		}
		Count++;
		if (Cand % WPrimes[Ind] == 0) {
			return WPrimes[Ind];
		}
	}
	// start the wheel with encoded primes
	Div = 1;
	int EInd = 0;
	while (Div <= Top) {
		unsigned char Flag = EPrimes[EInd];
		unsigned char Mask = 1;
		// if (Flag == 0) {
		if (EInd >= EPrimesLen) {
			break;
		}
		for (long long Ind = 0; Wheel[Ind] != 0; Ind++) {
			if (Div > Top) {
				break;
			}
			if (Flag & Mask) {
				Count++;
				if (Cand % Div == 0) {
					return Div;
				}
			}
			Div += Wheel[Ind];
			Mask = Mask << 1;
		}
		EInd++;
	}

	// Start the Wheel after encoded primes
	while (Div <= Top) {
		for (long long Ind = 0; Wheel[Ind] != 0; Ind++) {
			if (Div > Top) {
				return Cand;
			}
			Count++;
			if (Cand % Div == 0) {
				return Div;
			}
			Div += Wheel[Ind];
		}
	}
	return Cand;
}

TrialDivLP.cpp in TrialDivLP.zip.

Encoding the List of Primes

The Wheel of 30 have 8 slots, a char have 8 bits, so both are in synch, it simplify its usage. Each slot is assigned a bit. The value of a char is the sum of bits of slots which are Primes.

Here is how the list is encoded:

0x01    0x02    0x04    0x08    0x10    0x20    0x40    0x80    code
        7       11      13      17      19      23      29      0xfe
31      37      41      43      47              53      59      0xdf
61      67      71      73              79      83      89      0xef
        97      101     103     107     109     113             0x7e
        127     131             137     139             149     0xb6
151     157             163     167             173     179     0xdb
181             191     193     197     199                     0x3d
211                     223     227     229     233     239     0xf9
241             251             257             263     269     0xd5
271     277     281     283                     293             0x4f
        307     311     313     317                             0x1e
331     337                     347     349     353     359     0xf3
        367             373             379     383     389     0xea
        397     401                     409             419     0xa6
421             431     433             439     443     449     0xed
        457     461     463     467                     479     0x9e
        487     491                     499     503     509     0xe6
                521     523                                     0x0c
541     547                     557             563     569     0xd3
571     577                     587             593     599     0xd3

The code showing how the list is encoded is as given below:

void TD_EncodeDtl(int Max) {
	long long Wheel[] = { 6, 4, 2, 4, 2, 4, 6, 2, 0 };
	long long Cand = 1;
	cout << "0x01\t0x02\t0x04\t0x08\t0x10\t0x20\t0x40\t0x80\tcode" << endl;
	while (Cand < Max) {
		int Code = 0;
		int Index = 1;
		int Ind = 0;
		do {
			if ((TD_SRW(Cand) == Cand) && (Cand != 1)) {
				Code |= Index;
				cout << dec << Cand;
			}
			cout << "\t";
			Cand += Wheel[Ind];
			Index = Index << 1;
			Ind++;
		} while (Wheel[Ind] != 0);
		cout << "0x" << setfill('0') << setw(2) << hex << Code << dec << endl;
	}
}

TrialDivLP.cpp in TrialDivLP.zip.

And the code to generate the array is as follows :

// encode Prime Numbers as C array
void TD_EncodeArray(int Max) {
	long long Wheel[] = { 6, 4, 2, 4, 2, 4, 6, 2, 0 };
	long long Cand = 1;
	cout << "Encodage liste nombres premiers compressée" << endl;
	cout << "{";
	while (Cand < Max) {
		int Code = 0;
		int Index = 1;
		int Ind = 0;
		do {
			if ((TD_SRW(Cand) == Cand) && (Cand != 1)) {
				Code |= Index;
			}
			Cand += Wheel[Ind];
			Index = Index << 1;
			Ind++;
		} while (Wheel[Ind] != 0);
		cout << "0x";
		cout << setfill('0') << setw(2) << hex << Code << ",";
	}
	cout << "0}" << dec << endl;
}

TrialDivLP.cpp in TrialDivLP.zip.

Benchmark

We are counting divisions/modulos to measure the efficiency of the variantes. The encoded list of Primes goes up to 1,000,000 and is stored in a 33kB array.

Number				TD_SR	TD_SREF	TD_SRW	TD_SRLPW	Delta
1,009				30		16		11		11			0
2,003				43		22		14		14			0
3,001				53		27		17		16			1
4,001				62		32		19		18			1
5,003				69		35		20		19			1
6,007				76		39		23		21			2
7,001				82		42		25		23			2
8,009				88		45		26		24			2
9,001				93		47		27		24			3
10,007				99		50		28		25			3
20,011				140		71		40		34			6
30,011				172		87		49		40			9
40,009				199		100		56		46			10
49,999				222		112		62		48			14
1,000,003			999		500		268		168			100
4,000,021			1,800	901		483		279			204
9,000,011			2,999	1,500	802		430			372
16,000,057			3,999	2,000	1,068	550			518
25,000,009			4,999	2,500	1,336	669			667
36,000,007			5,999	3,000	1,602	783			819
49,000,027			6,999	3,500	1,868	900			968
64,000,031			7,999	4,000	2,136	1,007		1,129
81,000,001			8,999	4,500	2,402	1,117		1,285
100,000,007			9,999	5,000	2,668	1,229		1,439
121,000,003			10,999	5,500	2,936	1,335		1,601
144,000,001			11,999	6,000	3,202	1,438		1,764
169,000,033			12,999	6,500	3,468	1,547		1,921
196,000,001			13,999	7,000	3,736	1,652		2,084
225,000,011			14,999	7,500	4,002	1,754		2,248
256,000,001			15,999	8,000	4,268	1,862		2,406
289,000,001			16,999	8,500	4,536	1,960		2,576
10,000,000,019		99,999	50,000	26,668	9,592		17,076
1,000,000,000,039	999,999	500,000	266,668	78,498		188,170

Delta is here to highlight the savings of the huge list of primes.

	long long Test[] = { 101, 1009, 2003, 3001, 4001, 5003, 6007, 7001, 8009,
			9001, 10007, 20011, 30011, 40009, 49999, 1000003, 4000021, 9000011,
			16000057, 25000009, 36000007, 49000027, 64000031, 81000001,
			100000007, 121000003, 144000001, 169000033, 196000001, 225000011,
			256000001, 289000001, 10000000019, 1000000000039, 0 };
	cout << "Comparaison Variantes" << endl;
	cout << "Number\tTD_SR\tTD_SREF\tTD_SRW\tTD_SRLPW\tDelta" << endl;
	// test after first 0
	for (Ind = 0; Test[Ind] != 0; Ind++) {
		cout << Test[Ind] << "\t";
		TD_SR(Test[Ind]);
		cout << Count << "\t";
		TD_SREF(Test[Ind]);
		cout << Count << "\t";
		TD_SRW(Test[Ind]);
		int C1 = Count;
		cout << Count << "\t";
		TD_SRLPW(Test[Ind]);
		cout << Count << "\t";
		cout << C1 - Count << endl;
	}
	cout << endl;

TrialDivLP.cpp in TrialDivLP.zip.

128 bits integers

GCC have provisions for 128 bits variables by using 2 64 bits registers, it is ok for calculations, but is not supported as constants in source code nor for IO operations. A little programming is needed to circonvant those limitations.

//#define int64
// GCC have provisions to handle 128 bits integers, but not for constants in source code
// and not IO
#ifdef int64
// 64 bits integer
typedef long long MyBigInt;

void output(MyBigInt Num, int Size) {
	cout << setfill(' ') << setw(Size);
}
#else
// 128 bits integer
typedef __int128 MyBigInt;

void output(MyBigInt Num, int Size) {
	if (Num <= (long long) 0x7fffffffffffffff) {
		if (Size > 0) {
			cout << setfill(' ') << setw(Size);
		}
		cout << (long long) Num;
	} else {
		output(Num / 1000, Size - 4);
		long long Tmp = Num % 1000;
		cout << "," << setfill('0') << setw(3) << Tmp;
	}
}
#endif

TrialDivLP.cpp in TrialDivLP.zip.

	Number                      Fartor Timing
        10,000,000,000,000,061	1	 365 ms
       100,000,000,000,000,003	1	 1,173 ms
     1,000,000,000,000,000,003	1	 3,866 ms
    10,000,000,000,000,000,051	1	 11,988 ms
   100,000,000,000,000,000,039	1	 82,326 ms
 1,000,000,000,000,000,000,117	1	 272,376 ms

Each time the number is multiplied by 10, runtime is roughly multiplied by 3 (sqrt(10)). Note the runtime gap when number cross the 64 bits boundary.

#ifndef int64
	long long Test2[][3] = { { 10, 16, 61 }, { 10, 17, 3 }, { 10, 18, 3 }, { 10,
			19, 51 }, { 10, 20, 39 }, { 10, 21, 117 }, { 0, 0, 0 } };
	// for each triplet A B C, the number is
	// Cand= A ^ B + C
	cout.imbue(locale(cout.getloc(), new gr3));
	for (Ind = 0; Test2[Ind][0] > 0; Ind++) {
		for (int Offset = 0; Offset < 1000; Offset += 2) {
			MyBigInt Cand = 1;
			for (int Scan = 0; Scan < Test2[Ind][1]; Scan++) {
				Cand *= Test2[Ind][0];
			}
			Cand += Test2[Ind][2] + Offset;
			output(Cand, 30);
			cout << "\t";
			auto start2 = high_resolution_clock::now();
			long long Tmp = TD_SRLPW(Cand);
			auto stop2 = high_resolution_clock::now();
			if (Tmp == 1) {
				output(Tmp, 0);
			} else {
				cout << Tmp;
				cout << "\t";
				output((Cand / Tmp) * Tmp, 0);
			}
			cout << "\t";
			auto duration = duration_cast<milliseconds>(stop2 - start2);
			cout << " " << duration.count() << " ms";
			cout << endl;
			if (Tmp == 1) {
				break;
			}
		}
	}
	cout.imbue(locale(cout.getloc(), new gr0));
#endif

TrialDivLP.cpp in TrialDivLP.zip.

Accelerating IsPrime with the compressed list of primes

We can take advantage of huge list of Primes and use it as a loopup table.

With a plain list, we need to do a dichotomy search.

With an encoded list, an almost direct reading is possible.

// IsPrime taking advantage of long list of primes table
// returns 1 if prime, returns 0 otherwise
int IsPrimeLP(MyBigInt Cand) {
	const unsigned long long Wheel[] = { 6, 4, 2, 4, 2, 4, 6, 2, 0 };
	const unsigned long long WOffset[] = { 1, 7, 11, 13, 17, 19, 23, 29, 0 };
	if (Cand <= EPrimesMax) {
		Count = 1;
		// Cand is within list of primes
		long long x = Cand / WSize; // position in wheel list
		long long y = Cand % WSize; // offset in wheel
		for (int Index = 0; WOffset[Index] != 0; Index++) {
			if (WOffset[Index] == y) {
				return ((EPrimes[x] >> Index) & 0x01);
			}
		}
		if (Cand == 2)
			return 1;
		else if (Cand == 3)
			return 1;
		else if (Cand == 5)
			return 1;
		else
			return 0; // multiple of 2, 3 or 5
	}
	// Search a Factor
	return (TD_SRLPW(Cand) == 1);
}

TrialDivLP.cpp in TrialDivLP.zip.

Points of Interest

Trial Division algorithm improves as the list of primes get bigger, and the compression helps a lot.

This work is original. If you know something similar, please drop a link in the forum below.

Links

History

18^th December, 2020: First version
25^th December, 2020: Second version - Corrections with improvements
6^th February, 2021: Added IsPrime code + Corrections
7^th August, 2022: Added Content Table
20^th August, 2022: typos in source code