Andrei alexandrescu three optimization tips for c++提到一个格式化数字为字符串的优化

May 07, 2014 | 5 Minute Read

Andrei Alexandrescu "Three Optimization Tips for C++"
网上可以找到这个演示,里面提到了几种优化办法,不过都被墙了,
下面这个页面也有点介绍
https://m.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920

提到的优化技巧:
Prefer static linking and position-dependent code (as opposed to PIC, position-independent code).
Prefer 64-bit code and 32-bit data.
Prefer array indexing to pointers (this one seems to reverse every ten years).
Prefer regular memory access patterns.
Minimize control flow.            // 减少复杂的跳转,复杂的if else 函数调用次数等
Avoid data dependencies.
避免内存操作,宁愿更多的计算也不要才注意内存。一个cache不命中造成的延时就可能导致cpu流水线中断很久。
浮点数的运算和整型数一样快,浮点数的乘除法甚至更快


// 普通的转换
uint32_t u64ToAsciiClassic(uint64_t value, char* dst) {
	// Write backwards.
	auto start = dst;
	do {
		*dst++ = '0' + (value % 10);
		value /= 10;
	} while (value != 0);
	const uint32_t result = dst - start;
	// Reverse in place.
	for (dst--; dst > start; start++, dst--) {
		std::iter_swap(dst, start);
	}
	return result;
}


uint32_t digits10(uint64_t v) {
    uint32_t result = 0;
    do {
        ++result;
         v /= 10;
    } while (v);
     return result;
}


//优化1 用比较操作取代除法
uint32_t digits10(uint64_t v) {
	uint32_t result = 1;
	for (;;) {
		if (v < 10) return result;
		if (v < 100) return result + 1;
		if (v < 1000) return result + 2;
		if (v < 10000) return result + 3;
		// Skip ahead by 4 orders of magnitude
		v /= 10000U;
		result += 4;
	}
}

//优化2  用计算取代数组内存的读写,避免后面那个std::iter_swap
uint32_t uint64ToAscii(uint64_t v, char *const buffer) {
	auto const result = digits10(v);
	uint32_t pos = result - 1;
	while (v >= 10) {
		auto const q = v / 10;
		auto const r = static_cast<uint32_t>(v % 10); 
		buffer[pos--] = '0' + r;
		v = q;
	} 
	//assert(pos == 0); // Last digit is trivial to handle
	*buffer = static_cast<uint32_t>(v)+ '0';
		return result;
}


const uint64_t P01 = 10;
const uint64_t P02 = 100;
const uint64_t P03 = 1000;
const uint64_t P04 = 10000;
const uint64_t P05 = 100000;
const uint64_t P06 = 1000000;
const uint64_t P07 = 10000000;
const uint64_t P08 = 100000000;
const uint64_t P09 = 1000000000;
const uint64_t P10 = 10000000000;
const uint64_t P11 = 100000000000;
const uint64_t P12 = 1000000000000;

//优化3算法优化  二分查找十进制表示的数字位数
uint32_t digits10(uint64_t v) {
	if (v < P01) return 1;
	if (v < P02) return 2;
	if (v < P03) return 3;
	if (v < P12) {
		if (v < P08) {
			if (v < P06) {
				if (v < P04) return 4;
				return 5 + (v >= P05);
			}
			return 7 + (v >= P07);
		}
		if (v < P10) {
			return 9 + (v >= P09);
		}
		return 11 + (v >= P11);
	}
	return 12 + digits10(v / P12);
}


//优化3 算法优化  每次转换两个数字
unsigned u64ToAsciiTable(uint64_t value, char* dst) {
	static const char digits[201] =
		"0001020304050607080910111213141516171819"
		"2021222324252627282930313233343536373839"
		"4041424344454647484950515253545556575859"
		"6061626364656667686970717273747576777879"
		"8081828384858687888990919293949596979899";
	uint32_t const length = digits10(value);
	uint32_t next = length - 1;
	while (value >= 100) {
		auto const i = (value % 100) * 2;
		value /= 100;
		dst[next] = digits[i + 1];
		dst[next - 1] = digits[i];
		next -= 2;
	}
	// Handle last 1-2 digits
	if (value < 10) {
		dst[next] = '0' + uint32_t(value);
	}
	else {
		auto i = uint32_t(value) * 2;
		dst[next] = digits[i + 1];
		dst[next - 1] = digits[i];
	}
	return length;
}




对这3个数做测试
4557
3452635722
9223372036854775800

------原始版本测试结果--------------
输入要测试的数:4557
执行次数(n):    100000000
总耗时(微秒):    3499944
平均耗时(微秒)0.0349994

输入要测试的数:3452635722
执行次数(n):    100000000
总耗时(微秒):    8764613
平均耗时(微秒)0.0876461

输入要测试的数:9223372036854775800
执行次数(n):    100000000
总耗时(微秒):    17935778
平均耗时(微秒)0.179358

--------优化版本2的测试结果--------
输入要测试的数:4557
执行次数(n):    100000000
总耗时(微秒):    2785353
平均耗时(微秒)0.0278535

输入要测试的数:3452635722
执行次数(n):    100000000
总耗时(微秒):    9338686
平均耗时(微秒)0.0933869

输入要测试的数:922337203685477580
执行次数(n):    100000000
总耗时(微秒):    18537854
平均耗时(微秒)0.185379
--------最后优化版本3的结果--------
输入要测试的数:4557
执行次数(n):    100000000
总耗时(微秒):    2077764
平均耗时(微秒)0.0207776

输入要测试的数:455
执行次数(n):    100000000
总耗时(微秒):    1763224
平均耗时(微秒)0.0176322

输入要测试的数:3452635722
执行次数(n):    100000000
总耗时(微秒):    5109649
平均耗时(微秒)0.0510965

输入要测试的数:9223372036854775800
执行次数(n):    100000000
总耗时(微秒):    12360570
平均耗时(微秒)0.123606
-------------------------------------



在intel i7 上面的测试,好像第2版的优化作用不是很明显的。 第3版稍微好点。可能编译的32位程序,模拟的64位除非运算的影响?


测试代码

//#include <winsock2.h>
//#include <ws2tcpip.h>

#define _USE_32BIT_TIME_T
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <list>
#include <vector>
#include <random>

//
//#include <boost/shared_ptr.hpp>
//#include <boost/weak_ptr.hpp>

#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/timeb.h>
#include <time.h>
#include "pugixml.hpp"

#ifdef _WIN32
#include <Windows.h>
void my_gettimeofday(struct timeval *tp)
{
	uint64_t  intervals;
	FILETIME  ft;

	GetSystemTimeAsFileTime(&ft);

	/*
	* A file time is a 64-bit value that represents the number
	* of 100-nanosecond intervals that have elapsed since
	* January 1, 1601 12:00 A.M. UTC.
	*
	* Between January 1, 1970 (Epoch) and January 1, 1601 there were
	* 134744 days,
	* 11644473600 seconds or
	* 11644473600,000,000,0 100-nanosecond intervals.
	*
	* See also MSKB Q167296.
	*/

	intervals = ((uint64_t)ft.dwHighDateTime << 32) | ft.dwLowDateTime;
	intervals -= 116444736000000000;

	tp->tv_sec = (long)(intervals / 10000000);
	tp->tv_usec = (long)((intervals % 10000000) / 10);
}
#else
#include <sys/time.h>
#define my_gettimeofday(tp)  (void) gettimeofday(tp, NULL);
#endif

// 返回微秒时间差异
unsigned long long time_stamp_usec()
{
	enum { kUsecPerSec = 1000 * 1000 };
	struct timeval tp;
	my_gettimeofday(&tp);
	return tp.tv_sec * kUsecPerSec  + tp.tv_usec;
}

void print_time_diff(unsigned long long t0, unsigned long long t1, int n)
{
	std::cout << "执行次数(n):\t" << n  << std::endl
			  << "总耗时(微秒):\t " << t1 - t0 << std::endl
	          << "平均耗时(微秒)" << (double)(t1 - t0)/(double)n << std::endl;
}


using namespace std;



//----------------------------------------------
uint32_t digits10(uint64_t v) {
	uint32_t result = 1;
	for (;;) {
		if (v < 10) return result;
		if (v < 100) return result + 1;
		if (v < 1000) return result + 2;
		if (v < 10000) return result + 3;
		// Skip ahead by 4 orders of magnitude
		v /= 10000U;
		result += 4;
	}
}

//优化2  用计算取代数组内存的读写,避免后面那个std::iter_swap
__declspec(noinline) uint32_t uint64ToAscii(uint64_t v, char *const buffer) {
	auto const result = digits10(v);
	uint32_t pos = result - 1;
	while (v >= 10) {
		auto const q = v / 10;
		auto const r = static_cast<uint32_t>(v % 10); 
		buffer[pos--] = '0' + r;
		v = q;
	} 
	//assert(pos == 0); // Last digit is trivial to handle
	*buffer = static_cast<uint32_t>(v)+ '0';
		return result;
}


//-----------------------------------------------



int main(int, char**)
{
	
	//int a = 1;
	//
	//return 0;
again:
	char output[65];
	uint64_t a = 0;
	cout << "输入要测试的数:";
	cin >> a;



	stringstream sstream;
	unsigned long long total_counter = 0;
	int kLoopCount = 10000 * 10000;
	int i, j;
	unsigned long long t0, t1;
	//----------------------------------
	t0 = time_stamp_usec();
	for (i = 0; i < kLoopCount; i++) {
		uint64ToAscii(a, output);
		total_counter += output[i%64];
	}
	t1 = time_stamp_usec();
	print_time_diff(t0, t1, kLoopCount);	
	//----------------------------------
	std::cout << total_counter << std::endl;

	goto again;


	int aaa;
	cin >> aaa;
	cout << aaa;
	return 0;
}