소스 코드를 한 장으로 합쳤다.
//=============================================================================
// performance.h --------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
// Marker macros: each expands to nothing and only tags the declaration that
// follows it.
#define CSTR // constructor place holder
#define DSTR // destructor place holder
#define _____ // place holder
// Short aliases for the built-in arithmetic types. The numeric suffix is
// presumably the intended bit width; the language itself does not guarantee it.
// NOTE(review): plain char may be signed or unsigned depending on the platform,
// so i8 is not guaranteed to be a signed type - confirm this is acceptable.
using i8 = char;
using u8 = unsigned char;
using i16 = short;
using u16 = unsigned short;
using i32 = int;
using i64 = long long;
using u32 = unsigned int;
using u64 = unsigned long long;
using f32 = float;
using f64 = double;
// CHAR_BIT is taken from <limits.h> on every toolchain. Defining it by hand in
// only one preprocessor branch left the other branch relying on some unrelated
// header to provide it, and could clash with the library's own definition.
#include <limits.h>
// __rdtsc() is declared in a different intrinsic header on each compiler family.
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
// Width of size_t in bits; BENCH::run prints it as the "bits" of the build.
#define SYS_BITS ( sizeof( size_t ) * CHAR_BIT )
//-----------------------------------------------------------------------------
// Times one benchmark candidate with the CPU time-stamp counter.
//
// A RUNNER holds three optional callbacks: on_pre runs before the timed region,
// on_run is the code being measured, and on_post reports whether the run was
// valid. check() may be called repeatedly; the smallest clock count observed is
// the one that is kept and reported.
class RUNNER
{
private:
    const char* function_name;  // label shown by report(); the string is not owned
    u64 elapsed;                // fewest clocks seen so far; all bits set until a run completes
protected:
    bool is_success;            // false when on_run is missing or any on_post() returned false
public:
    CSTR RUNNER
    (
        const char* function_name,
        void( *on_pre )( ),
        void( *on_run )( ),
        bool( *on_post )( )
    )
        : function_name( function_name )
        , on_pre( on_pre )
        , on_run( on_run )
        , on_post( on_post )
    {
        // Inside a constructor this always resolves to RUNNER::init, even if a
        // derived class overrides it.
        init();
    }
    DSTR virtual ~RUNNER()
    {};
    // Resets the measurement so the runner can be timed again from scratch.
    virtual void init()
    {
        elapsed = static_cast< u64 >( -1 );  // largest value, so the first std::min always wins
        is_success = on_run != nullptr;
    }
    void( *on_pre )( );   // optional, called before the timed region
    void( *on_run )( );   // the code being timed; check() does nothing without it
    bool( *on_post )( );  // optional, its result is AND-ed into is_success
    // Performs one timed execution of on_run and keeps the minimum clock count.
    void check()
    {
        if( !on_run )
            return;
        if( on_pre )
            on_pre();
        const u64 begin = __rdtsc();
        on_run();
        const u64 duration = __rdtsc() - begin;
        elapsed = std::min( elapsed, duration );
        if( on_post )
            is_success &= on_post();
    }
    // Copies str into a static buffer and pads it with spaces up to size
    // characters. Longer strings are kept whole, but never beyond the buffer.
    // The returned pointer refers to storage shared by every call.
    const auto set_length( const char* str, const size_t size )
    {
        static char temp[ 80 ];
        const size_t limit = sizeof( temp ) - 1;  // leave room for the terminator
        size_t index = 0;
        for( ; str[ index ] && index < limit; ++index ) temp[ index ] = str[ index ];
        for( ; index < size && index < limit; ++index ) temp[ index ] = ' ';
        temp[ index ] = 0;
        return temp;
    }
    // Formats one result row: verdict, padded name and the minimum clock count.
    // The returned pointer refers to a static buffer overwritten by the next call.
    const auto report()
    {
        const char* judge_strings[] = { " FAILED ", " PASSED " };
        static char temp[ 80 ];
        // %llu matches u64 (unsigned long long); without the '%' the count was
        // never printed. snprintf keeps long names from overrunning temp.
        snprintf( temp, sizeof( temp ), "[%s] %s %llu clocks", judge_strings[ is_success ],
                  set_length( function_name, 20 ), elapsed );
        return temp;
    }
    // Fewest clocks measured since the last init().
    const auto elapsed_clocks() const
    {
        return elapsed;
    }
    // Label supplied at construction.
    const auto name() const
    {
        return function_name;
    }
};
//-----------------------------------------------------------------------------
#include <vector>
template < size_t REPEAT_COUNT = 1 >
class BENCH
{
private:
std::vector< RUNNER* > runner_list;
public:
CSTR BENCH()
{};
DSTR ~BENCH()
{};
void add( RUNNER* runner )
{
runner_list.push_back( runner );
}
void run()
{
if( runner_list.size() == 0 ) return;
printf( "< %d bits > %d trials", (int)SYS_BITS, (int)REPEAT_COUNT );
puts( "" );
puts( "-------------------------------------------------------------" );
puts( "| CHECKER | Function name | minimum clocks |" );
puts( "-------------------------------------------------------------" );
u64 min_value = -1;
u64 max_value = 0;
u32 min_index = 0;
u32 max_index = 0;
for( u32 test_case = 0; test_case < runner_list.size(); ++test_case )
{
runner_list[ test_case ]->init();
for( u32 test_count = 0; test_count < REPEAT_COUNT; ++test_count )
runner_list[ test_case ]->check();
puts( runner_list[ test_case ]->report() );
if( min_value > runner_list[ test_case ]->elapsed_clocks() )
{
min_index = test_case;
min_value = runner_list[ test_case ]->elapsed_clocks();
}
if( max_value < runner_list[ test_case ]->elapsed_clocks() )
{
max_index = test_case;
max_value = runner_list[ test_case ]->elapsed_clocks();
}
}
puts( "-------------------------------------------------------------" );
printf( "Winner is %s ( %.2f times faster than loooser )",
runner_list[ min_index ]->name(),
float( runner_list[ max_index ]->elapsed_clocks() ) /
runner_list[ min_index ]->elapsed_clocks() );
puts( "" );
}
};
//-----------------------------------------------------------------------------
// Convenience factories that heap-allocate a RUNNER labelled with the
// stringized function_name and hand the raw pointer to the caller.
//   COOL_FUN( f, args... )      - time f( args... ) with no pre/post callbacks
//   SO_FUN( pre, f, args... )   - same, with pre as the on_pre callback
//   FUN( pre, run, post, name ) - supply all three callbacks and a display name
// The lambdas capture nothing so that they convert to the plain function
// pointers RUNNER takes; args therefore cannot refer to local variables.
// Each definition needs its line continuation: without it the macro expands to
// nothing and the expression is left stranded at file scope.
#define COOL_FUN( function_name, ... ) \
    new RUNNER( #function_name, nullptr, []{ function_name(__VA_ARGS__); }, nullptr )
#define SO_FUN( pre, function_name, ... ) \
    new RUNNER( #function_name, pre, []{ function_name(__VA_ARGS__); }, nullptr )
#define FUN( pre, run, post, function_name ) \
    new RUNNER( #function_name, pre, run, post )
//=============================================================================
// kukyakya
#include <memory>
#include <cassert>
using byte = char;
using word = std::size_t;
// Builds one word from the two neighbouring words w1 and w2, shifted by
// byte_offset bytes. Which operand is shifted in which direction depends on the
// byte order reported by the compiler's __BYTE_ORDER__ macro.
// NOTE(review): with byte_offset == 0, shift_2 equals the full width of word and
// the corresponding shift is undefined; the elided line below presumably rules
// that case out - confirm against the complete source.
word merge_word( const word& w1, const word& w2, std::size_t byte_offset )
{
// NOTE(review): the next line is truncated in this copy of the source.
[...] ) );
// Bit counts by which the first and the second word are shifted.
const std::size_t shift_1 = CHAR_BIT * byte_offset;
const std::size_t shift_2 = CHAR_BIT * sizeof( word ) - shift_1;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return ( w1 >> shift_1 ) | ( w2 << shift_2 );
#else
return ( w1 << shift_1 ) | ( w2 >> shift_2 );
#endif // __BYTE_ORDER
}
// Returns 0 when ptr already sits on a word-alignment boundary; otherwise
// returns sizeof( word ) minus the distance from ptr up to the next boundary.
std::size_t get_byte_offset( const word* ptr )
{
    const char* const original = reinterpret_cast< const char* >( ptr );

    // std::align rounds the pointer up; the space argument is set to the
    // largest value so that the adjustment is not rejected for lack of room.
    void* rounded_up = const_cast< char* >( original );
    std::size_t unlimited_space = static_cast< std::size_t >( -1 );
    std::align( alignof( word ), 0, rounded_up, unlimited_space );

    const auto gap = static_cast< const char* >( rounded_up ) - original;
    if( gap == 0 )
        return 0;
    return sizeof( word ) - gap;
}
// Copies n_byte bytes from src to dest one at a time and returns the position
// just past the last byte written.
byte* copy_byte( byte* dest, const byte* src, std::size_t n_byte )
{
    // Deliberately a plain loop: unrolling is left to the compiler.
    for( std::size_t i = 0; i < n_byte; ++i )
        dest[ i ] = src[ i ];
    return dest + n_byte;
}
// Copies n_word whole words from src to dest with direct word loads and stores.
// Returns dest, i.e. the start of the destination.
word* copy_word_both_aligned( word* dest, const word* src, std::size_t n_word )
{
    for( std::size_t i = 0; i != n_word; ++i )
        dest[ i ] = src[ i ];
    return dest;
}
// Copies n_word words to dest from a source that may not be word aligned.
// The source is read through whole aligned words and every output word is
// assembled from two neighbouring loads with merge_word. Returns dest, i.e. the
// start of the destination.
// NOTE(review): on the misaligned path one aligned word is loaded before the
// loop even when n_word is 0, and the first load starts before src; confirm that
// callers always provide readable memory around the source range.
word* copy_word_dest_aligned( word* dest, const word* src, std::size_t n_word )
{
word *d = dest;
const word *s = src;
// get byte-offset of src
std::size_t src_offset = get_byte_offset( s );
// if src is aligned, call copy_word_both_aligned
if( src_offset == 0 )
{
return copy_word_both_aligned( d, s, n_word );
}
// Step back by src_offset bytes to the start of the word that contains src.
const word* aligned_src = (const word*)( (char*)s - src_offset );
// NOTE(review): the next line is truncated in this copy of the source.
[...] == 0 );
// buf always holds the previously loaded word; each iteration loads the next
// one and merges the pair into a single destination word.
word buf = *aligned_src++;
while( n_word-- )
{
word buf2 = *aligned_src++;
*d++ = merge_word( buf, buf2, src_offset );
buf = buf2;
}
return dest;
}
// memcpy replacement that writes the destination in aligned words.
//
// Three phases: bytes until dest reaches a word boundary, whole words through
// copy_word_dest_aligned (which copes with a misaligned source), then the
// remaining tail bytes. Always returns dest.
void *kukyakya_memcpy( void* dest, const void* src, std::size_t n )
{
    void* d = dest;
    const void* s = src;
    std::size_t sz = n;

    // std::align moves d up to the next word boundary and reduces sz by the
    // bytes skipped. It fails when n is too small to reach a boundary.
    if( !std::align( alignof( word ), 0, d, sz ) )
    {
        // copy_byte returns the END of the copied range, so its result must not
        // be forwarded: the caller expects dest, as on the normal path below.
        copy_byte( (byte*)dest, (const byte*)src, n );
        return dest;
    }

    // byte copy until dest is aligned
    const std::size_t head = n - sz;
    copy_byte( (byte*)dest, (const byte*)src, head );
    s = (const byte*)s + head;

    // copy words; skipped when there are none, because the word copier loads a
    // source word before it looks at the count
    const std::size_t n_word = sz / sizeof( word );
    if( n_word != 0 )
    {
        copy_word_dest_aligned( (word*)d, (const word*)s, n_word );
        d = (word*)d + n_word;
        s = (const word*)s + n_word;
        sz -= sizeof( word ) * n_word;
    }

    // copy remaining bytes
    copy_byte( (byte*)d, (const byte*)s, sz );
    return dest;
}
void *kukyakya_memcpy_unaligned( void* dest, const void* src, std::size_t n_byte )
{
/* */ word *d = (/* */ word*)dest;
const word* s = (const word*)src;
// copy word
const std::size_t n_word = n_byte / sizeof( word );
copy_word_both_aligned( d, s, n_word );
d += n_word;
s += n_word;
n_byte -= n_word * sizeof( word );
// copy remaining bytes
copy_byte( (byte*)d, (const byte*)s, n_byte );
return dest;
}
// memcpy replacement that copies strictly byte by byte. Returns dest.
void *my_memcpy_byte_only( void* dest, const void* src, std::size_t n )
{
    // copy_byte returns the position past the last byte written; returning that
    // would hand the caller dest + n instead of dest, unlike the other copy
    // routines in this file.
    copy_byte( (byte*)dest, (const byte*)src, n );
    return dest;
}
//=============================================================================
void* codesafer_memcpy_unaligned1( void* dst, const void* src, std::size_t size )
{
using step_t = u32;
_____ step_t* d = (step_t*)dst;
const step_t* s = (step_t*)src;
const std::size_t step_count = size / sizeof(step_t);
const u32 off_road = size % sizeof(step_t);
for( std::size_t i = 0; i < step_count; ++i )
d[i] = s[i];
u8* db = (u8*)( d + step_count );
u8* sb = (u8*)( s + step_count );
for( u32 i = 0; i < off_road; ++i )
db[i] = sb[i];
return dst;
}
//=============================================================================
// test case
// Benchmark configuration: timed repetitions per candidate, and the number of
// bytes moved by every copy.
const int test_count = 100;
const int test_size = 1000000;
// Source and destination buffers. The 4 spare bytes leave room for the
// deliberately misaligned starts the tests use (src + 3 and dst + 1).
// Standard alignas replaces the per-compiler alignment attributes.
alignas( 4 ) char src[ test_size + 4 ];
alignas( 4 ) char dst[ test_size + 4 ];
// One accumulator per test; each test adds the value returned by its copy
// routine.
int result1, result2, result3, result4;
// Called once at the start of main(); there is currently nothing to set up.
void init() {}
// on_pre callback handed to every RUNNER in main(); intentionally does nothing.
void pre() {}
// Timed body for kukyakya_memcpy: copies test_size bytes between deliberately
// misaligned addresses and folds the returned pointer into result1.
void test1()
{
    // A pointer cannot be cast straight to int where pointers are wider than int,
    // so go through an integer of pointer size first. The sum is formed in
    // unsigned arithmetic, which wraps instead of overflowing.
    const auto address =
        reinterpret_cast< std::size_t >( kukyakya_memcpy( dst + 1, src + 3, test_size ) );
    result1 = static_cast< int >( static_cast< unsigned >( result1 ) + static_cast< unsigned >( address ) );
}
// Timed body for kukyakya_memcpy_unaligned: copies test_size bytes between
// deliberately misaligned addresses and folds the returned pointer into result2.
void test2()
{
    // A pointer cannot be cast straight to int where pointers are wider than int,
    // so go through an integer of pointer size first. The sum is formed in
    // unsigned arithmetic, which wraps instead of overflowing.
    const auto address =
        reinterpret_cast< std::size_t >( kukyakya_memcpy_unaligned( dst + 1, src + 3, test_size ) );
    result2 = static_cast< int >( static_cast< unsigned >( result2 ) + static_cast< unsigned >( address ) );
}
// Timed body for the library memcpy: copies test_size bytes between
// deliberately misaligned addresses and folds the returned pointer into result3.
void test3()
{
    // A pointer cannot be cast straight to int where pointers are wider than int,
    // so go through an integer of pointer size first. The sum is formed in
    // unsigned arithmetic, which wraps instead of overflowing.
    const auto address =
        reinterpret_cast< std::size_t >( memcpy( dst + 1, src + 3, test_size ) );
    result3 = static_cast< int >( static_cast< unsigned >( result3 ) + static_cast< unsigned >( address ) );
}
// Timed body for codesafer_memcpy_unaligned1: copies test_size bytes between
// deliberately misaligned addresses and folds the returned pointer into result4.
void test4()
{
    // A pointer cannot be cast straight to int where pointers are wider than int,
    // so go through an integer of pointer size first. The sum is formed in
    // unsigned arithmetic, which wraps instead of overflowing.
    const auto address =
        reinterpret_cast< std::size_t >( codesafer_memcpy_unaligned1( dst + 1, src + 3, test_size ) );
    result4 = static_cast< int >( static_cast< unsigned >( result4 ) + static_cast< unsigned >( address ) );
}
// on_post callback handed to every RUNNER in main(); it performs no check and
// always reports success.
bool post() { return true; }
//-----------------------------------------------------------------------------
#include <iostream>
using namespace std;
int main()
{
init();
BENCH< test_count > bench;
bench.add( FUN( pre, test1, post, kukyakya_aligned ) );
bench.add( FUN( pre, test2, post, kukyakya_unaligned ) );
bench.add( FUN( pre, test3, post, std_memcpy ) );
bench.add( FUN( pre, test4, post, codesafer_unaligned1 ) );
bench.run();
return 0;
}
댓글 영역
획득법
① NFT 발행
작성한 게시물을 NFT로 발행하면 일주일 동안 사용할 수 있습니다. (최초 1회)
② NFT 구매
다른 이용자의 NFT를 구매하면 한 달 동안 사용할 수 있습니다. (구매 시마다 갱신)
사용법
디시콘에서 지갑 연결 시 바로 사용 가능합니다.