File:Haswell Apfelmännchen per Core.png
From Wikimedia Commons, the free media repository
Jump to navigation
Jump to search
Size of this preview: 800 × 574 pixels. Other resolutions: 320 × 230 pixels | 640 × 459 pixels | 1,024 × 735 pixels | 1,280 × 918 pixels | 2,560 × 1,837 pixels | 3,129 × 2,245 pixels.
Original file (3,129 × 2,245 pixels, file size: 184 KB, MIME type: image/png)
File information
Structured data
Captions
Summary
[edit]DescriptionHaswell Apfelmännchen per Core.png |
English: Possible parallelization of the Mandelbrot set calculation within a Haswell Core i7, per core. You can see that up to 128 calculations (in total 16 instructions divided between two threads) can be executed per core. On a Haswell Core i7-5960X this can be up to 1024 parallel calculations per CPU, on a Haswell Xeon E7-8890 v3 up to 2304 parallel calculations.
Modern CPUs are far from being non-parallel. |
Date | |
Source | Own work |
Author | Frank Klemm |
Basic core ...
And yes, this code runs about 5 million times faster on a dual 18-core Haswell Xeon than on my 386 with a 387...
/*
 * 128-bit (16-byte) overlay: the same storage viewed as SSE vector types
 * or as plain arrays of scalar lanes. Every member spans the full 16 bytes.
 * The first member (V2) is the one targeted by a plain brace initializer.
 */
typedef union
{
__m128d V2 [ 1];            /* one vector of 2 doubles          */
__m128 V4 [ 1];             /* one vector of 4 floats           */
__m128i I [ 1];             /* one 128-bit integer vector       */
double f64 [ 2];
float f32 [ 4];
unsigned __int64 ui64[ 2];
unsigned __int32 ui32[ 4];
unsigned __int16 ui16[ 8];
unsigned __int8 ui8 [ 16];
signed __int64 i64 [ 2];
signed __int32 i32 [ 4];
signed __int16 i16 [ 8];
signed __int8 i8 [ 16];
} _128;
/*
 * 256-bit (32-byte) overlay: the same storage viewed as AVX vector types,
 * as two 128-bit integer vectors, or as plain arrays of scalar lanes.
 * Every member spans the full 32 bytes.
 */
typedef union
{
__m256d V4 [ 1];            /* one vector of 4 doubles          */
__m256 V8 [ 1];             /* one vector of 8 floats           */
__m256i II [ 1];            /* one 256-bit integer vector       */
__m128i I [ 2];             /* two 128-bit integer vectors      */
double f64 [ 4];
float f32 [ 8];
unsigned __int64 ui64[ 4];
unsigned __int32 ui32[ 8];
unsigned __int16 ui16[ 16];
unsigned __int8 ui8 [ 32];
signed __int64 i64 [ 4];
signed __int32 i32 [ 8];
signed __int16 i16 [ 16];
signed __int8 i8 [ 32];
} _256;
/*
 * 512-bit (64-byte) overlay: the same storage viewed as smaller overlay
 * unions, as pairs of AVX vectors, or as plain arrays of scalar lanes.
 * Every member spans the full 64 bytes.
 *
 * Fix: ui32 was declared with 4 elements (16 bytes); it now has 16 so that
 * it covers the whole object like its signed counterpart i32[16].
 */
typedef union
{
_128 U128[ 4];
_256 U256[ 2];
__m256d V4 [ 2];            /* two vectors of 4 doubles         */
__m256 V8 [ 2];             /* two vectors of 8 floats          */
__m256i II [ 2];            /* two 256-bit integer vectors      */
__m128i I [ 4];             /* four 128-bit integer vectors     */
double f64 [ 8];
float f32 [ 16];
unsigned __int64 ui64[ 8];
unsigned __int32 ui32[ 16];
unsigned __int16 ui16[ 32];
unsigned __int8 ui8 [ 64];
signed __int64 i64 [ 8];
signed __int32 i32 [ 16];
signed __int16 i16 [ 32];
signed __int8 i8 [ 64];
} _512;
/*
 * 1024-bit (128-byte) overlay: the same storage viewed as smaller overlay
 * unions, as four AVX vectors, or as plain arrays of scalar lanes.
 * Every member spans the full 128 bytes.
 *
 * Fix: ui32 was declared with 8 elements (32 bytes); it now has 32 so that
 * it covers the whole object like its signed counterpart i32[32].
 */
typedef union
{
_128 U128[ 8];
_256 U256[ 4];
_512 U512[ 2];
__m256d V4 [ 4];            /* four vectors of 4 doubles        */
__m256 V8 [ 4];             /* four vectors of 8 floats         */
__m256i II [ 4];            /* four 256-bit integer vectors     */
__m128i I [ 8];             /* eight 128-bit integer vectors    */
double f64 [ 16];
float f32 [ 32];
unsigned __int64 ui64[ 16];
unsigned __int32 ui32[ 32];
unsigned __int16 ui16[ 64];
unsigned __int8 ui8 [128];
signed __int64 i64 [ 16];
signed __int32 i32 [ 32];
signed __int16 i16 [ 64];
signed __int8 i8 [128];
} _1024;
// JULIA_1: one iteration step that consumes precomputed squares.
//   im = 2*re*im + imagadd
//   re = re2 - im2 + realadd
// Operates on both 8-float halves ([0] and [1]). Expands in place and
// refers to re, im, re2, im2, realadd and imagadd from the enclosing scope;
// re2/im2 must already hold re*re and im*im (as produced by JULIA_3).
// im is updated first, using the not-yet-updated re.
#define JULIA_1 \
im[0] = _mm256_add_ps (im[0], im[0]); \
im[1] = _mm256_add_ps (im[1], im[1]); \
im[0] = _mm256_fmadd_ps (im[0], re[0], imagadd->V8[0]); \
im[1] = _mm256_fmadd_ps (im[1], re[1], imagadd->V8[1]); \
re[0] = _mm256_sub_ps (re2[0], im2[0]); \
re[1] = _mm256_sub_ps (re2[1], im2[1]); \
re[0] = _mm256_add_ps (re[0], realadd->V8[0]); \
re[1] = _mm256_add_ps (re[1], realadd->V8[1])
// JULIA_2: one iteration step that does not need precomputed squares.
//   repim = re+im
//   remim = re-im
//   im    = 2*re*im + imagadd
//   re    = repim*remim + realadd      (repim*remim == re*re - im*im)
// Operates on both 8-float halves ([0] and [1]). Expands in place and
// refers to re, im, repim, remim, realadd and imagadd from the enclosing
// scope. repim/remim are taken before im and re are overwritten.
#define JULIA_2 \
repim[0] = _mm256_add_ps (re[0], im[0]); \
repim[1] = _mm256_add_ps (re[1], im[1]); \
remim[0] = _mm256_sub_ps (re[0], im[0]); \
remim[1] = _mm256_sub_ps (re[1], im[1]); \
im[0] = _mm256_add_ps (im[0], im[0]); \
im[1] = _mm256_add_ps (im[1], im[1]); \
im[0] = _mm256_fmadd_ps (im[0], re[0], imagadd->V8[0]); \
im[1] = _mm256_fmadd_ps (im[1], re[1], imagadd->V8[1]); \
re[0] = _mm256_fmadd_ps (repim[0], remim[0], realadd->V8[0]); \
re[1] = _mm256_fmadd_ps (repim[1], remim[1], realadd->V8[1])
// JULIA_3: compute the squares and the squared magnitude of the current point.
//   re2 = re*re
//   im2 = im*im
//   sum = re2 + im2
// Operates on both 8-float halves ([0] and [1]). Expands in place and
// refers to re, im, re2, im2 and sum from the enclosing scope. The re2/im2
// results are what JULIA_1 consumes; sum is what the escape tests compare.
#define JULIA_3 \
re2[0] = _mm256_mul_ps (re[0], re[0]); \
re2[1] = _mm256_mul_ps (re[1], re[1]); \
im2[0] = _mm256_mul_ps (im[0], im[0]); \
im2[1] = _mm256_mul_ps (im[1], im[1]); \
sum[0] = _mm256_add_ps (re2[0], im2[0]); \
sum[1] = _mm256_add_ps (re2[1], im2[1])
/*
 * Iterates z -> z*z + c for 16 single-precision points at once (two __m256
 * halves of 8 floats each) and stores a per-lane iteration count.
 *
 * Parameters
 *   dst         receives 16 int32 counts (float counters converted with
 *               truncation via _mm256_cvttps_epi32).
 *   real_begin  initial real parts of the 16 points.
 *   imag_begin  initial imaginary parts of the 16 points.
 *   realadd     per-lane constant added to the real part each iteration.
 *   imagadd     per-lane constant added to the imaginary part each iteration.
 *   maxiter     iteration budget; phase 1 spends it in steps of 5, phase 2
 *               in steps of 1.
 *
 * Relies on flt_c0, flt_c1, flt_c4, flt_c5 and flt_inf, which are defined
 * outside this block (presumably __m256 broadcasts of 0, 1, 4, 5 and
 * +infinity -- confirm at their definition), and on the JULIA_1/2/3 macros.
 *
 * Phase 1 (loop1): while ALL 16 lanes still have sum < flt_c4 and at least
 * 5 iterations remain, run 5 iterations back to back and add flt_c5 to a
 * single shared counter (result[0]).
 *
 * Phase 2 (loop2): run one iteration at a time with per-lane counters.
 * A lane keeps counting while its sum compares less than flt_inf; once the
 * compare fails its increment in add[] is masked to zero for good.
 * The loop ends when no lane is active or the budget is exhausted.
 *
 * Fix: the phase-2 "any lane still active" test used _mm256_testz_pd, which
 * looks only at the sign bit of each 64-bit element and therefore ignored
 * float lanes 0, 2, 4 and 6 of the mask. _mm256_testz_ps tests the sign bit
 * of every 32-bit element, so all eight mask lanes are honoured.
 */
static void
Julia16x32_Mac (
    _512* const dst,
    const _512* const real_begin,
    const _512* const imag_begin,
    const _512* const realadd,
    const _512* const imagadd,
    const __int32 maxiter)
{
    __int32 cnt = maxiter;
    __m256 re[2] = { real_begin->V8[0], real_begin->V8[1] };
    __m256 im[2] = { imag_begin->V8[0], imag_begin->V8[1] };
    __m256 repim[2];
    __m256 remim[2];
    __m256 result[2] = { flt_c0 };          /* result[1] is overwritten before first use */
    __m256 add[2] = { flt_c1, flt_c1 };     /* per-lane increment, masked off in phase 2  */
    __m256 re2[2];
    __m256 im2[2];
    __m256 sum[2];
    __m256i cmp[2];

    goto check1;

loop1:
    /* Five iterations per pass; only the first needs the precomputed squares. */
    cnt -= 5;
    JULIA_1;
    JULIA_2;
    JULIA_2;
    JULIA_2;
    JULIA_2;
    result[0] = _mm256_add_ps (result[0], flt_c5);

check1:
    JULIA_3;
    cmp[0] = _mm256_castps_si256 (_mm256_cmp_ps (sum[0], flt_c4, _CMP_LT_OQ));
    cmp[1] = _mm256_castps_si256 (_mm256_cmp_ps (sum[1], flt_c4, _CMP_LT_OQ));
    /* AND of both halves: a lane pair is all-ones only if both lanes passed. */
    cmp[0] = _mm256_castps_si256 (_mm256_and_ps (_mm256_castsi256_ps(cmp[0]), _mm256_castsi256_ps(cmp[1])));
    /* Continue only if every bit of the combined mask is set (all 16 lanes passed). */
    if (cnt >= 5 && (cmp[0].m256i_u64[0] & cmp[0].m256i_u64[1] & cmp[0].m256i_u64[2] & cmp[0].m256i_u64[3]) == 0xFFFFFFFFFFFFFFFF)
        goto loop1;

    /* Phase 1 counted all lanes together; seed the second half's counter from it. */
    result[1] = result[0];
    goto check2;

loop2:
    cnt -= 1;
    JULIA_1;
    result[0] = _mm256_add_ps (result[0], add[0]);
    result[1] = _mm256_add_ps (result[1], add[1]);
    JULIA_3;

check2:
    cmp[0] = _mm256_castps_si256 (_mm256_cmp_ps (sum[0], flt_inf, _CMP_LT_OQ));
    cmp[1] = _mm256_castps_si256 (_mm256_cmp_ps (sum[1], flt_inf, _CMP_LT_OQ));
    /* Zero the increment of every lane whose compare failed; it stays zero afterwards. */
    add[0] = _mm256_and_ps (add[0], _mm256_castsi256_ps (cmp[0]));
    add[1] = _mm256_and_ps (add[1], _mm256_castsi256_ps (cmp[1]));
    /* OR of both halves: non-zero in a lane if either half is still active there. */
    cmp[0] = _mm256_castps_si256 (_mm256_or_ps (_mm256_castsi256_ps(cmp[0]), _mm256_castsi256_ps(cmp[1])));
    /* testz_ps returns 0 when at least one 32-bit lane has its sign bit set, i.e. is active. */
    if (cnt >= 1 && _mm256_testz_ps (_mm256_castsi256_ps(cmp[0]), _mm256_castsi256_ps(cmp[0])) == 0)
        goto loop2;

    (dst->II)[0] = _mm256_cvttps_epi32 (result[0]);
    (dst->II)[1] = _mm256_cvttps_epi32 (result[1]);
}
#undef JULIA_1
#undef JULIA_2
#undef JULIA_3
Licensing
[edit]I, the copyright holder of this work, hereby publish it under the following license:
This file is licensed under the Creative Commons Attribution-Share Alike 4.0 International license.
- You are free:
- to share – to copy, distribute and transmit the work
- to remix – to adapt the work
- Under the following conditions:
- attribution – You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
- share alike – If you remix, transform, or build upon the material, you must distribute your contributions under the same or compatible license as the original.
File history
Click on a date/time to view the file as it appeared at that time.
Date/Time | Thumbnail | Dimensions | User | Comment | |
---|---|---|---|---|---|
current | 23:56, 8 August 2017 | 3,129 × 2,245 (184 KB) | Frank Klemm (talk | contribs) | User created page with UploadWizard |
You cannot overwrite this file.
File usage on Commons
There are no pages that use this file.
File usage on other wikis
The following other wikis use this file:
- Usage on de.wikipedia.org