parent
deaadfef4a
commit
0755ce7628
@ -0,0 +1,241 @@ |
||||
/*
|
||||
Vector Extensions - Mandelbrot |
||||
"Succesfully programmed the dishwasher and debugged the pump this week..." - javidx9 |
||||
|
||||
License (OLC-3) |
||||
~~~~~~~~~~~~~~~ |
||||
|
||||
Copyright 2018-2020 OneLoneCoder.com |
||||
|
||||
Redistribution and use in source and binary forms, with or without |
||||
modification, are permitted provided that the following conditions |
||||
are met: |
||||
|
||||
1. Redistributions or derivations of source code must retain the above |
||||
copyright notice, this list of conditions and the following disclaimer. |
||||
|
||||
2. Redistributions or derivative works in binary form must reproduce |
||||
the above copyright notice. This list of conditions and the following |
||||
disclaimer must be reproduced in the documentation and/or other |
||||
materials provided with the distribution. |
||||
|
||||
3. Neither the name of the copyright holder nor the names of its |
||||
contributors may be used to endorse or promote products derived |
||||
from this software without specific prior written permission. |
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
|
||||
Relevant Video: https://youtu.be/x9Scb5Mku1g
|
||||
|
||||
Links |
||||
~~~~~ |
||||
YouTube: https://www.youtube.com/javidx9
|
||||
https://www.youtube.com/javidx9extra
|
||||
Discord: https://discord.gg/WhwHUMV
|
||||
Twitter: https://www.twitter.com/javidx9
|
||||
Twitch: https://www.twitch.tv/javidx9
|
||||
GitHub: https://www.github.com/onelonecoder
|
||||
Patreon: https://www.patreon.com/javidx9
|
||||
Homepage: https://www.onelonecoder.com
|
||||
|
||||
Community Blog: https://community.onelonecoder.com
|
||||
|
||||
Author |
||||
~~~~~~ |
||||
David Barr, aka javidx9, ©OneLoneCoder 2018, 2019, 2020 |
||||
*/ |
||||
|
||||
// NOTE: THIS PROGRAM CANNOT BE EXCUTED - IT IS INTENDED AS A GUIDE
|
||||
// TO THIS VIDEO: https://youtu.be/x9Scb5Mku1g
|
||||
|
||||
|
||||
// Method 4) - Use AVX2 Vector co-processor to handle 4 fractal locations at once
|
||||
void CreateFractalIntrinsics(const olc::vi2d& pix_tl, const olc::vi2d& pix_br, const olc::vd2d& frac_tl, const olc::vd2d& frac_br, const int iterations) |
||||
{ |
||||
double x_scale = (frac_br.x - frac_tl.x) / (double(pix_br.x) - double(pix_tl.x)); |
||||
double y_scale = (frac_br.y - frac_tl.y) / (double(pix_br.y) - double(pix_tl.y)); |
||||
|
||||
double y_pos = frac_tl.y; |
||||
|
||||
int y_offset = 0; |
||||
int row_size = ScreenWidth(); |
||||
|
||||
int x, y; |
||||
|
||||
// 64-bit "double" registers
|
||||
__m256d _a, _b, _two, _four, _mask1; |
||||
__m256d _zr, _zi, _zr2, _zi2, _cr, _ci; |
||||
__m256d _x_pos_offsets, _x_pos, _x_scale, _x_jump; |
||||
|
||||
// 64-bit "integer" registers
|
||||
__m256i _one, _c, _n, _iterations, _mask2; |
||||
|
||||
// Expand constants into vectors of constants
|
||||
// one = |(int)1|(int)1|(int)1|(int)1|
|
||||
_one = _mm256_set1_epi64x(1); |
||||
|
||||
// two = |2.0|2.0|2.0|2.0|
|
||||
_two = _mm256_set1_pd(2.0); |
||||
|
||||
// four = |4.0|4.0|4.0|4.0|
|
||||
_four = _mm256_set1_pd(4.0); |
||||
|
||||
// iterations = |iterations|iterations|iterations|iterations|
|
||||
_iterations = _mm256_set1_epi64x(iterations); |
||||
|
||||
_x_scale = _mm256_set1_pd(x_scale); |
||||
_x_jump = _mm256_set1_pd(x_scale * 4); |
||||
_x_pos_offsets = _mm256_set_pd(0, 1, 2, 3); |
||||
_x_pos_offsets = _mm256_mul_pd(_x_pos_offsets, _x_scale); |
||||
|
||||
|
||||
for (y = pix_tl.y; y < pix_br.y; y++) |
||||
{ |
||||
// Reset x_position
|
||||
_a = _mm256_set1_pd(frac_tl.x); |
||||
_x_pos = _mm256_add_pd(_a, _x_pos_offsets); |
||||
|
||||
_ci = _mm256_set1_pd(y_pos); |
||||
|
||||
for (x = pix_tl.x; x < pix_br.x; x += 4) |
||||
{ |
||||
_cr = _x_pos; |
||||
|
||||
// Zreal = 0
|
||||
_zr = _mm256_setzero_pd(); |
||||
|
||||
// Zimag = 0
|
||||
_zi = _mm256_setzero_pd(); |
||||
|
||||
// nIterations = 0
|
||||
_n = _mm256_setzero_si256(); |
||||
|
||||
|
||||
repeat: |
||||
// Normal: z = (z * z) + c;
|
||||
// Manual: a = zr * zr - zi * zi + cr;
|
||||
// b = zr * zi * 2.0 + ci;
|
||||
// zr = a;
|
||||
// zi = b;
|
||||
|
||||
|
||||
// zr^2 = zr * zr
|
||||
_zr2 = _mm256_mul_pd(_zr, _zr); // zr * zr
|
||||
|
||||
// zi^2 = zi * zi
|
||||
_zi2 = _mm256_mul_pd(_zi, _zi); // zi * zi
|
||||
|
||||
// a = zr^2 - zi^2
|
||||
_a = _mm256_sub_pd(_zr2, _zi2); // a = (zr * zr) - (zi * zi)
|
||||
|
||||
// a = a + cr
|
||||
_a = _mm256_add_pd(_a, _cr); // a = ((zr * zr) - (zi * zi)) + cr
|
||||
|
||||
|
||||
|
||||
// b = zr * zi
|
||||
_b = _mm256_mul_pd(_zr, _zi); // b = zr * zi
|
||||
|
||||
// b = b * 2.0 + ci
|
||||
// b = b * |2.0|2.0|2.0|2.0| + ci
|
||||
_b = _mm256_fmadd_pd(_b, _two, _ci); // b = (zr * zi) * 2.0 + ci
|
||||
|
||||
// zr = a
|
||||
_zr = _a; // zr = a
|
||||
|
||||
// zi = b
|
||||
_zi = _b; // zr = b
|
||||
|
||||
|
||||
|
||||
// Normal: while (abs(z) < 2.0 && n < iterations)
|
||||
// Manual: while ((zr * zr + zi * zi) < 4.0 && n < iterations)
|
||||
|
||||
|
||||
// a = zr^2 + zi^2
|
||||
_a = _mm256_add_pd(_zr2, _zi2); // a = (zr * zr) + (zi * zi)
|
||||
|
||||
// m1 = if (a < 4.0)
|
||||
// m1 = |if(a[3] < 4.0)|if(a[2] < 4.0)|if(a[1] < 4.0)|if(a[0] < 4.0)|
|
||||
// m1 = |111111...11111|000000...00000|111111...11111|000000...00000|
|
||||
// m1 = |11...11|00...00|11...11|00...00| <- Shortened to reduce typing :P
|
||||
_mask1 = _mm256_cmp_pd(_a, _four, _CMP_LT_OQ);
|
||||
|
||||
// m2 = if (iterations > n)
|
||||
// m2 = |00...00|11...11|11...11|00...00|
|
||||
_mask2 = _mm256_cmpgt_epi64(_iterations, _n);
|
||||
|
||||
// m2 = m2 AND m1 = if(a < 4.0 && iterations > n)
|
||||
//
|
||||
// m2 = |00...00|11...11|11...11|00...00|
|
||||
// m1 = AND|11...11|00...00|11...11|00...00|
|
||||
// m2 = |00...00|00...00|11...11|00...00|
|
||||
_mask2 = _mm256_and_si256(_mask2, _mm256_castpd_si256(_mask1)); |
||||
|
||||
// c = |(int)1|(int)1|(int)1|(int)1| AND m2
|
||||
//
|
||||
// c = |00...01|00...01|00...01|00...01|
|
||||
// m2 = AND|00...00|00...00|11...11|00...00|
|
||||
// c = |00...00|00...00|00...01|00...00|
|
||||
//
|
||||
// c = |(int)0|(int)0|(int)1|(int)0|
|
||||
_c = _mm256_and_si256(_one, _mask2);
|
||||
|
||||
// n = n + c
|
||||
// n = |00...24|00...13|00...08|00...21|
|
||||
// c = +|00...00|00...00|00...01|00...00|
|
||||
// n = |00...24|00...13|00...09|00...21| (Increment only applied to 'enabled' element)
|
||||
_n = _mm256_add_epi64(_n, _c); |
||||
|
||||
// if ((zr * zr + zi * zi) < 4.0 && n < iterations) goto repeat
|
||||
// i.e. if our mask has any elements that are 1
|
||||
// |00...00|00...00|11...11|00...00|
|
||||
// | 0 | 0 | 1 | 0 | = 0b0010 = 2
|
||||
// so... if (2 > 0) goto repeat
|
||||
if (_mm256_movemask_pd(_mm256_castsi256_pd(_mask2)) > 0) |
||||
goto repeat; |
||||
|
||||
// Tight loop has finished, all 4 pixels have been evaluated. Increment
|
||||
// fractal space x positions for next 4 pixels
|
||||
// x_pos = x_pos + x_jump
|
||||
_x_pos = _mm256_add_pd(_x_pos, _x_jump); |
||||
|
||||
// Unpack our 4x64-bit Integer Vector into normal 32-bit Integers
|
||||
// and write into memory at correct location. Note, depending on
|
||||
// how you structure the memory, and the types you use, this step
|
||||
// may not be required. If I was working with 64-bit integers I
|
||||
// could choose to just write the vector entirely, saving this
|
||||
// truncation at the expense of 2x the memory required
|
||||
|
||||
#if defined(__linux__) |
||||
// Intrinsics are not cross platform!
|
||||
pFractal[y_offset + x + 0] = int(_n[3]); |
||||
pFractal[y_offset + x + 1] = int(_n[2]); |
||||
pFractal[y_offset + x + 2] = int(_n[1]); |
||||
pFractal[y_offset + x + 3] = int(_n[0]); |
||||
#endif |
||||
|
||||
#if defined(_WIN32) |
||||
pFractal[y_offset + x + 0] = int(_n.m256i_i64[3]); |
||||
pFractal[y_offset + x + 1] = int(_n.m256i_i64[2]); |
||||
pFractal[y_offset + x + 2] = int(_n.m256i_i64[1]); |
||||
pFractal[y_offset + x + 3] = int(_n.m256i_i64[0]); |
||||
#endif |
||||
|
||||
|
||||
} |
||||
|
||||
y_pos += y_scale; |
||||
y_offset += row_size; |
||||
} |
||||
} |
Loading…
Reference in new issue