241 lines
7.9 KiB
C++
241 lines
7.9 KiB
C++
/*
|
||
Vector Extensions - Mandelbrot
|
||
"Succesfully programmed the dishwasher and debugged the pump this week..." - javidx9
|
||
|
||
License (OLC-3)
|
||
~~~~~~~~~~~~~~~
|
||
|
||
Copyright 2018-2020 OneLoneCoder.com
|
||
|
||
Redistribution and use in source and binary forms, with or without
|
||
modification, are permitted provided that the following conditions
|
||
are met:
|
||
|
||
1. Redistributions or derivations of source code must retain the above
|
||
copyright notice, this list of conditions and the following disclaimer.
|
||
|
||
2. Redistributions or derivative works in binary form must reproduce
|
||
the above copyright notice. This list of conditions and the following
|
||
disclaimer must be reproduced in the documentation and/or other
|
||
materials provided with the distribution.
|
||
|
||
3. Neither the name of the copyright holder nor the names of its
|
||
contributors may be used to endorse or promote products derived
|
||
from this software without specific prior written permission.
|
||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
||
Relevant Video: https://youtu.be/x9Scb5Mku1g
|
||
|
||
Links
|
||
~~~~~
|
||
YouTube: https://www.youtube.com/javidx9
|
||
https://www.youtube.com/javidx9extra
|
||
Discord: https://discord.gg/WhwHUMV
|
||
Twitter: https://www.twitter.com/javidx9
|
||
Twitch: https://www.twitch.tv/javidx9
|
||
GitHub: https://www.github.com/onelonecoder
|
||
Patreon: https://www.patreon.com/javidx9
|
||
Homepage: https://www.onelonecoder.com
|
||
|
||
Community Blog: https://community.onelonecoder.com
|
||
|
||
Author
|
||
~~~~~~
|
||
David Barr, aka javidx9, <20>OneLoneCoder 2018, 2019, 2020
|
||
*/
|
||
|
||
// NOTE: THIS PROGRAM CANNOT BE EXCUTED - IT IS INTENDED AS A GUIDE
|
||
// TO THIS VIDEO: https://youtu.be/x9Scb5Mku1g
|
||
|
||
|
||
// Method 4) - Use AVX2 Vector co-processor to handle 4 fractal locations at once
|
||
void CreateFractalIntrinsics(const olc::vi2d& pix_tl, const olc::vi2d& pix_br, const olc::vd2d& frac_tl, const olc::vd2d& frac_br, const int iterations)
|
||
{
|
||
double x_scale = (frac_br.x - frac_tl.x) / (double(pix_br.x) - double(pix_tl.x));
|
||
double y_scale = (frac_br.y - frac_tl.y) / (double(pix_br.y) - double(pix_tl.y));
|
||
|
||
double y_pos = frac_tl.y;
|
||
|
||
int y_offset = 0;
|
||
int row_size = ScreenWidth();
|
||
|
||
int x, y;
|
||
|
||
// 64-bit "double" registers
|
||
__m256d _a, _b, _two, _four, _mask1;
|
||
__m256d _zr, _zi, _zr2, _zi2, _cr, _ci;
|
||
__m256d _x_pos_offsets, _x_pos, _x_scale, _x_jump;
|
||
|
||
// 64-bit "integer" registers
|
||
__m256i _one, _c, _n, _iterations, _mask2;
|
||
|
||
// Expand constants into vectors of constants
|
||
// one = |(int)1|(int)1|(int)1|(int)1|
|
||
_one = _mm256_set1_epi64x(1);
|
||
|
||
// two = |2.0|2.0|2.0|2.0|
|
||
_two = _mm256_set1_pd(2.0);
|
||
|
||
// four = |4.0|4.0|4.0|4.0|
|
||
_four = _mm256_set1_pd(4.0);
|
||
|
||
// iterations = |iterations|iterations|iterations|iterations|
|
||
_iterations = _mm256_set1_epi64x(iterations);
|
||
|
||
_x_scale = _mm256_set1_pd(x_scale);
|
||
_x_jump = _mm256_set1_pd(x_scale * 4);
|
||
_x_pos_offsets = _mm256_set_pd(0, 1, 2, 3);
|
||
_x_pos_offsets = _mm256_mul_pd(_x_pos_offsets, _x_scale);
|
||
|
||
|
||
for (y = pix_tl.y; y < pix_br.y; y++)
|
||
{
|
||
// Reset x_position
|
||
_a = _mm256_set1_pd(frac_tl.x);
|
||
_x_pos = _mm256_add_pd(_a, _x_pos_offsets);
|
||
|
||
_ci = _mm256_set1_pd(y_pos);
|
||
|
||
for (x = pix_tl.x; x < pix_br.x; x += 4)
|
||
{
|
||
_cr = _x_pos;
|
||
|
||
// Zreal = 0
|
||
_zr = _mm256_setzero_pd();
|
||
|
||
// Zimag = 0
|
||
_zi = _mm256_setzero_pd();
|
||
|
||
// nIterations = 0
|
||
_n = _mm256_setzero_si256();
|
||
|
||
|
||
repeat:
|
||
// Normal: z = (z * z) + c;
|
||
// Manual: a = zr * zr - zi * zi + cr;
|
||
// b = zr * zi * 2.0 + ci;
|
||
// zr = a;
|
||
// zi = b;
|
||
|
||
|
||
// zr^2 = zr * zr
|
||
_zr2 = _mm256_mul_pd(_zr, _zr); // zr * zr
|
||
|
||
// zi^2 = zi * zi
|
||
_zi2 = _mm256_mul_pd(_zi, _zi); // zi * zi
|
||
|
||
// a = zr^2 - zi^2
|
||
_a = _mm256_sub_pd(_zr2, _zi2); // a = (zr * zr) - (zi * zi)
|
||
|
||
// a = a + cr
|
||
_a = _mm256_add_pd(_a, _cr); // a = ((zr * zr) - (zi * zi)) + cr
|
||
|
||
|
||
|
||
// b = zr * zi
|
||
_b = _mm256_mul_pd(_zr, _zi); // b = zr * zi
|
||
|
||
// b = b * 2.0 + ci
|
||
// b = b * |2.0|2.0|2.0|2.0| + ci
|
||
_b = _mm256_fmadd_pd(_b, _two, _ci); // b = (zr * zi) * 2.0 + ci
|
||
|
||
// zr = a
|
||
_zr = _a; // zr = a
|
||
|
||
// zi = b
|
||
_zi = _b; // zr = b
|
||
|
||
|
||
|
||
// Normal: while (abs(z) < 2.0 && n < iterations)
|
||
// Manual: while ((zr * zr + zi * zi) < 4.0 && n < iterations)
|
||
|
||
|
||
// a = zr^2 + zi^2
|
||
_a = _mm256_add_pd(_zr2, _zi2); // a = (zr * zr) + (zi * zi)
|
||
|
||
// m1 = if (a < 4.0)
|
||
// m1 = |if(a[3] < 4.0)|if(a[2] < 4.0)|if(a[1] < 4.0)|if(a[0] < 4.0)|
|
||
// m1 = |111111...11111|000000...00000|111111...11111|000000...00000|
|
||
// m1 = |11...11|00...00|11...11|00...00| <- Shortened to reduce typing :P
|
||
_mask1 = _mm256_cmp_pd(_a, _four, _CMP_LT_OQ);
|
||
|
||
// m2 = if (iterations > n)
|
||
// m2 = |00...00|11...11|11...11|00...00|
|
||
_mask2 = _mm256_cmpgt_epi64(_iterations, _n);
|
||
|
||
// m2 = m2 AND m1 = if(a < 4.0 && iterations > n)
|
||
//
|
||
// m2 = |00...00|11...11|11...11|00...00|
|
||
// m1 = AND|11...11|00...00|11...11|00...00|
|
||
// m2 = |00...00|00...00|11...11|00...00|
|
||
_mask2 = _mm256_and_si256(_mask2, _mm256_castpd_si256(_mask1));
|
||
|
||
// c = |(int)1|(int)1|(int)1|(int)1| AND m2
|
||
//
|
||
// c = |00...01|00...01|00...01|00...01|
|
||
// m2 = AND|00...00|00...00|11...11|00...00|
|
||
// c = |00...00|00...00|00...01|00...00|
|
||
//
|
||
// c = |(int)0|(int)0|(int)1|(int)0|
|
||
_c = _mm256_and_si256(_one, _mask2);
|
||
|
||
// n = n + c
|
||
// n = |00...24|00...13|00...08|00...21|
|
||
// c = +|00...00|00...00|00...01|00...00|
|
||
// n = |00...24|00...13|00...09|00...21| (Increment only applied to 'enabled' element)
|
||
_n = _mm256_add_epi64(_n, _c);
|
||
|
||
// if ((zr * zr + zi * zi) < 4.0 && n < iterations) goto repeat
|
||
// i.e. if our mask has any elements that are 1
|
||
// |00...00|00...00|11...11|00...00|
|
||
// | 0 | 0 | 1 | 0 | = 0b0010 = 2
|
||
// so... if (2 > 0) goto repeat
|
||
if (_mm256_movemask_pd(_mm256_castsi256_pd(_mask2)) > 0)
|
||
goto repeat;
|
||
|
||
// Tight loop has finished, all 4 pixels have been evaluated. Increment
|
||
// fractal space x positions for next 4 pixels
|
||
// x_pos = x_pos + x_jump
|
||
_x_pos = _mm256_add_pd(_x_pos, _x_jump);
|
||
|
||
// Unpack our 4x64-bit Integer Vector into normal 32-bit Integers
|
||
// and write into memory at correct location. Note, depending on
|
||
// how you structure the memory, and the types you use, this step
|
||
// may not be required. If I was working with 64-bit integers I
|
||
// could choose to just write the vector entirely, saving this
|
||
// truncation at the expense of 2x the memory required
|
||
|
||
#if defined(__linux__)
|
||
// Intrinsics are not cross platform!
|
||
pFractal[y_offset + x + 0] = int(_n[3]);
|
||
pFractal[y_offset + x + 1] = int(_n[2]);
|
||
pFractal[y_offset + x + 2] = int(_n[1]);
|
||
pFractal[y_offset + x + 3] = int(_n[0]);
|
||
#endif
|
||
|
||
#if defined(_WIN32)
|
||
pFractal[y_offset + x + 0] = int(_n.m256i_i64[3]);
|
||
pFractal[y_offset + x + 1] = int(_n.m256i_i64[2]);
|
||
pFractal[y_offset + x + 2] = int(_n.m256i_i64[1]);
|
||
pFractal[y_offset + x + 3] = int(_n.m256i_i64[0]);
|
||
#endif
|
||
|
||
|
||
}
|
||
|
||
y_pos += y_scale;
|
||
y_offset += row_size;
|
||
}
|
||
} |