diff --git a/Videos/OneLoneCoder_VIDEO_IntrinsicFunctions.cpp b/Videos/OneLoneCoder_VIDEO_IntrinsicFunctions.cpp
new file mode 100644
index 0000000..fcb90c3
--- /dev/null
+++ b/Videos/OneLoneCoder_VIDEO_IntrinsicFunctions.cpp
@@ -0,0 +1,241 @@
+/*
+	Vector Extensions - Mandelbrot
+	"Successfully programmed the dishwasher and debugged the pump this week..." - javidx9
+
+	License (OLC-3)
+	~~~~~~~~~~~~~~~
+
+	Copyright 2018-2020 OneLoneCoder.com
+
+	Redistribution and use in source and binary forms, with or without
+	modification, are permitted provided that the following conditions
+	are met:
+
+	1. Redistributions or derivations of source code must retain the above
+	copyright notice, this list of conditions and the following disclaimer.
+
+	2. Redistributions or derivative works in binary form must reproduce
+	the above copyright notice. This list of conditions and the following
+	disclaimer must be reproduced in the documentation and/or other
+	materials provided with the distribution.
+
+	3. Neither the name of the copyright holder nor the names of its
+	contributors may be used to endorse or promote products derived
+	from this software without specific prior written permission.
+
+	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+	Relevant Video: https://youtu.be/x9Scb5Mku1g
+
+	Links
+	~~~~~
+	YouTube:	https://www.youtube.com/javidx9
+				https://www.youtube.com/javidx9extra
+	Discord:	https://discord.gg/WhwHUMV
+	Twitter:	https://www.twitter.com/javidx9
+	Twitch:		https://www.twitch.tv/javidx9
+	GitHub:		https://www.github.com/onelonecoder
+	Patreon:	https://www.patreon.com/javidx9
+	Homepage:	https://www.onelonecoder.com
+
+	Community Blog: https://community.onelonecoder.com
+
+	Author
+	~~~~~~
+	David Barr, aka javidx9, © OneLoneCoder 2018, 2019, 2020
+*/
+
+// NOTE: THIS PROGRAM CANNOT BE EXECUTED - IT IS INTENDED AS A GUIDE
+// TO THIS VIDEO: https://youtu.be/x9Scb5Mku1g
+
+
+// Method 4) - Use AVX2 Vector co-processor to handle 4 fractal locations at once
+void CreateFractalIntrinsics(const olc::vi2d& pix_tl, const olc::vi2d& pix_br, const olc::vd2d& frac_tl, const olc::vd2d& frac_br, const int iterations)
+{
+	double x_scale = (frac_br.x - frac_tl.x) / (double(pix_br.x) - double(pix_tl.x));
+	double y_scale = (frac_br.y - frac_tl.y) / (double(pix_br.y) - double(pix_tl.y));
+
+	double y_pos = frac_tl.y;
+
+	int y_offset = 0;
+	int row_size = ScreenWidth();
+
+	int x, y;
+
+	// 64-bit "double" registers
+	__m256d _a, _b, _two, _four, _mask1;
+	__m256d _zr, _zi, _zr2, _zi2, _cr, _ci;
+	__m256d _x_pos_offsets, _x_pos, _x_scale, _x_jump;
+
+	// 64-bit "integer" registers
+	__m256i _one, _c, _n, _iterations, _mask2;
+
+	// Expand constants into vectors of constants
+	// one = |(int)1|(int)1|(int)1|(int)1|
+	_one = _mm256_set1_epi64x(1);
+
+	// two = |2.0|2.0|2.0|2.0|
+	_two = _mm256_set1_pd(2.0);
+
+	// four = |4.0|4.0|4.0|4.0|
+	_four = _mm256_set1_pd(4.0);
+
+	// iterations = |iterations|iterations|iterations|iterations|
+	_iterations = _mm256_set1_epi64x(iterations);
+
+	_x_scale = _mm256_set1_pd(x_scale);
+	_x_jump = _mm256_set1_pd(x_scale * 4);
+	// Note: _mm256_set_pd() takes elements highest lane first, so lane 3
+	// holds offset 0 - this matches the write-back order at the bottom
+	_x_pos_offsets = _mm256_set_pd(0, 1, 2, 3);
+	_x_pos_offsets = _mm256_mul_pd(_x_pos_offsets, _x_scale);
+
+
+	for (y = pix_tl.y; y < pix_br.y; y++)
+	{
+		// Reset x_position
+		_a = _mm256_set1_pd(frac_tl.x);
+		_x_pos = _mm256_add_pd(_a, _x_pos_offsets);
+
+		_ci = _mm256_set1_pd(y_pos);
+
+		for (x = pix_tl.x; x < pix_br.x; x += 4)
+		{
+			_cr = _x_pos;
+
+			// Zreal = 0
+			_zr = _mm256_setzero_pd();
+
+			// Zimag = 0
+			_zi = _mm256_setzero_pd();
+
+			// nIterations = 0
+			_n = _mm256_setzero_si256();
+
+
+		repeat:
+			// Normal: z = (z * z) + c;
+			// Manual: a = zr * zr - zi * zi + cr;
+			//         b = zr * zi * 2.0 + ci;
+			//         zr = a;
+			//         zi = b;
+			// (because (zr + i*zi)^2 + (cr + i*ci) = (zr*zr - zi*zi + cr) + i*(2*zr*zi + ci))
+
+
+			// zr^2 = zr * zr
+			_zr2 = _mm256_mul_pd(_zr, _zr);			// zr * zr
+
+			// zi^2 = zi * zi
+			_zi2 = _mm256_mul_pd(_zi, _zi);			// zi * zi
+
+			// a = zr^2 - zi^2
+			_a = _mm256_sub_pd(_zr2, _zi2);			// a = (zr * zr) - (zi * zi)
+
+			// a = a + cr
+			_a = _mm256_add_pd(_a, _cr);			// a = ((zr * zr) - (zi * zi)) + cr
+
+
+
+			// b = zr * zi
+			_b = _mm256_mul_pd(_zr, _zi);			// b = zr * zi
+
+			// b = b * 2.0 + ci
+			// b = b * |2.0|2.0|2.0|2.0| + ci
+			_b = _mm256_fmadd_pd(_b, _two, _ci);	// b = (zr * zi) * 2.0 + ci
+
+			// zr = a
+			_zr = _a;								// zr = a
+
+			// zi = b
+			_zi = _b;								// zi = b
+
+
+
+			// Normal: while (abs(z) < 2.0 && n < iterations)
+			// Manual: while ((zr * zr + zi * zi) < 4.0 && n < iterations)
+
+
+			// a = zr^2 + zi^2 (note: the squares from before this iteration's update)
+			_a = _mm256_add_pd(_zr2, _zi2);			// a = (zr * zr) + (zi * zi)
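+
+			// For comparison, the scalar equivalent of the conditional update
+			// that follows would be (a sketch only, not part of the original
+			// listing):
+			//
+			//   if ((zr * zr + zi * zi) < 4.0 && n < iterations)
+			//   {
+			//       n++;          // only lanes passing the test advance
+			//       goto repeat;  // keep going while any lane is still active
+			//   }
+			//
+			// With vectors there is no per-lane branching, so each comparison
+			// below instead yields a bitmask per 64-bit lane (all 1s = true,
+			// all 0s = false). ANDing the two masks combines the conditions,
+			// and ANDing with integer 1 turns "true" lanes into an increment.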
+			// m1 = if (a < 4.0)
+			// m1 = |if(a[3] < 4.0)|if(a[2] < 4.0)|if(a[1] < 4.0)|if(a[0] < 4.0)|
+			// m1 = |111111...11111|000000...00000|111111...11111|000000...00000|
+			// m1 = |11...11|00...00|11...11|00...00| <- Shortened to reduce typing :P
+			_mask1 = _mm256_cmp_pd(_a, _four, _CMP_LT_OQ);
+
+			// m2 = if (iterations > n)
+			// m2 = |00...00|11...11|11...11|00...00|
+			_mask2 = _mm256_cmpgt_epi64(_iterations, _n);
+
+			// m2 = m2 AND m1 = if(a < 4.0 && iterations > n)
+			//
+			// m2 =    |00...00|11...11|11...11|00...00|
+			// m1 = AND|11...11|00...00|11...11|00...00|
+			// m2 =    |00...00|00...00|11...11|00...00|
+			_mask2 = _mm256_and_si256(_mask2, _mm256_castpd_si256(_mask1));
+
+			// c = |(int)1|(int)1|(int)1|(int)1| AND m2
+			//
+			// c =     |00...01|00...01|00...01|00...01|
+			// m2 = AND|00...00|00...00|11...11|00...00|
+			// c =     |00...00|00...00|00...01|00...00|
+			//
+			// c = |(int)0|(int)0|(int)1|(int)0|
+			_c = _mm256_and_si256(_one, _mask2);
+
+			// n = n + c
+			// n =  |00...24|00...13|00...08|00...21|
+			// c = +|00...00|00...00|00...01|00...00|
+			// n =  |00...24|00...13|00...09|00...21| (Increment only applied to 'enabled' element)
+			_n = _mm256_add_epi64(_n, _c);
+
+			// if ((zr * zr + zi * zi) < 4.0 && n < iterations) goto repeat
+			// i.e. if our mask has any elements that are 1
+			// |00...00|00...00|11...11|00...00|
+			// |   0   |   0   |   1   |   0   | = 0b0010 = 2
+			// so... if (2 > 0) goto repeat
+			if (_mm256_movemask_pd(_mm256_castsi256_pd(_mask2)) > 0)
+				goto repeat;
+
+			// Tight loop has finished, all 4 pixels have been evaluated. Increment
+			// fractal space x positions for next 4 pixels
+			// x_pos = x_pos + x_jump
+			_x_pos = _mm256_add_pd(_x_pos, _x_jump);
+
+			// Unpack our 4x64-bit Integer Vector into normal 32-bit Integers
+			// and write into memory at the correct location. Note, depending on
+			// how you structure the memory, and the types you use, this step
+			// may not be required. If I was working with 64-bit integers I
+			// could choose to just write the vector entirely, saving this
+			// truncation at the expense of 2x the memory required
+
+			#if defined(__linux__)
+			// Intrinsics are not cross platform! Subscripting a __m256i is a
+			// GCC/Clang extension; MSVC exposes the .m256i_i64 member instead
+			pFractal[y_offset + x + 0] = int(_n[3]);
+			pFractal[y_offset + x + 1] = int(_n[2]);
+			pFractal[y_offset + x + 2] = int(_n[1]);
+			pFractal[y_offset + x + 3] = int(_n[0]);
+			#endif
+
+			#if defined(_WIN32)
+			pFractal[y_offset + x + 0] = int(_n.m256i_i64[3]);
+			pFractal[y_offset + x + 1] = int(_n.m256i_i64[2]);
+			pFractal[y_offset + x + 2] = int(_n.m256i_i64[1]);
+			pFractal[y_offset + x + 3] = int(_n.m256i_i64[0]);
+			#endif
+
+
+		}
+
+		y_pos += y_scale;
+		y_offset += row_size;
+	}
+}
\ No newline at end of file
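
For context, each of the four vector lanes above computes one pixel of the plain scalar loop. A minimal scalar sketch is shown below; it is not part of the diff, and it assumes the same pFractal buffer, ScreenWidth() helper, and olc::vi2d/olc::vd2d types (from the olcPixelGameEngine) that the listing uses. Building the vectorized version would additionally need #include <immintrin.h> and AVX2/FMA support enabled, e.g. -mavx2 -mfma on GCC/Clang or /arch:AVX2 on MSVC.

// A minimal scalar sketch of what CreateFractalIntrinsics computes, one
// pixel at a time (assumes pFractal and ScreenWidth() as in the listing)
void CreateFractalScalar(const olc::vi2d& pix_tl, const olc::vi2d& pix_br,
	const olc::vd2d& frac_tl, const olc::vd2d& frac_br, const int iterations)
{
	double x_scale = (frac_br.x - frac_tl.x) / (double(pix_br.x) - double(pix_tl.x));
	double y_scale = (frac_br.y - frac_tl.y) / (double(pix_br.y) - double(pix_tl.y));

	double y_pos = frac_tl.y;
	int y_offset = 0;
	int row_size = ScreenWidth();

	for (int y = pix_tl.y; y < pix_br.y; y++)
	{
		double x_pos = frac_tl.x;
		for (int x = pix_tl.x; x < pix_br.x; x++)
		{
			double cr = x_pos, ci = y_pos;	// c = this pixel in fractal space
			double zr = 0.0, zi = 0.0;		// z = 0
			int n = 0;

			// Iterate z = z*z + c until z escapes the radius-2 circle
			// or the iteration budget runs out
			while ((zr * zr + zi * zi) < 4.0 && n < iterations)
			{
				double a = zr * zr - zi * zi + cr;
				double b = 2.0 * zr * zi + ci;
				zr = a;
				zi = b;
				n++;
			}

			pFractal[y_offset + x] = n;
			x_pos += x_scale;
		}

		y_pos += y_scale;
		y_offset += row_size;
	}
}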