From 0755ce7628427ce2859af5b417c58419a3d9531d Mon Sep 17 00:00:00 2001
From: Javidx9 <25419386+OneLoneCoder@users.noreply.github.com>
Date: Sat, 30 May 2020 22:05:36 +0100
Subject: [PATCH] Added Intrinsics Video

---
 .../OneLoneCoder_VIDEO_IntrinsicFunctions.cpp | 241 ++++++++++++++++++
 1 file changed, 241 insertions(+)
 create mode 100644 Videos/OneLoneCoder_VIDEO_IntrinsicFunctions.cpp

diff --git a/Videos/OneLoneCoder_VIDEO_IntrinsicFunctions.cpp b/Videos/OneLoneCoder_VIDEO_IntrinsicFunctions.cpp
new file mode 100644
index 0000000..fcb90c3
--- /dev/null
+++ b/Videos/OneLoneCoder_VIDEO_IntrinsicFunctions.cpp
@@ -0,0 +1,241 @@
+/*
+	Vector Extensions - Mandelbrot
+	"Successfully programmed the dishwasher and debugged the pump this week..." - javidx9
+
+	License (OLC-3)
+	~~~~~~~~~~~~~~~
+
+	Copyright 2018-2020 OneLoneCoder.com
+
+	Redistribution and use in source and binary forms, with or without
+	modification, are permitted provided that the following conditions
+	are met:
+
+	1. Redistributions or derivations of source code must retain the above
+	copyright notice, this list of conditions and the following disclaimer.
+
+	2. Redistributions or derivative works in binary form must reproduce
+	the above copyright notice. This list of conditions and the following
+	disclaimer must be reproduced in the documentation and/or other
+	materials provided with the distribution.
+
+	3. Neither the name of the copyright holder nor the names of its
+	contributors may be used to endorse or promote products derived
+	from this software without specific prior written permission.
+
+	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+	Relevant Video: https://youtu.be/x9Scb5Mku1g
+
+	Links
+	~~~~~
+	YouTube:	https://www.youtube.com/javidx9
+				https://www.youtube.com/javidx9extra
+	Discord:	https://discord.gg/WhwHUMV
+	Twitter:	https://www.twitter.com/javidx9
+	Twitch:		https://www.twitch.tv/javidx9
+	GitHub:		https://www.github.com/onelonecoder
+	Patreon:	https://www.patreon.com/javidx9
+	Homepage:	https://www.onelonecoder.com
+
+	Community Blog: https://community.onelonecoder.com
+
+	Author
+	~~~~~~
+	David Barr, aka javidx9, ©OneLoneCoder 2018, 2019, 2020
+*/
+
+// NOTE: THIS PROGRAM CANNOT BE EXECUTED - IT IS INTENDED AS A GUIDE
+// TO THIS VIDEO: https://youtu.be/x9Scb5Mku1g
+
+
+// Method 4) - Use AVX2 Vector co-processor to handle 4 fractal locations at once
+void CreateFractalIntrinsics(const olc::vi2d& pix_tl, const olc::vi2d& pix_br, const olc::vd2d& frac_tl, const olc::vd2d& frac_br, const int iterations)
+{
+	double x_scale = (frac_br.x - frac_tl.x) / (double(pix_br.x) - double(pix_tl.x));
+	double y_scale = (frac_br.y - frac_tl.y) / (double(pix_br.y) - double(pix_tl.y));
+
+	double y_pos = frac_tl.y;
+
+	int y_offset = 0;
+	int row_size = ScreenWidth();
+
+	int x, y;
+
+	// 64-bit "double" registers
+	__m256d _a, _b, _two, _four, _mask1;
+	__m256d _zr, _zi, _zr2, _zi2, _cr, _ci;
+	__m256d _x_pos_offsets, _x_pos, _x_scale, _x_jump;
+
+	// 64-bit "integer" registers
+	__m256i _one, _c, _n, _iterations, _mask2;
+
+	// Expand constants into vectors of constants
+	// one = |(int)1|(int)1|(int)1|(int)1|
+	_one = _mm256_set1_epi64x(1);
+
+	// two = |2.0|2.0|2.0|2.0|
+	_two = _mm256_set1_pd(2.0);
+
+	// four = |4.0|4.0|4.0|4.0|
+	_four = _mm256_set1_pd(4.0);
+
+	// iterations = |iterations|iterations|iterations|iterations|
+	_iterations = _mm256_set1_epi64x(iterations);
+
+	_x_scale = _mm256_set1_pd(x_scale);
+	_x_jump = _mm256_set1_pd(x_scale * 4);
+	_x_pos_offsets = _mm256_set_pd(0, 1, 2, 3);
+	_x_pos_offsets = _mm256_mul_pd(_x_pos_offsets, _x_scale);
+
+
+	for (y = pix_tl.y; y < pix_br.y; y++)
+	{
+		// Reset x_position
+		_a = _mm256_set1_pd(frac_tl.x);
+		_x_pos = _mm256_add_pd(_a, _x_pos_offsets);
+
+		_ci = _mm256_set1_pd(y_pos);
+
+		for (x = pix_tl.x; x < pix_br.x; x += 4)
+		{
+			_cr = _x_pos;
+
+			// Zreal = 0
+			_zr = _mm256_setzero_pd();
+
+			// Zimag = 0
+			_zi = _mm256_setzero_pd();
+
+			// nIterations = 0
+			_n = _mm256_setzero_si256();
+
+
+		repeat:
+			// Normal: z = (z * z) + c;
+			// Manual: a = zr * zr - zi * zi + cr;
+			//         b = zr * zi * 2.0 + ci;
+			//         zr = a;
+			//         zi = b;
+
+
+			// zr^2 = zr * zr
+			_zr2 = _mm256_mul_pd(_zr, _zr);			// zr * zr
+
+			// zi^2 = zi * zi
+			_zi2 = _mm256_mul_pd(_zi, _zi);			// zi * zi
+
+			// a = zr^2 - zi^2
+			_a = _mm256_sub_pd(_zr2, _zi2);			// a = (zr * zr) - (zi * zi)
+
+			// a = a + cr
+			_a = _mm256_add_pd(_a, _cr);			// a = ((zr * zr) - (zi * zi)) + cr
+
+
+			// b = zr * zi
+			_b = _mm256_mul_pd(_zr, _zi);			// b = zr * zi
+
+			// b = b * 2.0 + ci
+			// b = b * |2.0|2.0|2.0|2.0| + ci
+			_b = _mm256_fmadd_pd(_b, _two, _ci);	// b = (zr * zi) * 2.0 + ci
+
+			// zr = a
+			_zr = _a;								// zr = a
+
+			// zi = b
+			_zi = _b;								// zi = b
+
+
+			// Normal: while (abs(z) < 2.0 && n < iterations)
+			// Manual: while ((zr * zr + zi * zi) < 4.0 && n < iterations)
+
+
+			// a = zr^2 + zi^2
+			_a = _mm256_add_pd(_zr2, _zi2);			// a = (zr * zr) + (zi * zi)
+
+			// m1 = if (a < 4.0)
+			// m1 = |if(a[3] < 4.0)|if(a[2] < 4.0)|if(a[1] < 4.0)|if(a[0] < 4.0)|
+			// m1 = |111111...11111|000000...00000|111111...11111|000000...00000|
+			// m1 = |11...11|00...00|11...11|00...00| <- Shortened to reduce typing :P
+			_mask1 = _mm256_cmp_pd(_a, _four, _CMP_LT_OQ);
+
+			// m2 = if (iterations > n)
+			// m2 = |00...00|11...11|11...11|00...00|
+			_mask2 = _mm256_cmpgt_epi64(_iterations, _n);
+
+			// m2 = m2 AND m1 = if(a < 4.0 && iterations > n)
+			//
+			// m2 =    |00...00|11...11|11...11|00...00|
+			// m1 = AND|11...11|00...00|11...11|00...00|
+			// m2 =    |00...00|00...00|11...11|00...00|
+			_mask2 = _mm256_and_si256(_mask2, _mm256_castpd_si256(_mask1));
+
+			// c = |(int)1|(int)1|(int)1|(int)1| AND m2
+			//
+			// c =     |00...01|00...01|00...01|00...01|
+			// m2 = AND|00...00|00...00|11...11|00...00|
+			// c =     |00...00|00...00|00...01|00...00|
+			//
+			// c = |(int)0|(int)0|(int)1|(int)0|
+			_c = _mm256_and_si256(_one, _mask2);
+
+			// n = n + c
+			// n =  |00...24|00...13|00...08|00...21|
+			// c = +|00...00|00...00|00...01|00...00|
+			// n =  |00...24|00...13|00...09|00...21| (Increment only applied to 'enabled' element)
+			_n = _mm256_add_epi64(_n, _c);
+
+			// if ((zr * zr + zi * zi) < 4.0 && n < iterations) goto repeat
+			// i.e. if our mask has any elements that are 1
+			// |00...00|00...00|11...11|00...00|
+			// |   0   |   0   |   1   |   0   | = 0b0010 = 2
+			// so... if (2 > 0) goto repeat
+			if (_mm256_movemask_pd(_mm256_castsi256_pd(_mask2)) > 0)
+				goto repeat;
+
+			// Tight loop has finished, all 4 pixels have been evaluated. Increment
+			// fractal space x positions for next 4 pixels
+			// x_pos = x_pos + x_jump
+			_x_pos = _mm256_add_pd(_x_pos, _x_jump);
+
+			// Unpack our 4x64-bit Integer Vector into normal 32-bit Integers
+			// and write into memory at correct location. Note, depending on
+			// how you structure the memory, and the types you use, this step
+			// may not be required. If I was working with 64-bit integers I
+			// could choose to just write the vector entirely, saving this
+			// truncation at the expense of 2x the memory required
+
+			#if defined(__linux__)
+			// Intrinsics are not cross platform!
+			pFractal[y_offset + x + 0] = int(_n[3]);
+			pFractal[y_offset + x + 1] = int(_n[2]);
+			pFractal[y_offset + x + 2] = int(_n[1]);
+			pFractal[y_offset + x + 3] = int(_n[0]);
+			#endif
+
+			#if defined(_WIN32)
+			pFractal[y_offset + x + 0] = int(_n.m256i_i64[3]);
+			pFractal[y_offset + x + 1] = int(_n.m256i_i64[2]);
+			pFractal[y_offset + x + 2] = int(_n.m256i_i64[1]);
+			pFractal[y_offset + x + 3] = int(_n.m256i_i64[0]);
+			#endif
+
+
+		}
+
+		y_pos += y_scale;
+		y_offset += row_size;
+	}
+}
\ No newline at end of file
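
The routine in the patch depends on the surrounding olc engine code from the video (olc::vi2d, ScreenWidth(), the pFractal buffer), which is why the file is marked as a guide rather than a buildable program. The sketch below is not part of the patch: it is a minimal, self-contained illustration of the same AVX2 technique, assuming an AVX2 + FMA capable CPU and a build such as "g++ -O2 -mavx2 -mfma" (or /arch:AVX2 with MSVC). The function name ComputeRow, the std::vector<int> output buffer, and the values in main() are invented here for illustration only.

#include <immintrin.h>
#include <cstdio>
#include <vector>

// Iterate 4 Mandelbrot points at once for one row of 'width' pixels
// (width is assumed to be a multiple of 4).
static void ComputeRow(std::vector<int>& out, int width, double x_start,
                       double x_scale, double y_pos, int iterations)
{
	__m256d _two  = _mm256_set1_pd(2.0);
	__m256d _four = _mm256_set1_pd(4.0);
	__m256i _one  = _mm256_set1_epi64x(1);
	__m256i _iter = _mm256_set1_epi64x(iterations);

	// _mm256_set_pd takes its arguments highest element first, so element 3
	// holds offset 0 - this is why results are read back in reverse below
	__m256d _x_offsets = _mm256_mul_pd(_mm256_set_pd(0, 1, 2, 3), _mm256_set1_pd(x_scale));
	__m256d _x_jump    = _mm256_set1_pd(x_scale * 4);
	__m256d _x_pos     = _mm256_add_pd(_mm256_set1_pd(x_start), _x_offsets);
	__m256d _ci        = _mm256_set1_pd(y_pos);

	for (int x = 0; x < width; x += 4)
	{
		__m256d _cr = _x_pos;
		__m256d _zr = _mm256_setzero_pd();
		__m256d _zi = _mm256_setzero_pd();
		__m256i _n  = _mm256_setzero_si256();
		__m256i _mask2;

		do
		{
			// z = z*z + c, done component-wise on 4 points at once
			__m256d _zr2 = _mm256_mul_pd(_zr, _zr);
			__m256d _zi2 = _mm256_mul_pd(_zi, _zi);
			__m256d _a = _mm256_add_pd(_mm256_sub_pd(_zr2, _zi2), _cr);       // zr^2 - zi^2 + cr
			__m256d _b = _mm256_fmadd_pd(_mm256_mul_pd(_zr, _zi), _two, _ci); // 2*zr*zi + ci
			_zr = _a;
			_zi = _b;

			// A lane stays active only while |z|^2 < 4.0 and n < iterations
			__m256d _mask1 = _mm256_cmp_pd(_mm256_add_pd(_zr2, _zi2), _four, _CMP_LT_OQ);
			_mask2 = _mm256_and_si256(_mm256_cmpgt_epi64(_iter, _n),
			                          _mm256_castpd_si256(_mask1));

			// Increment n only in the lanes that are still active
			_n = _mm256_add_epi64(_n, _mm256_and_si256(_one, _mask2));
		} while (_mm256_movemask_pd(_mm256_castsi256_pd(_mask2)) > 0);

		// Store the 4 x 64-bit counts, truncate to int, and undo the lane reversal
		alignas(32) long long n[4];
		_mm256_store_si256((__m256i*)n, _n);
		out[x + 0] = int(n[3]);
		out[x + 1] = int(n[2]);
		out[x + 2] = int(n[1]);
		out[x + 3] = int(n[0]);

		_x_pos = _mm256_add_pd(_x_pos, _x_jump);
	}
}

int main()
{
	const int width = 64, iterations = 256;
	std::vector<int> row(width);
	// One row along the real axis of the classic view, x in [-2, 1]
	ComputeRow(row, width, -2.0, 3.0 / width, 0.0, iterations);
	for (int i = 0; i < width; i++) printf("%3d ", row[i]);
	printf("\n");
	return 0;
}

The store-and-truncate at the end mirrors the #if blocks in the patch in one portable form: _mm256_store_si256 works under GCC, Clang and MSVC, so neither the GCC array-subscript extension (_n[3]) nor MSVC's .m256i_i64 member is needed.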