diff --git a/misc/dist/docker/README.md b/misc/dist/docker/README.md
index 7f10b46ad84..71aac8a77c4 100644
--- a/misc/dist/docker/README.md
+++ b/misc/dist/docker/README.md
@@ -1,40 +1,40 @@
-## A Docker image to build Linux, Windows and Android godot binaries.
-
-The main reason to write this, is to provide a simple way in all platforms to integrate external godot modules and build a custom version of godot.
-
-## usage
-1. Install docker on Linux or docker toolbox on Windows or Mac.
-2. Open a terminal on linux or "Docker Quickstart Terminal" on Windows or Mac.
-3. Run command:
-	- Linux: `cd`
-	- Windows: `cd /c/Users/YOUR_USERNAME`
-	- Mac: `cd /Users/YOUR_USERNAME`
-4. Get godot source code: `git clone https://github.com/godotengine/godot.git`
-5. Run command: `cd godot/tools/docker`
-6. Run command: `docker build -t godot .`(In Linux run Docker commands with `sudo` or add your user to docker group before run the Docker commands). The godot docker image will be build after a while.
-7. Run command:
-	- Linux: `docker run -it --name=godot-dev -v /home/YOUR_USERNAME/godot:/godot-dev/godot godot`
-	- Windows: `docker run -it --name=godot-dev -v /c/Users/YOUR_USERNAME/godot:/godot-dev/godot godot`
-	- Mac: `docker run -it --name=godot-dev -v /Users/YOUR_USERNAME/godot:/godot-dev/godot godot`
-	You are in the godot-dev container and /godot-dev directory now.
-8. Run `./install-android-tools` to download and install all android development tools.
-9. Run command: `source ~/.bashrc`
-10. Run command: `cd godot`
-11. Run command: `scons p=android target=release` to test everything is ok. You can set platform to x11, windows, android, haiku and server.
-
-After use and exit, you can use this environment again by open terminal and type commands: `docker start godot-dev && docker attach godot-dev`.
-
-### Windows and Mac stuffs:
-
-- Speed up compilation:
-	- Exit from container.
-	- Run command: `docker-machine stop`
-	- Open "Oracle VM VirtualBox".
-	- In settings of default VM increase CPU cores and RAM to suitable values.
-	- Run command: `docker-machine start`
-	- Run command: `docker start godot-dev && docker attach godot-dev`
-
-- ssh to VM(can be useful sometimes):
-	- `docker-machine ssh`
-	
-Check docker and boot2docker projects for more details.
+## A Docker image to build Linux, Windows and Android godot binaries.
+
+The main purpose of this image is to provide a simple way, on all platforms, to integrate external godot modules and build a custom version of godot.
+
+## usage
+1. Install docker on Linux or docker toolbox on Windows or Mac.
+2. Open a terminal on linux or "Docker Quickstart Terminal" on Windows or Mac.
+3. Run command:
+	- Linux: `cd`
+	- Windows: `cd /c/Users/YOUR_USERNAME`
+	- Mac: `cd /Users/YOUR_USERNAME`
+4. Get godot source code: `git clone https://github.com/godotengine/godot.git`
+5. Run command: `cd godot/misc/dist/docker`
+6. Run command: `docker build -t godot .` (On Linux, run Docker commands with `sudo`, or add your user to the docker group before running them.) The godot docker image will be built after a while.
+7. Run command:
+	- Linux: `docker run -it --name=godot-dev -v /home/YOUR_USERNAME/godot:/godot-dev/godot godot`
+	- Windows: `docker run -it --name=godot-dev -v /c/Users/YOUR_USERNAME/godot:/godot-dev/godot godot`
+	- Mac: `docker run -it --name=godot-dev -v /Users/YOUR_USERNAME/godot:/godot-dev/godot godot`
+	You are in the godot-dev container and /godot-dev directory now.
+8. Run `./install-android-tools` to download and install all android development tools.
+9. Run command: `source ~/.bashrc`
+10. Run command: `cd godot`
+11. Run command: `scons p=android target=release` to test that everything works. The platform can be set to x11, windows, android, haiku or server.
+
+After exiting the container, you can use this environment again by opening a terminal and running: `docker start godot-dev && docker attach godot-dev`.
+
+### Windows and Mac notes:
+
+- Speed up compilation:
+	- Exit from container.
+	- Run command: `docker-machine stop`
+	- Open "Oracle VM VirtualBox".
+	- In settings of default VM increase CPU cores and RAM to suitable values.
+	- Run command: `docker-machine start`
+	- Run command: `docker start godot-dev && docker attach godot-dev`
+
+- ssh to the VM (can be useful sometimes):
+	- `docker-machine ssh`
+
+Check docker and boot2docker projects for more details.
diff --git a/misc/dist/uwp_template/AppxManifest.xml b/misc/dist/uwp_template/AppxManifest.xml
index d5e653708c5..cf26387f22c 100644
--- a/misc/dist/uwp_template/AppxManifest.xml
+++ b/misc/dist/uwp_template/AppxManifest.xml
@@ -1,32 +1,32 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Package xmlns="http://schemas.microsoft.com/appx/manifest/foundation/windows10" xmlns:mp="http://schemas.microsoft.com/appx/2014/phone/manifest" xmlns:uap="http://schemas.microsoft.com/appx/manifest/uap/windows10" IgnorableNamespaces="uap mp build" xmlns:build="http://schemas.microsoft.com/developer/appx/2015/build">
-  <Identity Name="$identity_name$" Publisher="$publisher$" Version="$version_string$" ProcessorArchitecture="$architecture$" />
-  <mp:PhoneIdentity PhoneProductId="$product_guid$" PhonePublisherId="$publisher_guid$" />
-  <Properties>
-    <DisplayName>$display_name$</DisplayName>
-    <PublisherDisplayName>$publisher_display_name$</PublisherDisplayName>
-    <Logo>Assets\StoreLogo.png</Logo>
-  </Properties>
-  <Dependencies>
-    <TargetDeviceFamily Name="Windows.Universal" MinVersion="10.0.10240.0" MaxVersionTested="10.0.14393.0" />
-    <PackageDependency Name="Microsoft.VCLibs.140.00" MinVersion="14.0.24123.0" Publisher="CN=Microsoft Corporation, O=Microsoft Corporation, L=Redmond, S=Washington, C=US" />
-  </Dependencies>
-  <Resources>
-    <Resource Language="EN-US" />
-  </Resources>
-  <Applications>
-    <Application Id="App" Executable="godot.uwp.exe" EntryPoint="GodotUWP.App">
-      <uap:VisualElements DisplayName="$display_name$" Square150x150Logo="Assets\Square150x150Logo.png" Square44x44Logo="Assets\Square44x44Logo.png" Description="$app_description$" BackgroundColor="$bg_color$">
-        <uap:DefaultTile Wide310x150Logo="Assets\Wide310x150Logo.png" Square310x310Logo="Assets\Square310x310Logo.png" Square71x71Logo="Assets\Square71x71Logo.png" ShortName="$short_name$">
-		  $name_on_tiles$
-        </uap:DefaultTile>
-        <uap:SplashScreen Image="Assets\SplashScreen.png" />
-		$rotation_preference$
-      </uap:VisualElements>
-    </Application>
-  </Applications>
-  $capabilities_place$
-  <build:Metadata>
-    <build:Item Name="GodotEngine" Version="$godot_version$" />
-  </build:Metadata>
+<?xml version="1.0" encoding="utf-8"?>
+<Package xmlns="http://schemas.microsoft.com/appx/manifest/foundation/windows10" xmlns:mp="http://schemas.microsoft.com/appx/2014/phone/manifest" xmlns:uap="http://schemas.microsoft.com/appx/manifest/uap/windows10" IgnorableNamespaces="uap mp build" xmlns:build="http://schemas.microsoft.com/developer/appx/2015/build">
+  <Identity Name="$identity_name$" Publisher="$publisher$" Version="$version_string$" ProcessorArchitecture="$architecture$" />
+  <mp:PhoneIdentity PhoneProductId="$product_guid$" PhonePublisherId="$publisher_guid$" />
+  <Properties>
+    <DisplayName>$display_name$</DisplayName>
+    <PublisherDisplayName>$publisher_display_name$</PublisherDisplayName>
+    <Logo>Assets\StoreLogo.png</Logo>
+  </Properties>
+  <Dependencies>
+    <TargetDeviceFamily Name="Windows.Universal" MinVersion="10.0.10240.0" MaxVersionTested="10.0.14393.0" />
+    <PackageDependency Name="Microsoft.VCLibs.140.00" MinVersion="14.0.24123.0" Publisher="CN=Microsoft Corporation, O=Microsoft Corporation, L=Redmond, S=Washington, C=US" />
+  </Dependencies>
+  <Resources>
+    <Resource Language="EN-US" />
+  </Resources>
+  <Applications>
+    <Application Id="App" Executable="godot.uwp.exe" EntryPoint="GodotUWP.App">
+      <uap:VisualElements DisplayName="$display_name$" Square150x150Logo="Assets\Square150x150Logo.png" Square44x44Logo="Assets\Square44x44Logo.png" Description="$app_description$" BackgroundColor="$bg_color$">
+        <uap:DefaultTile Wide310x150Logo="Assets\Wide310x150Logo.png" Square310x310Logo="Assets\Square310x310Logo.png" Square71x71Logo="Assets\Square71x71Logo.png" ShortName="$short_name$">
+		  $name_on_tiles$
+        </uap:DefaultTile>
+        <uap:SplashScreen Image="Assets\SplashScreen.png" />
+		$rotation_preference$
+      </uap:VisualElements>
+    </Application>
+  </Applications>
+  $capabilities_place$
+  <build:Metadata>
+    <build:Item Name="GodotEngine" Version="$godot_version$" />
+  </build:Metadata>
 </Package>
\ No newline at end of file
diff --git a/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln b/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln
index 7eabcdff5db..5f7d0e8a39f 100644
--- a/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln
+++ b/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln
@@ -1,17 +1,17 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 2012
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GodotSharpTools", "GodotSharpTools.csproj", "{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Any CPU = Debug|Any CPU
-		Release|Any CPU = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.Build.0 = Release|Any CPU
-	EndGlobalSection
-EndGlobal
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GodotSharpTools", "GodotSharpTools.csproj", "{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.Build.0 = Release|Any CPU
+	EndGlobalSection
+EndGlobal
diff --git a/modules/mono/glue/cs_files/Color.cs b/modules/mono/glue/cs_files/Color.cs
index df88a46832d..0a00f83d473 100644
--- a/modules/mono/glue/cs_files/Color.cs
+++ b/modules/mono/glue/cs_files/Color.cs
@@ -1,590 +1,590 @@
-using System;
-
-namespace Godot
-{
-    public struct Color : IEquatable<Color>
-    {
-        public float r;
-        public float g;
-        public float b;
-        public float a;
-
-        public int r8
-        {
-            get
-            {
-                return (int)(r * 255.0f);
-            }
-        }
-
-        public int g8
-        {
-            get
-            {
-                return (int)(g * 255.0f);
-            }
-        }
-
-        public int b8
-        {
-            get
-            {
-                return (int)(b * 255.0f);
-            }
-        }
-
-        public int a8
-        {
-            get
-            {
-                return (int)(a * 255.0f);
-            }
-        }
-
-        public float h
-        {
-            get
-            {
-                float max = Mathf.max(r, Mathf.max(g, b));
-                float min = Mathf.min(r, Mathf.min(g, b));
-
-                float delta = max - min;
-
-                if (delta == 0)
-                    return 0;
-
-                float h;
-
-                if (r == max)
-                    h = (g - b) / delta; // Between yellow & magenta
-                else if (g == max)
-                    h = 2 + (b - r) / delta; // Between cyan & yellow
-                else
-                    h = 4 + (r - g) / delta; // Between magenta & cyan
-
-                h /= 6.0f;
-
-                if (h < 0)
-                    h += 1.0f;
-
-                return h;
-            }
-            set
-            {
-                this = from_hsv(value, s, v);
-            }
-        }
-
-        public float s
-        {
-            get
-            {
-                float max = Mathf.max(r, Mathf.max(g, b));
-                float min = Mathf.min(r, Mathf.min(g, b));
-
-                float delta = max - min;
-
-                return max != 0 ? delta / max : 0;
-            }
-            set
-            {
-                this = from_hsv(h, value, v);
-            }
-        }
-
-        public float v
-        {
-            get
-            {
-                return Mathf.max(r, Mathf.max(g, b));
-            }
-            set
-            {
-                this = from_hsv(h, s, value);
-            }
-        }
-
-        private static readonly Color black = new Color(0f, 0f, 0f, 1.0f);
-
-        public Color Black
-        {
-            get
-            {
-                return black;
-            }
-        }
-
-        public float this [int index]
-        {
-            get
-            {
-                switch (index)
-                {
-                    case 0:
-                        return r;
-                    case 1:
-                        return g;
-                    case 2:
-                        return b;
-                    case 3:
-                        return a;
-                    default:
-                        throw new IndexOutOfRangeException();
-                }
-            }
-            set
-            {
-                switch (index)
-                {
-                    case 0:
-                        r = value;
-                        return;
-                    case 1:
-                        g = value;
-                        return;
-                    case 2:
-                        b = value;
-                        return;
-                    case 3:
-                        a = value;
-                        return;
-                    default:
-                        throw new IndexOutOfRangeException();
-                }
-            }
-        }
-
-        public static void to_hsv(Color color, out float hue, out float saturation, out float value)
-        {
-            int max = Mathf.max(color.r8, Mathf.max(color.g8, color.b8));
-            int min = Mathf.min(color.r8, Mathf.min(color.g8, color.b8));
-
-            float delta = max - min;
-
-            if (delta == 0)
-            {
-                hue = 0;
-            }
-            else
-            {
-                if (color.r == max)
-                    hue = (color.g - color.b) / delta; // Between yellow & magenta
-                else if (color.g == max)
-                    hue = 2 + (color.b - color.r) / delta; // Between cyan & yellow
-                else
-                    hue = 4 + (color.r - color.g) / delta; // Between magenta & cyan
-
-                hue /= 6.0f;
-
-                if (hue < 0)
-                    hue += 1.0f;
-            }
-
-            saturation = (max == 0) ? 0 : 1f - (1f * min / max);
-            value = max / 255f;
-        }
-
-        public static Color from_hsv(float hue, float saturation, float value, float alpha = 1.0f)
-        {
-            if (saturation == 0)
-            {
-                // acp_hromatic (grey)
-                return new Color(value, value, value, alpha);
-            }
-
-            int i;
-            float f, p, q, t;
-
-            hue *= 6.0f;
-            hue %= 6f;
-            i = (int)hue;
-
-            f = hue - i;
-            p = value * (1 - saturation);
-            q = value * (1 - saturation * f);
-            t = value * (1 - saturation * (1 - f));
-
-            switch (i)
-            {
-                case 0: // Red is the dominant color
-                    return new Color(value, t, p, alpha);
-                case 1: // Green is the dominant color
-                    return new Color(q, value, p, alpha);
-                case 2:
-                    return new Color(p, value, t, alpha);
-                case 3: // Blue is the dominant color
-                    return new Color(p, q, value, alpha);
-                case 4:
-                    return new Color(t, p, value, alpha);
-                default: // (5) Red is the dominant color
-                    return new Color(value, p, q, alpha);
-            }
-        }
-
-        public Color blend(Color over)
-        {
-            Color res;
-
-            float sa = 1.0f - over.a;
-            res.a = a * sa + over.a;
-
-            if (res.a == 0)
-            {
-                return new Color(0, 0, 0, 0);
-            }
-            else
-            {
-                res.r = (r * a * sa + over.r * over.a) / res.a;
-                res.g = (g * a * sa + over.g * over.a) / res.a;
-                res.b = (b * a * sa + over.b * over.a) / res.a;
-            }
-
-            return res;
-        }
-
-        public Color contrasted()
-        {
-            return new Color(
-                (r + 0.5f) % 1.0f,
-                (g + 0.5f) % 1.0f,
-                (b + 0.5f) % 1.0f
-            );
-        }
-
-        public float gray()
-        {
-            return (r + g + b) / 3.0f;
-        }
-
-        public Color inverted()
-        {
-            return new Color(
-                1.0f - r,
-                1.0f - g,
-                1.0f - b
-            );
-        }
-
-        public Color linear_interpolate(Color b, float t)
-        {
-            Color res = this;
-
-            res.r += (t * (b.r - this.r));
-            res.g += (t * (b.g - this.g));
-            res.b += (t * (b.b - this.b));
-            res.a += (t * (b.a - this.a));
-
-            return res;
-        }
-
-        public int to_32()
-        {
-            int c = (byte)(a * 255);
-            c <<= 8;
-            c |= (byte)(r * 255);
-            c <<= 8;
-            c |= (byte)(g * 255);
-            c <<= 8;
-            c |= (byte)(b * 255);
-
-            return c;
-        }
-
-        public int to_ARGB32()
-        {
-            int c = (byte)(a * 255);
-            c <<= 8;
-            c |= (byte)(r * 255);
-            c <<= 8;
-            c |= (byte)(g * 255);
-            c <<= 8;
-            c |= (byte)(b * 255);
-
-            return c;
-        }
-
-        public string to_html(bool include_alpha = true)
-        {
-            String txt = string.Empty;
-
-            txt += _to_hex(r);
-            txt += _to_hex(g);
-            txt += _to_hex(b);
-
-            if (include_alpha)
-                txt = _to_hex(a) + txt;
-
-            return txt;
-        }
-
-        public Color(float r, float g, float b, float a = 1.0f)
-        {
-            this.r = r;
-            this.g = g;
-            this.b = b;
-            this.a = a;
-        }
-
-        public Color(int rgba)
-        {
-            this.a = (rgba & 0xFF) / 255.0f;
-            rgba >>= 8;
-            this.b = (rgba & 0xFF) / 255.0f;
-            rgba >>= 8;
-            this.g = (rgba & 0xFF) / 255.0f;
-            rgba >>= 8;
-            this.r = (rgba & 0xFF) / 255.0f;
-        }
-
-        private static float _parse_col(string str, int ofs)
-        {
-            int ig = 0;
-
-            for (int i = 0; i < 2; i++)
-            {
-                int c = str[i + ofs];
-                int v = 0;
-
-                if (c >= '0' && c <= '9')
-                {
-                    v = c - '0';
-                }
-                else if (c >= 'a' && c <= 'f')
-                {
-                    v = c - 'a';
-                    v += 10;
-                }
-                else if (c >= 'A' && c <= 'F')
-                {
-                    v = c - 'A';
-                    v += 10;
-                }
-                else
-                {
-                    return -1;
-                }
-
-                if (i == 0)
-                    ig += v * 16;
-                else
-                    ig += v;
-            }
-
-            return ig;
-        }
-
-        private String _to_hex(float val)
-        {
-            int v = (int)Mathf.clamp(val * 255.0f, 0, 255);
-
-            string ret = string.Empty;
-
-            for (int i = 0; i < 2; i++)
-            {
-                char[] c = { (char)0, (char)0 };
-                int lv = v & 0xF;
-
-                if (lv < 10)
-                    c[0] = (char)('0' + lv);
-                else
-                    c[0] = (char)('a' + lv - 10);
-
-                v >>= 4;
-                ret = c + ret;
-            }
-
-            return ret;
-        }
-
-        internal static bool html_is_valid(string color)
-        {
-            if (color.Length == 0)
-                return false;
-
-            if (color[0] == '#')
-                color = color.Substring(1, color.Length - 1);
-
-            bool alpha = false;
-
-            if (color.Length == 8)
-                alpha = true;
-            else if (color.Length == 6)
-                alpha = false;
-            else
-                return false;
-
-            if (alpha)
-            {
-                if ((int)_parse_col(color, 0) < 0)
-                    return false;
-            }
-
-            int from = alpha ? 2 : 0;
-
-            if ((int)_parse_col(color, from + 0) < 0)
-                return false;
-            if ((int)_parse_col(color, from + 2) < 0)
-                return false;
-            if ((int)_parse_col(color, from + 4) < 0)
-                return false;
-
-            return true;
-        }
-
-        public static Color Color8(byte r8, byte g8, byte b8, byte a8)
-        {
-            return new Color((float)r8 / 255f, (float)g8 / 255f, (float)b8 / 255f, (float)a8 / 255f);
-        }
-
-        public Color(string rgba)
-        {
-            if (rgba.Length == 0)
-            {
-                r = 0f;
-                g = 0f;
-                b = 0f;
-                a = 1.0f;
-                return;
-            }
-
-            if (rgba[0] == '#')
-                rgba = rgba.Substring(1);
-
-            bool alpha = false;
-
-            if (rgba.Length == 8)
-            {
-                alpha = true;
-            }
-            else if (rgba.Length == 6)
-            {
-                alpha = false;
-            }
-            else
-            {
-                throw new ArgumentOutOfRangeException("Invalid color code. Length is " + rgba.Length + " but a length of 6 or 8 is expected: " + rgba);
-            }
-
-            if (alpha)
-            {
-                a = _parse_col(rgba, 0);
-
-                if (a < 0)
-                    throw new ArgumentOutOfRangeException("Invalid color code. Alpha is " + a + " but zero or greater is expected: " + rgba);
-            }
-            else
-            {
-                a = 1.0f;
-            }
-
-            int from = alpha ? 2 : 0;
-
-            r = _parse_col(rgba, from + 0);
-
-            if (r < 0)
-                throw new ArgumentOutOfRangeException("Invalid color code. Red is " + r + " but zero or greater is expected: " + rgba);
-
-            g = _parse_col(rgba, from + 2);
-
-            if (g < 0)
-                throw new ArgumentOutOfRangeException("Invalid color code. Green is " + g + " but zero or greater is expected: " + rgba);
-
-            b = _parse_col(rgba, from + 4);
-
-            if (b < 0)
-                throw new ArgumentOutOfRangeException("Invalid color code. Blue is " + b + " but zero or greater is expected: " + rgba);
-        }
-
-        public static bool operator ==(Color left, Color right)
-        {
-            return left.Equals(right);
-        }
-
-        public static bool operator !=(Color left, Color right)
-        {
-            return !left.Equals(right);
-        }
-
-        public static bool operator <(Color left, Color right)
-        {
-            if (left.r == right.r)
-            {
-                if (left.g == right.g)
-                {
-                    if (left.b == right.b)
-                        return (left.a < right.a);
-                    else
-                        return (left.b < right.b);
-                }
-                else
-                {
-                    return left.g < right.g;
-                }
-            }
-
-            return left.r < right.r;
-        }
-
-        public static bool operator >(Color left, Color right)
-        {
-            if (left.r == right.r)
-            {
-                if (left.g == right.g)
-                {
-                    if (left.b == right.b)
-                        return (left.a > right.a);
-                    else
-                        return (left.b > right.b);
-                }
-                else
-                {
-                    return left.g > right.g;
-                }
-            }
-
-            return left.r > right.r;
-        }
-
-        public override bool Equals(object obj)
-        {
-            if (obj is Color)
-            {
-                return Equals((Color)obj);
-            }
-
-            return false;
-        }
-
-        public bool Equals(Color other)
-        {
-            return r == other.r && g == other.g && b == other.b && a == other.a;
-        }
-
-        public override int GetHashCode()
-        {
-            return r.GetHashCode() ^ g.GetHashCode() ^ b.GetHashCode() ^ a.GetHashCode();
-        }
-
-        public override string ToString()
-        {
-            return String.Format("{0},{1},{2},{3}", new object[]
-                {
-                    this.r.ToString(),
-                    this.g.ToString(),
-                    this.b.ToString(),
-                    this.a.ToString()
-                });
-        }
-
-        public string ToString(string format)
-        {
-            return String.Format("{0},{1},{2},{3}", new object[]
-                {
-                    this.r.ToString(format),
-                    this.g.ToString(format),
-                    this.b.ToString(format),
-                    this.a.ToString(format)
-                });
-        }
-    }
-}
+using System;
+
+namespace Godot
+{
+    public struct Color : IEquatable<Color>
+    {
+        public float r;
+        public float g;
+        public float b;
+        public float a;
+
+        public int r8
+        {
+            get
+            {
+                return (int)(r * 255.0f);
+            }
+        }
+
+        public int g8
+        {
+            get
+            {
+                return (int)(g * 255.0f);
+            }
+        }
+
+        public int b8
+        {
+            get
+            {
+                return (int)(b * 255.0f);
+            }
+        }
+
+        public int a8
+        {
+            get
+            {
+                return (int)(a * 255.0f);
+            }
+        }
+
+        public float h
+        {
+            get
+            {
+                float max = Mathf.max(r, Mathf.max(g, b));
+                float min = Mathf.min(r, Mathf.min(g, b));
+
+                float delta = max - min;
+
+                if (delta == 0)
+                    return 0;
+
+                float h;
+
+                if (r == max)
+                    h = (g - b) / delta; // Between yellow & magenta
+                else if (g == max)
+                    h = 2 + (b - r) / delta; // Between cyan & yellow
+                else
+                    h = 4 + (r - g) / delta; // Between magenta & cyan
+
+                h /= 6.0f;
+
+                if (h < 0)
+                    h += 1.0f;
+
+                return h;
+            }
+            set
+            {
+                this = from_hsv(value, s, v);
+            }
+        }
+
+        public float s
+        {
+            get
+            {
+                float max = Mathf.max(r, Mathf.max(g, b));
+                float min = Mathf.min(r, Mathf.min(g, b));
+
+                float delta = max - min;
+
+                return max != 0 ? delta / max : 0;
+            }
+            set
+            {
+                this = from_hsv(h, value, v);
+            }
+        }
+
+        public float v
+        {
+            get
+            {
+                return Mathf.max(r, Mathf.max(g, b));
+            }
+            set
+            {
+                this = from_hsv(h, s, value);
+            }
+        }
+
+        private static readonly Color black = new Color(0f, 0f, 0f, 1.0f);
+
+        public Color Black
+        {
+            get
+            {
+                return black;
+            }
+        }
+
+        public float this [int index]
+        {
+            get
+            {
+                switch (index)
+                {
+                    case 0:
+                        return r;
+                    case 1:
+                        return g;
+                    case 2:
+                        return b;
+                    case 3:
+                        return a;
+                    default:
+                        throw new IndexOutOfRangeException();
+                }
+            }
+            set
+            {
+                switch (index)
+                {
+                    case 0:
+                        r = value;
+                        return;
+                    case 1:
+                        g = value;
+                        return;
+                    case 2:
+                        b = value;
+                        return;
+                    case 3:
+                        a = value;
+                        return;
+                    default:
+                        throw new IndexOutOfRangeException();
+                }
+            }
+        }
+
+        public static void to_hsv(Color color, out float hue, out float saturation, out float value)
+        {
+            // Converts RGB to HSV using the float channels throughout. Taking max/min from the 0..255
+            // integer channels (r8/g8/b8) would mix two scales in the `== max` tests and `/ delta` math.
+            float max = Mathf.max(color.r, Mathf.max(color.g, color.b));
+            float min = Mathf.min(color.r, Mathf.min(color.g, color.b));
+            float delta = max - min;
+
+            if (delta == 0)
+            {
+                hue = 0; // Grey: hue is undefined, report 0
+            }
+            else
+            {
+                if (color.r == max)
+                    hue = (color.g - color.b) / delta; // Between yellow & magenta
+                else if (color.g == max)
+                    hue = 2 + (color.b - color.r) / delta; // Between cyan & yellow
+                else
+                    hue = 4 + (color.r - color.g) / delta; // Between magenta & cyan
+
+                hue /= 6.0f; // Scale from sextants to a fraction of the full turn
+                if (hue < 0)
+                    hue += 1.0f;
+            }
+
+            saturation = (max == 0) ? 0 : 1f - (min / max);
+            value = max;
+        }
+
+        public static Color from_hsv(float hue, float saturation, float value, float alpha = 1.0f)
+        {
+            // Builds a color from HSV components; alpha is passed through unchanged.
+            if (saturation == 0)
+            {
+                // Achromatic (grey): every channel equals value
+                return new Color(value, value, value, alpha);
+            }
+
+            hue *= 6.0f; // Six sextants per full turn
+            hue %= 6f;
+            if (hue < 0)
+                hue = (hue + 6f) % 6f; // C# % keeps the dividend's sign; wrap negative hues into [0, 6)
+
+            int i = (int)hue; // Sextant index
+            float f = hue - i; // Position inside the sextant
+            float p = value * (1 - saturation);
+            float q = value * (1 - saturation * f);
+            float t = value * (1 - saturation * (1 - f));
+
+            switch (i)
+            {
+                case 0: // Red is the dominant color
+                    return new Color(value, t, p, alpha);
+                case 1: // Green is the dominant color
+                    return new Color(q, value, p, alpha);
+                case 2:
+                    return new Color(p, value, t, alpha);
+                case 3: // Blue is the dominant color
+                    return new Color(p, q, value, alpha);
+                case 4:
+                    return new Color(t, p, value, alpha);
+                default: // (5) Red is the dominant color
+                    return new Color(value, p, q, alpha);
+            }
+        }
+
+        /// <summary>
+        /// Composites <paramref name="over"/> on top of this color.
+        /// </summary>
+        /// <param name="over">Color placed on top of this one.</param>
+        /// <returns>The blended color, or a fully zeroed color when the resulting alpha is 0.</returns>
+        public Color blend(Color over)
+        {
+            // Share of this color that remains visible under `over`.
+            float remaining = 1.0f - over.a;
+            float outAlpha = a * remaining + over.a;
+
+            // Guard the divisions below.
+            if (outAlpha == 0)
+                return new Color(0, 0, 0, 0);
+
+            float outRed = (r * a * remaining + over.r * over.a) / outAlpha;
+            float outGreen = (g * a * remaining + over.g * over.a) / outAlpha;
+            float outBlue = (b * a * remaining + over.b * over.a) / outAlpha;
+
+            return new Color(outRed, outGreen, outBlue, outAlpha);
+        }
+
+        /// <summary>
+        /// Returns a contrasting color by shifting r, g and b by 0.5 and
+        /// wrapping at 1.0.
+        /// </summary>
+        /// <returns>The contrasted color, with this color's alpha.</returns>
+        public Color contrasted()
+        {
+            // Pass alpha through explicitly; omitting it made the constructor
+            // default kick in and forced the result to be fully opaque.
+            return new Color(
+                (r + 0.5f) % 1.0f,
+                (g + 0.5f) % 1.0f,
+                (b + 0.5f) % 1.0f,
+                a
+            );
+        }
+
+        /// <summary>
+        /// Returns the unweighted mean of the r, g and b channels.
+        /// </summary>
+        public float gray()
+        {
+            float channelSum = r + g + b;
+            return channelSum / 3.0f;
+        }
+
+        /// <summary>
+        /// Returns the color with r, g and b each replaced by 1 minus its value.
+        /// </summary>
+        /// <returns>The inverted color, with this color's alpha.</returns>
+        public Color inverted()
+        {
+            // Pass alpha through explicitly; omitting it made the constructor
+            // default kick in and forced the result to be fully opaque.
+            return new Color(
+                1.0f - r,
+                1.0f - g,
+                1.0f - b,
+                a
+            );
+        }
+
+        /// <summary>
+        /// Linearly interpolates every channel, alpha included, from this
+        /// color towards <paramref name="b"/>.
+        /// </summary>
+        /// <param name="b">Target color.</param>
+        /// <param name="t">Weight; 0 yields this color. The weight is not clamped.</param>
+        /// <returns>The interpolated color.</returns>
+        public Color linear_interpolate(Color b, float t)
+        {
+            // The parameter `b` hides the blue field, hence the explicit `this.`.
+            return new Color(
+                this.r + (t * (b.r - this.r)),
+                this.g + (t * (b.g - this.g)),
+                this.b + (t * (b.b - this.b)),
+                this.a + (t * (b.a - this.a))
+            );
+        }
+
+        /// <summary>
+        /// Packs the color into a 32-bit integer, one byte per channel, with
+        /// alpha in the highest byte followed by red, green and blue.
+        /// </summary>
+        public int to_32()
+        {
+            int alphaByte = (byte)(a * 255);
+            int redByte = (byte)(r * 255);
+            int greenByte = (byte)(g * 255);
+            int blueByte = (byte)(b * 255);
+
+            return (alphaByte << 24) | (redByte << 16) | (greenByte << 8) | blueByte;
+        }
+
+        /// <summary>
+        /// Packs the color into a 32-bit integer laid out as ARGB, one byte
+        /// per channel with alpha in the highest byte.
+        /// </summary>
+        public int to_ARGB32()
+        {
+            int packed = (byte)(a * 255) << 24;
+
+            packed |= (byte)(r * 255) << 16;
+            packed |= (byte)(g * 255) << 8;
+            packed |= (byte)(b * 255);
+
+            return packed;
+        }
+
+        /// <summary>
+        /// Returns the color as concatenated per-channel hex strings.
+        /// </summary>
+        /// <param name="include_alpha">When true, the alpha channel is placed in front of red, green and blue.</param>
+        /// <returns>The channel strings in r, g, b order, optionally preceded by alpha.</returns>
+        public string to_html(bool include_alpha = true)
+        {
+            string rgbText = string.Empty + _to_hex(r) + _to_hex(g) + _to_hex(b);
+
+            if (!include_alpha)
+                return rgbText;
+
+            return _to_hex(a) + rgbText;
+        }
+
+        /// <summary>
+        /// Creates a color from its four channel values.
+        /// </summary>
+        /// <param name="r">Red channel.</param>
+        /// <param name="g">Green channel.</param>
+        /// <param name="b">Blue channel.</param>
+        /// <param name="a">Alpha channel; defaults to 1.0.</param>
+        public Color(float r, float g, float b, float a = 1.0f)
+        {
+            // Parameters hide the fields of the same name, so qualify the targets.
+            this.a = a;
+            this.b = b;
+            this.g = g;
+            this.r = r;
+        }
+
+        /// <summary>
+        /// Creates a color from a packed 32-bit integer with red in the
+        /// highest byte, then green, blue and alpha in the lowest byte.
+        /// </summary>
+        /// <param name="rgba">Packed channels, one byte each.</param>
+        public Color(int rgba)
+        {
+            // Masking after the shift discards any sign-extension bits.
+            this.r = ((rgba >> 24) & 0xFF) / 255.0f;
+            this.g = ((rgba >> 16) & 0xFF) / 255.0f;
+            this.b = ((rgba >> 8) & 0xFF) / 255.0f;
+            this.a = (rgba & 0xFF) / 255.0f;
+        }
+
+        /// <summary>
+        /// Parses the two hexadecimal digits found at <paramref name="ofs"/>
+        /// and <paramref name="ofs"/> + 1 in <paramref name="str"/>.
+        /// </summary>
+        /// <param name="str">Text containing the digits; upper and lower case are accepted.</param>
+        /// <param name="ofs">Index of the first (most significant) digit.</param>
+        /// <returns>The parsed value from 0 to 255, or -1 if either character is not a hex digit.</returns>
+        private static float _parse_col(string str, int ofs)
+        {
+            int total = 0;
+
+            for (int pos = ofs; pos < ofs + 2; pos++)
+            {
+                int ch = str[pos];
+                int digit;
+
+                if (ch >= '0' && ch <= '9')
+                    digit = ch - '0';
+                else if (ch >= 'a' && ch <= 'f')
+                    digit = 10 + (ch - 'a');
+                else if (ch >= 'A' && ch <= 'F')
+                    digit = 10 + (ch - 'A');
+                else
+                    return -1;
+
+                // Shift the previous digit up one hex place, then add this one.
+                total = total * 16 + digit;
+            }
+
+            return total;
+        }
+
+        /// <summary>
+        /// Formats a channel value as two lower-case hexadecimal digits.
+        /// </summary>
+        /// <param name="val">Channel value; it is scaled by 255 and clamped to 0..255 before formatting.</param>
+        /// <returns>A two-character string from "00" to "ff".</returns>
+        private String _to_hex(float val)
+        {
+            int v = (int)Mathf.clamp(val * 255.0f, 0, 255);
+
+            // "x2" yields exactly two lower-case hex digits. The previous
+            // hand-rolled loop concatenated a char[] with a string, which
+            // calls char[].ToString() and produced "System.Char[]" instead
+            // of the digits.
+            return v.ToString("x2");
+        }
+
+        /// <summary>
+        /// Checks whether <paramref name="color"/> is a 6- or 8-digit
+        /// hexadecimal color code, optionally prefixed with '#'.
+        /// </summary>
+        /// <param name="color">Text to validate.</param>
+        /// <returns>True when the length is right and every digit pair parses.</returns>
+        internal static bool html_is_valid(string color)
+        {
+            if (color.Length == 0)
+                return false;
+
+            if (color[0] == '#')
+                color = color.Substring(1, color.Length - 1);
+
+            // Only six digits (no alpha) or eight digits (with alpha) are accepted.
+            if (color.Length != 8 && color.Length != 6)
+                return false;
+
+            // Validate each two-digit channel from left to right.
+            for (int ofs = 0; ofs < color.Length; ofs += 2)
+            {
+                if ((int)_parse_col(color, ofs) < 0)
+                    return false;
+            }
+
+            return true;
+        }
+
+        /// <summary>
+        /// Creates a color from four byte channels, each divided by 255.
+        /// </summary>
+        public static Color Color8(byte r8, byte g8, byte b8, byte a8)
+        {
+            float red = r8 / 255f;
+            float green = g8 / 255f;
+            float blue = b8 / 255f;
+            float alpha = a8 / 255f;
+
+            return new Color(red, green, blue, alpha);
+        }
+
+        /// <summary>
+        /// Creates a color from a hexadecimal color code.
+        /// </summary>
+        /// <param name="rgba">
+        /// Six hex digits (red, green, blue) or eight hex digits (alpha first,
+        /// then red, green, blue), optionally prefixed with '#'. An empty
+        /// string yields opaque black.
+        /// </param>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// The code does not have 6 or 8 digits, or contains a character that
+        /// is not a hex digit.
+        /// </exception>
+        public Color(string rgba)
+        {
+            if (rgba.Length == 0)
+            {
+                r = 0f;
+                g = 0f;
+                b = 0f;
+                a = 1.0f;
+                return;
+            }
+
+            if (rgba[0] == '#')
+                rgba = rgba.Substring(1);
+
+            bool alpha;
+
+            if (rgba.Length == 8)
+                alpha = true;
+            else if (rgba.Length == 6)
+                alpha = false;
+            else
+                throw new ArgumentOutOfRangeException("rgba", "Invalid color code. Length is " + rgba.Length + " but a length of 6 or 8 is expected: " + rgba);
+
+            // _parse_col yields 0..255 (or -1 on error). Validate the raw
+            // values first, then scale them to the 0..1 range of the fields.
+            float alphaRaw = 255f;
+
+            if (alpha)
+            {
+                alphaRaw = _parse_col(rgba, 0);
+
+                if (alphaRaw < 0)
+                    throw new ArgumentOutOfRangeException("rgba", "Invalid color code. Alpha is " + alphaRaw + " but zero or greater is expected: " + rgba);
+            }
+
+            // The colour digits start after the alpha pair when it is present.
+            int from = alpha ? 2 : 0;
+
+            float redRaw = _parse_col(rgba, from + 0);
+
+            if (redRaw < 0)
+                throw new ArgumentOutOfRangeException("rgba", "Invalid color code. Red is " + redRaw + " but zero or greater is expected: " + rgba);
+
+            float greenRaw = _parse_col(rgba, from + 2);
+
+            if (greenRaw < 0)
+                throw new ArgumentOutOfRangeException("rgba", "Invalid color code. Green is " + greenRaw + " but zero or greater is expected: " + rgba);
+
+            float blueRaw = _parse_col(rgba, from + 4);
+
+            if (blueRaw < 0)
+                throw new ArgumentOutOfRangeException("rgba", "Invalid color code. Blue is " + blueRaw + " but zero or greater is expected: " + rgba);
+
+            r = redRaw / 255f;
+            g = greenRaw / 255f;
+            b = blueRaw / 255f;
+            a = alphaRaw / 255f;
+        }
+
+        /// <summary>
+        /// Returns true when both colors are equal according to <see cref="Equals(Color)"/>.
+        /// </summary>
+        public static bool operator ==(Color left, Color right)
+        {
+            bool same = left.Equals(right);
+            return same;
+        }
+
+        /// <summary>
+        /// Returns true when the colors differ according to <see cref="Equals(Color)"/>.
+        /// </summary>
+        public static bool operator !=(Color left, Color right)
+        {
+            bool same = left.Equals(right);
+            return !same;
+        }
+
+        /// <summary>
+        /// Lexicographic "less than": compares r, then g, then b, then a, and
+        /// decides on the first channel that differs.
+        /// </summary>
+        public static bool operator <(Color left, Color right)
+        {
+            if (left.r != right.r)
+                return left.r < right.r;
+
+            if (left.g != right.g)
+                return left.g < right.g;
+
+            if (left.b != right.b)
+                return (left.b < right.b);
+
+            // r, g and b are all equal: alpha decides.
+            return (left.a < right.a);
+        }
+
+        /// <summary>
+        /// Lexicographic "greater than": compares r, then g, then b, then a,
+        /// and decides on the first channel that differs.
+        /// </summary>
+        public static bool operator >(Color left, Color right)
+        {
+            if (left.r != right.r)
+                return left.r > right.r;
+
+            if (left.g != right.g)
+                return left.g > right.g;
+
+            if (left.b != right.b)
+                return (left.b > right.b);
+
+            // r, g and b are all equal: alpha decides.
+            return (left.a > right.a);
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="obj"/> is a Color equal to this one.
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            // Anything that is not a Color (including null) is never equal.
+            return obj is Color && Equals((Color)obj);
+        }
+
+        /// <summary>
+        /// Returns true when all four channels compare exactly equal.
+        /// </summary>
+        public bool Equals(Color other)
+        {
+            if (r != other.r)
+                return false;
+            if (g != other.g)
+                return false;
+            if (b != other.b)
+                return false;
+
+            return a == other.a;
+        }
+
+        /// <summary>
+        /// Returns the XOR of the four channel hash codes.
+        /// </summary>
+        public override int GetHashCode()
+        {
+            int hash = r.GetHashCode();
+
+            hash ^= g.GetHashCode();
+            hash ^= b.GetHashCode();
+            hash ^= a.GetHashCode();
+
+            return hash;
+        }
+
+        /// <summary>
+        /// Returns the channels as "r,g,b,a" using each float's default formatting.
+        /// </summary>
+        public override string ToString()
+        {
+            string red = this.r.ToString();
+            string green = this.g.ToString();
+            string blue = this.b.ToString();
+            string alpha = this.a.ToString();
+
+            return String.Format("{0},{1},{2},{3}", red, green, blue, alpha);
+        }
+
+        /// <summary>
+        /// Returns the channels as "r,g,b,a", formatting each float with
+        /// <paramref name="format"/>.
+        /// </summary>
+        /// <param name="format">Numeric format string passed to each channel's ToString.</param>
+        public string ToString(string format)
+        {
+            string red = this.r.ToString(format);
+            string green = this.g.ToString(format);
+            string blue = this.b.ToString(format);
+            string alpha = this.a.ToString(format);
+
+            return String.Format("{0},{1},{2},{3}", red, green, blue, alpha);
+        }
+    }
+}
diff --git a/modules/mono/glue/cs_files/ExportAttribute.cs b/modules/mono/glue/cs_files/ExportAttribute.cs
index a4e7d447dd6..dce9cc59a05 100644
--- a/modules/mono/glue/cs_files/ExportAttribute.cs
+++ b/modules/mono/glue/cs_files/ExportAttribute.cs
@@ -1,4 +1,4 @@
-﻿using System;
+using System;
 
 namespace Godot
 {
diff --git a/modules/mono/glue/cs_files/MarshalUtils.cs b/modules/mono/glue/cs_files/MarshalUtils.cs
index 5d401113392..2bdfb95c511 100644
--- a/modules/mono/glue/cs_files/MarshalUtils.cs
+++ b/modules/mono/glue/cs_files/MarshalUtils.cs
@@ -1,4 +1,4 @@
-﻿using System;
+using System;
 using System.Collections.Generic;
 
 namespace Godot
diff --git a/modules/mono/glue/cs_files/Plane.cs b/modules/mono/glue/cs_files/Plane.cs
index ada6e465ac1..37f70aca1eb 100644
--- a/modules/mono/glue/cs_files/Plane.cs
+++ b/modules/mono/glue/cs_files/Plane.cs
@@ -1,209 +1,209 @@
-using System;
-
-namespace Godot
-{
-    public struct Plane : IEquatable<Plane>
-    {
-        Vector3 normal;
-
-        public float x
-        {
-            get
-            {
-                return normal.x;
-            }
-            set
-            {
-                normal.x = value;
-            }
-        }
-
-        public float y
-        {
-            get
-            {
-                return normal.y;
-            }
-            set
-            {
-                normal.y = value;
-            }
-        }
-
-        public float z
-        {
-            get
-            {
-                return normal.z;
-            }
-            set
-            {
-                normal.z = value;
-            }
-        }
-
-        float d;
-
-        public Vector3 Center
-        {
-            get
-            {
-                return normal * d;
-            }
-        }
-
-        public float distance_to(Vector3 point)
-        {
-            return normal.dot(point) - d;
-        }
-
-        public Vector3 get_any_point()
-        {
-            return normal * d;
-        }
-
-        public bool has_point(Vector3 point, float epsilon = Mathf.Epsilon)
-        {
-            float dist = normal.dot(point) - d;
-            return Mathf.abs(dist) <= epsilon;
-        }
-
-        public Vector3 intersect_3(Plane b, Plane c)
-        {
-            float denom = normal.cross(b.normal).dot(c.normal);
-
-            if (Mathf.abs(denom) <= Mathf.Epsilon)
-                return new Vector3();
-
-            Vector3 result = (b.normal.cross(c.normal) * this.d) +
-                                (c.normal.cross(normal) * b.d) +
-                                (normal.cross(b.normal) * c.d);
-
-            return result / denom;
-        }
-
-        public Vector3 intersect_ray(Vector3 from, Vector3 dir)
-        {
-            float den = normal.dot(dir);
-
-            if (Mathf.abs(den) <= Mathf.Epsilon)
-                return new Vector3();
-
-            float dist = (normal.dot(from) - d) / den;
-
-            // This is a ray, before the emiting pos (from) does not exist
-            if (dist > Mathf.Epsilon)
-                return new Vector3();
-
-            return from + dir * -dist;
-        }
-
-        public Vector3 intersect_segment(Vector3 begin, Vector3 end)
-        {
-            Vector3 segment = begin - end;
-            float den = normal.dot(segment);
-
-            if (Mathf.abs(den) <= Mathf.Epsilon)
-                return new Vector3();
-
-            float dist = (normal.dot(begin) - d) / den;
-
-            if (dist < -Mathf.Epsilon || dist > (1.0f + Mathf.Epsilon))
-                return new Vector3();
-
-            return begin + segment * -dist;
-        }
-
-        public bool is_point_over(Vector3 point)
-        {
-            return normal.dot(point) > d;
-        }
-
-        public Plane normalized()
-        {
-            float len = normal.length();
-
-            if (len == 0)
-                return new Plane(0, 0, 0, 0);
-
-            return new Plane(normal / len, d / len);
-        }
-
-        public Vector3 project(Vector3 point)
-        {
-            return point - normal * distance_to(point);
-        }
-
-        public Plane(float a, float b, float c, float d)
-        {
-            normal = new Vector3(a, b, c);
-            this.d = d;
-        }
-
-        public Plane(Vector3 normal, float d)
-        {
-            this.normal = normal;
-            this.d = d;
-        }
-
-        public Plane(Vector3 v1, Vector3 v2, Vector3 v3)
-        {
-            normal = (v1 - v3).cross(v1 - v2);
-            normal.normalize();
-            d = normal.dot(v1);
-        }
-
-        public static Plane operator -(Plane plane)
-        {
-            return new Plane(-plane.normal, -plane.d);
-        }
-
-        public static bool operator ==(Plane left, Plane right)
-        {
-            return left.Equals(right);
-        }
-
-        public static bool operator !=(Plane left, Plane right)
-        {
-            return !left.Equals(right);
-        }
-
-        public override bool Equals(object obj)
-        {
-            if (obj is Plane)
-            {
-                return Equals((Plane)obj);
-            }
-
-            return false;
-        }
-
-        public bool Equals(Plane other)
-        {
-            return normal == other.normal && d == other.d;
-        }
-
-        public override int GetHashCode()
-        {
-            return normal.GetHashCode() ^ d.GetHashCode();
-        }
-
-        public override string ToString()
-        {
-            return String.Format("({0}, {1})", new object[]
-            {
-                this.normal.ToString(),
-                this.d.ToString()
-            });
-        }
-
-        public string ToString(string format)
-        {
-            return String.Format("({0}, {1})", new object[]
-            {
-                this.normal.ToString(format),
-                this.d.ToString(format)
-            });
-        }
-    }
-}
+using System;
+
+namespace Godot
+{
+    /// <summary>
+    /// A plane described by a normal vector and a scalar d. A point p is
+    /// treated as lying on the plane when normal.dot(p) equals d.
+    /// </summary>
+    public struct Plane : IEquatable<Plane>
+    {
+        Vector3 normal;
+
+        /// <summary>X component of the normal.</summary>
+        public float x
+        {
+            get { return normal.x; }
+            set { normal.x = value; }
+        }
+
+        /// <summary>Y component of the normal.</summary>
+        public float y
+        {
+            get { return normal.y; }
+            set { normal.y = value; }
+        }
+
+        /// <summary>Z component of the normal.</summary>
+        public float z
+        {
+            get { return normal.z; }
+            set { normal.z = value; }
+        }
+
+        float d;
+
+        /// <summary>The normal scaled by d.</summary>
+        public Vector3 Center
+        {
+            get { return normal * d; }
+        }
+
+        /// <summary>
+        /// Returns normal.dot(point) - d, which is positive on the side the
+        /// normal points to.
+        /// </summary>
+        public float distance_to(Vector3 point)
+        {
+            float along_normal = normal.dot(point);
+            return along_normal - d;
+        }
+
+        /// <summary>Returns the normal scaled by d.</summary>
+        public Vector3 get_any_point()
+        {
+            Vector3 scaled = normal * d;
+            return scaled;
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="point"/> is within
+        /// <paramref name="epsilon"/> of the plane.
+        /// </summary>
+        public bool has_point(Vector3 point, float epsilon = Mathf.Epsilon)
+        {
+            float offset = normal.dot(point) - d;
+            return !(Mathf.abs(offset) > epsilon);
+        }
+
+        /// <summary>
+        /// Computes the point shared by this plane, <paramref name="b"/> and
+        /// <paramref name="c"/>.
+        /// </summary>
+        /// <returns>The intersection point, or a default Vector3 when the denominator is within Mathf.Epsilon of zero.</returns>
+        public Vector3 intersect_3(Plane b, Plane c)
+        {
+            float denom = normal.cross(b.normal).dot(c.normal);
+
+            if (Mathf.abs(denom) <= Mathf.Epsilon)
+            {
+                return new Vector3();
+            }
+
+            Vector3 from_this = b.normal.cross(c.normal) * this.d;
+            Vector3 from_b = c.normal.cross(normal) * b.d;
+            Vector3 from_c = normal.cross(b.normal) * c.d;
+
+            return (from_this + from_b + from_c) / denom;
+        }
+
+        /// <summary>
+        /// Intersects the plane with a ray starting at <paramref name="from"/>
+        /// and running along <paramref name="dir"/>.
+        /// </summary>
+        /// <returns>The hit point, or a default Vector3 when the ray is parallel to the plane or the hit lies behind the start.</returns>
+        public Vector3 intersect_ray(Vector3 from, Vector3 dir)
+        {
+            float den = normal.dot(dir);
+
+            if (Mathf.abs(den) <= Mathf.Epsilon)
+            {
+                return new Vector3();
+            }
+
+            float dist = (normal.dot(from) - d) / den;
+
+            // A ray does not exist before its emitting position (from).
+            if (dist > Mathf.Epsilon)
+            {
+                return new Vector3();
+            }
+
+            return from + dir * -dist;
+        }
+
+        /// <summary>
+        /// Intersects the plane with the segment from <paramref name="begin"/>
+        /// to <paramref name="end"/>.
+        /// </summary>
+        /// <returns>The hit point, or a default Vector3 when the segment is parallel to the plane or does not reach it.</returns>
+        public Vector3 intersect_segment(Vector3 begin, Vector3 end)
+        {
+            Vector3 segment = begin - end;
+            float den = normal.dot(segment);
+
+            if (Mathf.abs(den) <= Mathf.Epsilon)
+            {
+                return new Vector3();
+            }
+
+            float dist = (normal.dot(begin) - d) / den;
+
+            bool before_begin = dist < -Mathf.Epsilon;
+            bool past_end = dist > (1.0f + Mathf.Epsilon);
+
+            if (before_begin || past_end)
+            {
+                return new Vector3();
+            }
+
+            return begin + segment * -dist;
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="point"/> lies strictly on the
+        /// side the normal points to.
+        /// </summary>
+        public bool is_point_over(Vector3 point)
+        {
+            float along_normal = normal.dot(point);
+            return along_normal > d;
+        }
+
+        /// <summary>
+        /// Returns a copy with the normal and d both divided by the normal's
+        /// length, or an all-zero plane when that length is 0.
+        /// </summary>
+        public Plane normalized()
+        {
+            float len = normal.length();
+
+            if (len == 0)
+            {
+                return new Plane(0, 0, 0, 0);
+            }
+
+            Vector3 scaled_normal = normal / len;
+            return new Plane(scaled_normal, d / len);
+        }
+
+        /// <summary>
+        /// Moves <paramref name="point"/> along the normal by its distance to
+        /// the plane and returns the result.
+        /// </summary>
+        public Vector3 project(Vector3 point)
+        {
+            Vector3 offset = normal * distance_to(point);
+            return point - offset;
+        }
+
+        /// <summary>
+        /// Creates a plane with normal (a, b, c) and the given d.
+        /// </summary>
+        public Plane(float a, float b, float c, float d)
+        {
+            this.normal = new Vector3(a, b, c);
+            this.d = d;
+        }
+
+        /// <summary>
+        /// Creates a plane from a normal and d; the normal is stored as given.
+        /// </summary>
+        public Plane(Vector3 normal, float d)
+        {
+            this.d = d;
+            this.normal = normal;
+        }
+
+        /// <summary>
+        /// Creates the plane through three points. The normal is the
+        /// normalized cross product of (v1 - v3) and (v1 - v2).
+        /// </summary>
+        public Plane(Vector3 v1, Vector3 v2, Vector3 v3)
+        {
+            Vector3 edge_a = v1 - v3;
+            Vector3 edge_b = v1 - v2;
+
+            normal = edge_a.cross(edge_b);
+            normal.normalize();
+            d = normal.dot(v1);
+        }
+
+        /// <summary>Returns the plane with both the normal and d negated.</summary>
+        public static Plane operator -(Plane plane)
+        {
+            Vector3 flipped = -plane.normal;
+            return new Plane(flipped, -plane.d);
+        }
+
+        /// <summary>Returns true when both planes are equal according to Equals(Plane).</summary>
+        public static bool operator ==(Plane left, Plane right)
+        {
+            bool same = left.Equals(right);
+            return same;
+        }
+
+        /// <summary>Returns true when the planes differ according to Equals(Plane).</summary>
+        public static bool operator !=(Plane left, Plane right)
+        {
+            bool same = left.Equals(right);
+            return !same;
+        }
+
+        /// <summary>Returns true when <paramref name="obj"/> is a Plane equal to this one.</summary>
+        public override bool Equals(object obj)
+        {
+            return obj is Plane && Equals((Plane)obj);
+        }
+
+        /// <summary>Returns true when the normals and the d values compare exactly equal.</summary>
+        public bool Equals(Plane other)
+        {
+            if (!(normal == other.normal))
+            {
+                return false;
+            }
+
+            return d == other.d;
+        }
+
+        /// <summary>Returns the XOR of the normal's and d's hash codes.</summary>
+        public override int GetHashCode()
+        {
+            int hash = normal.GetHashCode();
+            hash ^= d.GetHashCode();
+            return hash;
+        }
+
+        /// <summary>Returns "(normal, d)" using default formatting.</summary>
+        public override string ToString()
+        {
+            string normal_text = this.normal.ToString();
+            string d_text = this.d.ToString();
+
+            return String.Format("({0}, {1})", normal_text, d_text);
+        }
+
+        /// <summary>
+        /// Returns "(normal, d)" with both parts formatted using
+        /// <paramref name="format"/>.
+        /// </summary>
+        public string ToString(string format)
+        {
+            string normal_text = this.normal.ToString(format);
+            string d_text = this.d.ToString(format);
+
+            return String.Format("({0}, {1})", normal_text, d_text);
+        }
+    }
+}
diff --git a/modules/mono/glue/cs_files/Rect3.cs b/modules/mono/glue/cs_files/Rect3.cs
index 0d25de1ec66..617d33e7fd6 100644
--- a/modules/mono/glue/cs_files/Rect3.cs
+++ b/modules/mono/glue/cs_files/Rect3.cs
@@ -1,477 +1,477 @@
-using System;
-
-// file: core/math/rect3.h
-// commit: 7ad14e7a3e6f87ddc450f7e34621eb5200808451
-// file: core/math/rect3.cpp
-// commit: bd282ff43f23fe845f29a3e25c8efc01bd65ffb0
-// file: core/variant_call.cpp
-// commit: 5ad9be4c24e9d7dc5672fdc42cea896622fe5685
-
-namespace Godot
-{
-    public struct Rect3 : IEquatable<Rect3>
-    {
-        private Vector3 position;
-        private Vector3 size;
-
-        public Vector3 Position
-        {
-            get
-            {
-                return position;
-            }
-        }
-
-        public Vector3 Size
-        {
-            get
-            {
-                return size;
-            }
-        }
-
-        public Vector3 End
-        {
-            get
-            {
-                return position + size;
-            }
-        }
-
-        public bool encloses(Rect3 with)
-        {
-            Vector3 src_min = position;
-            Vector3 src_max = position + size;
-            Vector3 dst_min = with.position;
-            Vector3 dst_max = with.position + with.size;
-
-            return ((src_min.x <= dst_min.x) &&
-                    (src_max.x > dst_max.x) &&
-                    (src_min.y <= dst_min.y) &&
-                    (src_max.y > dst_max.y) &&
-                    (src_min.z <= dst_min.z) &&
-                    (src_max.z > dst_max.z));
-        }
-
-        public Rect3 expand(Vector3 to_point)
-        {
-            Vector3 begin = position;
-            Vector3 end = position + size;
-
-            if (to_point.x < begin.x)
-                begin.x = to_point.x;
-            if (to_point.y < begin.y)
-                begin.y = to_point.y;
-            if (to_point.z < begin.z)
-                begin.z = to_point.z;
-
-            if (to_point.x > end.x)
-                end.x = to_point.x;
-            if (to_point.y > end.y)
-                end.y = to_point.y;
-            if (to_point.z > end.z)
-                end.z = to_point.z;
-
-            return new Rect3(begin, end - begin);
-        }
-
-        public float get_area()
-        {
-            return size.x * size.y * size.z;
-        }
-
-        public Vector3 get_endpoint(int idx)
-        {
-            switch (idx)
-            {
-                case 0:
-                    return new Vector3(position.x, position.y, position.z);
-                case 1:
-                    return new Vector3(position.x, position.y, position.z + size.z);
-                case 2:
-                    return new Vector3(position.x, position.y + size.y, position.z);
-                case 3:
-                    return new Vector3(position.x, position.y + size.y, position.z + size.z);
-                case 4:
-                    return new Vector3(position.x + size.x, position.y, position.z);
-                case 5:
-                    return new Vector3(position.x + size.x, position.y, position.z + size.z);
-                case 6:
-                    return new Vector3(position.x + size.x, position.y + size.y, position.z);
-                case 7:
-                    return new Vector3(position.x + size.x, position.y + size.y, position.z + size.z);
-                default:
-                    throw new ArgumentOutOfRangeException(nameof(idx), String.Format("Index is {0}, but a value from 0 to 7 is expected.", idx));
-            }
-        }
-
-        public Vector3 get_longest_axis()
-        {
-            Vector3 axis = new Vector3(1f, 0f, 0f);
-            float max_size = size.x;
-
-            if (size.y > max_size)
-            {
-                axis = new Vector3(0f, 1f, 0f);
-                max_size = size.y;
-            }
-
-            if (size.z > max_size)
-            {
-                axis = new Vector3(0f, 0f, 1f);
-                max_size = size.z;
-            }
-
-            return axis;
-        }
-
-        public Vector3.Axis get_longest_axis_index()
-        {
-            Vector3.Axis axis = Vector3.Axis.X;
-            float max_size = size.x;
-
-            if (size.y > max_size)
-            {
-                axis = Vector3.Axis.Y;
-                max_size = size.y;
-            }
-
-            if (size.z > max_size)
-            {
-                axis = Vector3.Axis.Z;
-                max_size = size.z;
-            }
-
-            return axis;
-        }
-
-        public float get_longest_axis_size()
-        {
-            float max_size = size.x;
-
-            if (size.y > max_size)
-                max_size = size.y;
-
-            if (size.z > max_size)
-                max_size = size.z;
-
-            return max_size;
-        }
-
-        public Vector3 get_shortest_axis()
-        {
-            Vector3 axis = new Vector3(1f, 0f, 0f);
-            float max_size = size.x;
-
-            if (size.y < max_size)
-            {
-                axis = new Vector3(0f, 1f, 0f);
-                max_size = size.y;
-            }
-
-            if (size.z < max_size)
-            {
-                axis = new Vector3(0f, 0f, 1f);
-                max_size = size.z;
-            }
-
-            return axis;
-        }
-
-        public Vector3.Axis get_shortest_axis_index()
-        {
-            Vector3.Axis axis = Vector3.Axis.X;
-            float max_size = size.x;
-
-            if (size.y < max_size)
-            {
-                axis = Vector3.Axis.Y;
-                max_size = size.y;
-            }
-
-            if (size.z < max_size)
-            {
-                axis = Vector3.Axis.Z;
-                max_size = size.z;
-            }
-
-            return axis;
-        }
-
-        public float get_shortest_axis_size()
-        {
-            float max_size = size.x;
-
-            if (size.y < max_size)
-                max_size = size.y;
-
-            if (size.z < max_size)
-                max_size = size.z;
-
-            return max_size;
-        }
-
-        public Vector3 get_support(Vector3 dir)
-        {
-            Vector3 half_extents = size * 0.5f;
-            Vector3 ofs = position + half_extents;
-
-            return ofs + new Vector3(
-                (dir.x > 0f) ? -half_extents.x : half_extents.x,
-                (dir.y > 0f) ? -half_extents.y : half_extents.y,
-                (dir.z > 0f) ? -half_extents.z : half_extents.z);
-        }
-
-        public Rect3 grow(float by)
-        {
-            Rect3 res = this;
-
-            res.position.x -= by;
-            res.position.y -= by;
-            res.position.z -= by;
-            res.size.x += 2.0f * by;
-            res.size.y += 2.0f * by;
-            res.size.z += 2.0f * by;
-
-            return res;
-        }
-
-        public bool has_no_area()
-        {
-            return size.x <= 0f || size.y <= 0f || size.z <= 0f;
-        }
-
-        public bool has_no_surface()
-        {
-            return size.x <= 0f && size.y <= 0f && size.z <= 0f;
-        }
-
-        public bool has_point(Vector3 point)
-        {
-            if (point.x < position.x)
-                return false;
-            if (point.y < position.y)
-                return false;
-            if (point.z < position.z)
-                return false;
-            if (point.x > position.x + size.x)
-                return false;
-            if (point.y > position.y + size.y)
-                return false;
-            if (point.z > position.z + size.z)
-                return false;
-
-            return true;
-        }
-
-        public Rect3 intersection(Rect3 with)
-        {
-            Vector3 src_min = position;
-            Vector3 src_max = position + size;
-            Vector3 dst_min = with.position;
-            Vector3 dst_max = with.position + with.size;
-
-            Vector3 min, max;
-
-            if (src_min.x > dst_max.x || src_max.x < dst_min.x)
-            {
-                return new Rect3();
-            }
-            else
-            {
-                min.x = (src_min.x > dst_min.x) ? src_min.x : dst_min.x;
-                max.x = (src_max.x < dst_max.x) ? src_max.x : dst_max.x;
-            }
-
-            if (src_min.y > dst_max.y || src_max.y < dst_min.y)
-            {
-                return new Rect3();
-            }
-            else
-            {
-                min.y = (src_min.y > dst_min.y) ? src_min.y : dst_min.y;
-                max.y = (src_max.y < dst_max.y) ? src_max.y : dst_max.y;
-            }
-
-            if (src_min.z > dst_max.z || src_max.z < dst_min.z)
-            {
-                return new Rect3();
-            }
-            else
-            {
-                min.z = (src_min.z > dst_min.z) ? src_min.z : dst_min.z;
-                max.z = (src_max.z < dst_max.z) ? src_max.z : dst_max.z;
-            }
-
-            return new Rect3(min, max - min);
-        }
-
-        public bool intersects(Rect3 with)
-        {
-            if (position.x >= (with.position.x + with.size.x))
-                return false;
-            if ((position.x + size.x) <= with.position.x)
-                return false;
-            if (position.y >= (with.position.y + with.size.y))
-                return false;
-            if ((position.y + size.y) <= with.position.y)
-                return false;
-            if (position.z >= (with.position.z + with.size.z))
-                return false;
-            if ((position.z + size.z) <= with.position.z)
-                return false;
-
-            return true;
-        }
-
-        public bool intersects_plane(Plane plane)
-        {
-            Vector3[] points =
-            {
-                new Vector3(position.x, position.y, position.z),
-                new Vector3(position.x, position.y, position.z + size.z),
-                new Vector3(position.x, position.y + size.y, position.z),
-                new Vector3(position.x, position.y + size.y, position.z + size.z),
-                new Vector3(position.x + size.x, position.y, position.z),
-                new Vector3(position.x + size.x, position.y, position.z + size.z),
-                new Vector3(position.x + size.x, position.y + size.y, position.z),
-                new Vector3(position.x + size.x, position.y + size.y, position.z + size.z),
-            };
-
-            bool over = false;
-            bool under = false;
-
-            for (int i = 0; i < 8; i++)
-            {
-                if (plane.distance_to(points[i]) > 0)
-                    over = true;
-                else
-                    under = true;
-            }
-
-            return under && over;
-        }
-
-        public bool intersects_segment(Vector3 from, Vector3 to)
-        {
-            float min = 0f;
-            float max = 1f;
-
-            for (int i = 0; i < 3; i++)
-            {
-                float seg_from = from[i];
-                float seg_to = to[i];
-                float box_begin = position[i];
-                float box_end = box_begin + size[i];
-                float cmin, cmax;
-
-                if (seg_from < seg_to)
-                {
-                    if (seg_from > box_end || seg_to < box_begin)
-                        return false;
-
-                    float length = seg_to - seg_from;
-                    cmin = seg_from < box_begin ? (box_begin - seg_from) / length : 0f;
-                    cmax = seg_to > box_end ? (box_end - seg_from) / length : 1f;
-                }
-                else
-                {
-                    if (seg_to > box_end || seg_from < box_begin)
-                        return false;
-
-                    float length = seg_to - seg_from;
-                    cmin = seg_from > box_end ? (box_end - seg_from) / length : 0f;
-                    cmax = seg_to < box_begin ? (box_begin - seg_from) / length : 1f;
-                }
-
-                if (cmin > min)
-                {
-                    min = cmin;
-                }
-
-                if (cmax < max)
-                    max = cmax;
-                if (max < min)
-                    return false;
-            }
-
-            return true;
-        }
-
-        public Rect3 merge(Rect3 with)
-        {
-            Vector3 beg_1 = position;
-            Vector3 beg_2 = with.position;
-            Vector3 end_1 = new Vector3(size.x, size.y, size.z) + beg_1;
-            Vector3 end_2 = new Vector3(with.size.x, with.size.y, with.size.z) + beg_2;
-
-            Vector3 min = new Vector3(
-                              (beg_1.x < beg_2.x) ? beg_1.x : beg_2.x,
-                              (beg_1.y < beg_2.y) ? beg_1.y : beg_2.y,
-                              (beg_1.z < beg_2.z) ? beg_1.z : beg_2.z
-                          );
-
-            Vector3 max = new Vector3(
-                              (end_1.x > end_2.x) ? end_1.x : end_2.x,
-                              (end_1.y > end_2.y) ? end_1.y : end_2.y,
-                              (end_1.z > end_2.z) ? end_1.z : end_2.z
-                          );
-
-            return new Rect3(min, max - min);
-        }
-
-        public Rect3(Vector3 position, Vector3 size)
-        {
-            this.position = position;
-            this.size = size;
-        }
-
-        public static bool operator ==(Rect3 left, Rect3 right)
-        {
-            return left.Equals(right);
-        }
-
-        public static bool operator !=(Rect3 left, Rect3 right)
-        {
-            return !left.Equals(right);
-        }
-
-        public override bool Equals(object obj)
-        {
-            if (obj is Rect3)
-            {
-                return Equals((Rect3)obj);
-            }
-
-            return false;
-        }
-
-        public bool Equals(Rect3 other)
-        {
-            return position == other.position && size == other.size;
-        }
-
-        public override int GetHashCode()
-        {
-            return position.GetHashCode() ^ size.GetHashCode();
-        }
-
-        public override string ToString()
-        {
-            return String.Format("{0} - {1}", new object[]
-                {
-                    this.position.ToString(),
-                    this.size.ToString()
-                });
-        }
-
-        public string ToString(string format)
-        {
-            return String.Format("{0} - {1}", new object[]
-                {
-                    this.position.ToString(format),
-                    this.size.ToString(format)
-                });
-        }
-    }
-}
+using System;
+
+// file: core/math/rect3.h
+// commit: 7ad14e7a3e6f87ddc450f7e34621eb5200808451
+// file: core/math/rect3.cpp
+// commit: bd282ff43f23fe845f29a3e25c8efc01bd65ffb0
+// file: core/variant_call.cpp
+// commit: 5ad9be4c24e9d7dc5672fdc42cea896622fe5685
+
+namespace Godot
+{
+    public struct Rect3 : IEquatable<Rect3>
+    {
+        private Vector3 position;
+        private Vector3 size;
+
+        public Vector3 Position
+        {
+            get
+            {
+                return position;
+            }
+        }
+
+        public Vector3 Size
+        {
+            get
+            {
+                return size;
+            }
+        }
+
+        public Vector3 End
+        {
+            get
+            {
+                return position + size;
+            }
+        }
+
+        public bool encloses(Rect3 with)
+        {
+            Vector3 src_min = position;
+            Vector3 src_max = position + size;
+            Vector3 dst_min = with.position;
+            Vector3 dst_max = with.position + with.size;
+
+            return ((src_min.x <= dst_min.x) &&
+                    (src_max.x > dst_max.x) &&
+                    (src_min.y <= dst_min.y) &&
+                    (src_max.y > dst_max.y) &&
+                    (src_min.z <= dst_min.z) &&
+                    (src_max.z > dst_max.z));
+        }
+
+        public Rect3 expand(Vector3 to_point)
+        {
+            Vector3 begin = position;
+            Vector3 end = position + size;
+
+            if (to_point.x < begin.x)
+                begin.x = to_point.x;
+            if (to_point.y < begin.y)
+                begin.y = to_point.y;
+            if (to_point.z < begin.z)
+                begin.z = to_point.z;
+
+            if (to_point.x > end.x)
+                end.x = to_point.x;
+            if (to_point.y > end.y)
+                end.y = to_point.y;
+            if (to_point.z > end.z)
+                end.z = to_point.z;
+
+            return new Rect3(begin, end - begin);
+        }
+
+        public float get_area()
+        {
+            return size.x * size.y * size.z;
+        }
+
+        public Vector3 get_endpoint(int idx)
+        {
+            switch (idx)
+            {
+                case 0:
+                    return new Vector3(position.x, position.y, position.z);
+                case 1:
+                    return new Vector3(position.x, position.y, position.z + size.z);
+                case 2:
+                    return new Vector3(position.x, position.y + size.y, position.z);
+                case 3:
+                    return new Vector3(position.x, position.y + size.y, position.z + size.z);
+                case 4:
+                    return new Vector3(position.x + size.x, position.y, position.z);
+                case 5:
+                    return new Vector3(position.x + size.x, position.y, position.z + size.z);
+                case 6:
+                    return new Vector3(position.x + size.x, position.y + size.y, position.z);
+                case 7:
+                    return new Vector3(position.x + size.x, position.y + size.y, position.z + size.z);
+                default:
+                    throw new ArgumentOutOfRangeException(nameof(idx), String.Format("Index is {0}, but a value from 0 to 7 is expected.", idx));
+            }
+        }
+
+        public Vector3 get_longest_axis()
+        {
+            Vector3 axis = new Vector3(1f, 0f, 0f);
+            float max_size = size.x;
+
+            if (size.y > max_size)
+            {
+                axis = new Vector3(0f, 1f, 0f);
+                max_size = size.y;
+            }
+
+            if (size.z > max_size)
+            {
+                axis = new Vector3(0f, 0f, 1f);
+                max_size = size.z;
+            }
+
+            return axis;
+        }
+
+        public Vector3.Axis get_longest_axis_index()
+        {
+            Vector3.Axis axis = Vector3.Axis.X;
+            float max_size = size.x;
+
+            if (size.y > max_size)
+            {
+                axis = Vector3.Axis.Y;
+                max_size = size.y;
+            }
+
+            if (size.z > max_size)
+            {
+                axis = Vector3.Axis.Z;
+                max_size = size.z;
+            }
+
+            return axis;
+        }
+
+        public float get_longest_axis_size()
+        {
+            float max_size = size.x;
+
+            if (size.y > max_size)
+                max_size = size.y;
+
+            if (size.z > max_size)
+                max_size = size.z;
+
+            return max_size;
+        }
+
+        public Vector3 get_shortest_axis()
+        {
+            Vector3 axis = new Vector3(1f, 0f, 0f);
+            float max_size = size.x;
+
+            if (size.y < max_size)
+            {
+                axis = new Vector3(0f, 1f, 0f);
+                max_size = size.y;
+            }
+
+            if (size.z < max_size)
+            {
+                axis = new Vector3(0f, 0f, 1f);
+                max_size = size.z;
+            }
+
+            return axis;
+        }
+
+        public Vector3.Axis get_shortest_axis_index()
+        {
+            Vector3.Axis axis = Vector3.Axis.X;
+            float max_size = size.x;
+
+            if (size.y < max_size)
+            {
+                axis = Vector3.Axis.Y;
+                max_size = size.y;
+            }
+
+            if (size.z < max_size)
+            {
+                axis = Vector3.Axis.Z;
+                max_size = size.z;
+            }
+
+            return axis;
+        }
+
+        public float get_shortest_axis_size()
+        {
+            float max_size = size.x;
+
+            if (size.y < max_size)
+                max_size = size.y;
+
+            if (size.z < max_size)
+                max_size = size.z;
+
+            return max_size;
+        }
+
+        public Vector3 get_support(Vector3 dir)
+        {
+            Vector3 half_extents = size * 0.5f;
+            Vector3 ofs = position + half_extents;
+
+            return ofs + new Vector3(
+                (dir.x > 0f) ? -half_extents.x : half_extents.x,
+                (dir.y > 0f) ? -half_extents.y : half_extents.y,
+                (dir.z > 0f) ? -half_extents.z : half_extents.z);
+        }
+
+        public Rect3 grow(float by)
+        {
+            Rect3 res = this;
+
+            res.position.x -= by;
+            res.position.y -= by;
+            res.position.z -= by;
+            res.size.x += 2.0f * by;
+            res.size.y += 2.0f * by;
+            res.size.z += 2.0f * by;
+
+            return res;
+        }
+
+        public bool has_no_area()
+        {
+            return size.x <= 0f || size.y <= 0f || size.z <= 0f;
+        }
+
+        public bool has_no_surface()
+        {
+            return size.x <= 0f && size.y <= 0f && size.z <= 0f;
+        }
+
+        public bool has_point(Vector3 point)
+        {
+            if (point.x < position.x)
+                return false;
+            if (point.y < position.y)
+                return false;
+            if (point.z < position.z)
+                return false;
+            if (point.x > position.x + size.x)
+                return false;
+            if (point.y > position.y + size.y)
+                return false;
+            if (point.z > position.z + size.z)
+                return false;
+
+            return true;
+        }
+
+        public Rect3 intersection(Rect3 with)
+        {
+            Vector3 src_min = position;
+            Vector3 src_max = position + size;
+            Vector3 dst_min = with.position;
+            Vector3 dst_max = with.position + with.size;
+
+            Vector3 min, max;
+
+            if (src_min.x > dst_max.x || src_max.x < dst_min.x)
+            {
+                return new Rect3();
+            }
+            else
+            {
+                min.x = (src_min.x > dst_min.x) ? src_min.x : dst_min.x;
+                max.x = (src_max.x < dst_max.x) ? src_max.x : dst_max.x;
+            }
+
+            if (src_min.y > dst_max.y || src_max.y < dst_min.y)
+            {
+                return new Rect3();
+            }
+            else
+            {
+                min.y = (src_min.y > dst_min.y) ? src_min.y : dst_min.y;
+                max.y = (src_max.y < dst_max.y) ? src_max.y : dst_max.y;
+            }
+
+            if (src_min.z > dst_max.z || src_max.z < dst_min.z)
+            {
+                return new Rect3();
+            }
+            else
+            {
+                min.z = (src_min.z > dst_min.z) ? src_min.z : dst_min.z;
+                max.z = (src_max.z < dst_max.z) ? src_max.z : dst_max.z;
+            }
+
+            return new Rect3(min, max - min);
+        }
+
+        public bool intersects(Rect3 with)
+        {
+            if (position.x >= (with.position.x + with.size.x))
+                return false;
+            if ((position.x + size.x) <= with.position.x)
+                return false;
+            if (position.y >= (with.position.y + with.size.y))
+                return false;
+            if ((position.y + size.y) <= with.position.y)
+                return false;
+            if (position.z >= (with.position.z + with.size.z))
+                return false;
+            if ((position.z + size.z) <= with.position.z)
+                return false;
+
+            return true;
+        }
+
+        public bool intersects_plane(Plane plane)
+        {
+            Vector3[] points =
+            {
+                new Vector3(position.x, position.y, position.z),
+                new Vector3(position.x, position.y, position.z + size.z),
+                new Vector3(position.x, position.y + size.y, position.z),
+                new Vector3(position.x, position.y + size.y, position.z + size.z),
+                new Vector3(position.x + size.x, position.y, position.z),
+                new Vector3(position.x + size.x, position.y, position.z + size.z),
+                new Vector3(position.x + size.x, position.y + size.y, position.z),
+                new Vector3(position.x + size.x, position.y + size.y, position.z + size.z),
+            };
+
+            bool over = false;
+            bool under = false;
+
+            for (int i = 0; i < 8; i++)
+            {
+                if (plane.distance_to(points[i]) > 0)
+                    over = true;
+                else
+                    under = true;
+            }
+
+            return under && over;
+        }
+
+        public bool intersects_segment(Vector3 from, Vector3 to)
+        {
+            float min = 0f;
+            float max = 1f;
+
+            for (int i = 0; i < 3; i++)
+            {
+                float seg_from = from[i];
+                float seg_to = to[i];
+                float box_begin = position[i];
+                float box_end = box_begin + size[i];
+                float cmin, cmax;
+
+                if (seg_from < seg_to)
+                {
+                    if (seg_from > box_end || seg_to < box_begin)
+                        return false;
+
+                    float length = seg_to - seg_from;
+                    cmin = seg_from < box_begin ? (box_begin - seg_from) / length : 0f;
+                    cmax = seg_to > box_end ? (box_end - seg_from) / length : 1f;
+                }
+                else
+                {
+                    if (seg_to > box_end || seg_from < box_begin)
+                        return false;
+
+                    float length = seg_to - seg_from;
+                    cmin = seg_from > box_end ? (box_end - seg_from) / length : 0f;
+                    cmax = seg_to < box_begin ? (box_begin - seg_from) / length : 1f;
+                }
+
+                if (cmin > min)
+                {
+                    min = cmin;
+                }
+
+                if (cmax < max)
+                    max = cmax;
+                if (max < min)
+                    return false;
+            }
+
+            return true;
+        }
+
+        public Rect3 merge(Rect3 with)
+        {
+            Vector3 beg_1 = position;
+            Vector3 beg_2 = with.position;
+            Vector3 end_1 = new Vector3(size.x, size.y, size.z) + beg_1;
+            Vector3 end_2 = new Vector3(with.size.x, with.size.y, with.size.z) + beg_2;
+
+            Vector3 min = new Vector3(
+                              (beg_1.x < beg_2.x) ? beg_1.x : beg_2.x,
+                              (beg_1.y < beg_2.y) ? beg_1.y : beg_2.y,
+                              (beg_1.z < beg_2.z) ? beg_1.z : beg_2.z
+                          );
+
+            Vector3 max = new Vector3(
+                              (end_1.x > end_2.x) ? end_1.x : end_2.x,
+                              (end_1.y > end_2.y) ? end_1.y : end_2.y,
+                              (end_1.z > end_2.z) ? end_1.z : end_2.z
+                          );
+
+            return new Rect3(min, max - min);
+        }
+
+        public Rect3(Vector3 position, Vector3 size)
+        {
+            this.position = position;
+            this.size = size;
+        }
+
+        public static bool operator ==(Rect3 left, Rect3 right)
+        {
+            return left.Equals(right);
+        }
+
+        public static bool operator !=(Rect3 left, Rect3 right)
+        {
+            return !left.Equals(right);
+        }
+
+        public override bool Equals(object obj)
+        {
+            if (obj is Rect3)
+            {
+                return Equals((Rect3)obj);
+            }
+
+            return false;
+        }
+
+        public bool Equals(Rect3 other)
+        {
+            return position == other.position && size == other.size;
+        }
+
+        public override int GetHashCode()
+        {
+            return position.GetHashCode() ^ size.GetHashCode();
+        }
+
+        public override string ToString()
+        {
+            return String.Format("{0} - {1}", new object[]
+                {
+                    this.position.ToString(),
+                    this.size.ToString()
+                });
+        }
+
+        public string ToString(string format)
+        {
+            return String.Format("{0} - {1}", new object[]
+                {
+                    this.position.ToString(format),
+                    this.size.ToString(format)
+                });
+        }
+    }
+}
diff --git a/modules/mono/glue/cs_files/ToolAttribute.cs b/modules/mono/glue/cs_files/ToolAttribute.cs
index 0275982c7fc..d8601b5b327 100644
--- a/modules/mono/glue/cs_files/ToolAttribute.cs
+++ b/modules/mono/glue/cs_files/ToolAttribute.cs
@@ -1,4 +1,4 @@
-﻿using System;
+using System;
 
 namespace Godot
 {
diff --git a/platform/android/java/gradlew.bat b/platform/android/java/gradlew.bat
index aec99730b4e..8a0b282aa68 100644
--- a/platform/android/java/gradlew.bat
+++ b/platform/android/java/gradlew.bat
@@ -1,90 +1,90 @@
-@if "%DEBUG%" == "" @echo off
-@rem ##########################################################################
-@rem
-@rem  Gradle startup script for Windows
-@rem
-@rem ##########################################################################
-
-@rem Set local scope for the variables with windows NT shell
-if "%OS%"=="Windows_NT" setlocal
-
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS=
-
-set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
-set APP_BASE_NAME=%~n0
-set APP_HOME=%DIRNAME%
-
-@rem Find java.exe
-if defined JAVA_HOME goto findJavaFromJavaHome
-
-set JAVA_EXE=java.exe
-%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto init
-
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:findJavaFromJavaHome
-set JAVA_HOME=%JAVA_HOME:"=%
-set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-
-if exist "%JAVA_EXE%" goto init
-
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:init
-@rem Get command-line arguments, handling Windowz variants
-
-if not "%OS%" == "Windows_NT" goto win9xME_args
-if "%@eval[2+2]" == "4" goto 4NT_args
-
-:win9xME_args
-@rem Slurp the command line arguments.
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-goto execute
-
-:4NT_args
-@rem Get arguments from the 4NT Shell from JP Software
-set CMD_LINE_ARGS=%$
-
-:execute
-@rem Setup the command line
-
-set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
-
-@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
-
-:end
-@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
-
-:fail
-rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
-rem the _cmd.exe /c_ return code!
-if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
-
-:mainEnd
-if "%OS%"=="Windows_NT" endlocal
-
-:omega
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windowz variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/thirdparty/etc2comp/AUTHORS b/thirdparty/etc2comp/AUTHORS
index 32daca27fe9..e78a7f4d218 100644
--- a/thirdparty/etc2comp/AUTHORS
+++ b/thirdparty/etc2comp/AUTHORS
@@ -1,7 +1,7 @@
-# This is the list of Etc2Comp authors for copyright purposes.
-#
-# This does not necessarily list everyone who has contributed code, since in
-# some cases, their employer may be the copyright holder.  To see the full list
-# of contributors, see the revision history in source control.
-Google Inc.
-Blue Shift Inc.
+# This is the list of Etc2Comp authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder.  To see the full list
+# of contributors, see the revision history in source control.
+Google Inc.
+Blue Shift Inc.
diff --git a/thirdparty/etc2comp/LICENSE b/thirdparty/etc2comp/LICENSE
index 75b52484ea4..d6456956733 100644
--- a/thirdparty/etc2comp/LICENSE
+++ b/thirdparty/etc2comp/LICENSE
@@ -1,202 +1,202 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/thirdparty/etc2comp/README.md b/thirdparty/etc2comp/README.md
index 1c70ae9f4eb..2f4363d0429 100644
--- a/thirdparty/etc2comp/README.md
+++ b/thirdparty/etc2comp/README.md
@@ -1,197 +1,197 @@
-# Etc2Comp - Texture to ETC2 compressor
-
-Etc2Comp is a command line tool that converts textures (e.g. bitmaps)
-into the [ETC2](https://en.wikipedia.org/wiki/Ericsson_Texture_Compression)
-format. The tool is built with a focus on encoding performance
-to reduce the amount of time required to compile asset heavy applications as
-well as reduce overall application size.
-
-This repo provides source code that can be compiled into a binary. The
-binary can then be used to convert textures to the ETC2 format.
-
-Important: This is not an official Google product. It is an experimental
-library published as-is. Please see the CONTRIBUTORS.md file for information
-about questions or issues.
-
-## Setup
-This project uses [CMake](https://cmake.org/) to generate platform-specific
-build files:
- - Linux: make files
- - OS X: Xcode workspace files
- - Microsoft Windows: Visual Studio solution files
- - Note: CMake supports other formats, but this doc only provides steps for
- one of each platform for brevity.
-
-Refer to each platform's setup section to setup your environment and build
-an Etc2Comp binary. Then skip to the usage section of this page for examples
-of how to use the library.
-
-### Setup for OS X
- build tested on this config:
-  OS X 10.9.5 i7 16GB RAM
-  Xcode 5.1.1
-  cmake 3.2.3
-  
-Start by downloading and installing the following components if they are not
-already installed on your development machine.
- - *Xcode* version 5.1.1, or greater
- - [CMake](https://cmake.org/download/) version 3.2.3, or greater
-
-To build the Etc2Comp binary:
- 1. Open a *Terminal* window and navigate to the project directory.
- 1. Run `mkdir build_xcode`
- 1. Run `cd build_xcode`
- 1. Run `cmake -G Xcode ../`
- 1. Open *Xcode* and import the `build_xcode/EtcTest.xcodeproj` file.
- 1. Open the Product menu and choose Build For -> Running.
- 1. Once the build succeeds the binary located at `build_xcode/EtcTool/Debug/EtcTool`
-can be executed.
-
-Optional
-Xcode EtcTool ‘Run’ preferences
-note: if the build_xcode/EtcTest.xcodeproj is manually deleted then some Xcode preferences 
-will need to be set by hand after cmake is run (these prefs are retained across 
-cmake updates if the .xcodeproj is not deleted/removed)
-
-1. Set the active scheme to ‘EtcTool’
-1. Edit the scheme
-1. Select option ‘Run EtcTool’, then tab ‘Arguments’. 
-Add this launch argument: ‘-argfile ../../EtcTool/args.txt’
-1. Select tab ‘Options’ and set a custom working directory to: ‘$(SRCROOT)/Build_Xcode/EtcTool’
-
-### SetUp for Windows
-
-1. Open a *Terminal* window and navigate to the project directory.
-1. Run `mkdir build_vs`
-1. Run `cd build_vs`
-1. Run CMAKE, noting what build version you need, and pointing to the parent directory as the source root; 
-  For VS 2013 : `cmake -G "Visual Studio 12 2013 Win64" ../`
-  For VS 2015 : `cmake -G "Visual Studio 14 2015 Win64" ../`
-  NOTE: To see what supported Visual Studio outputs there are, run `cmake -G`
-1. open the 'EtcTest' solution
-1. make the 'EtcTool' project the start up project 
-1. (optional) in the project properties, under 'Debugging ->command arguments' 
-add the argfile textfile thats included in the EtcTool directory. 
-example: -argfile C:\etc2\EtcTool\Args.txt
-
-### Setup For Linux
-The Linux build was tested on this config:
-  Ubuntu desktop 14.04
-  gcc/g++ 4.8
-  cmake 2.8.12.2
-
-1. Verify linux has cmake and C++-11 capable g++ installed
-1. Open shell
-1. Run `mkdir build_linux`
-1. Run `cd build_linux`
-1. Run `cmake ../`
-1. Run `make`
-1. navigate to the newly created EtcTool directory `cd EtcTool`
-1. run the executable: `./EtcTool -argfile ../../EtcTool/args.txt`
-
-Skip to the <a href="#usage">Usage</a> section for more information about using the
-tool.
-
-## Usage
-
-### Command Line Usage
-EtcTool can be run from the command line with the following usage:
-    etctool.exe source_image [options ...] -output encoded_image
-
-The encoder will use an array of RGBA floats read from the source_image to create 
-an ETC1 or ETC2 encoded image in encoded_image.  The RGBA floats should be in the 
-range [0:1].
-
-Options:
-
-    -analyze <analysis_folder>
-    -argfile <arg_file>           additional command line arguments read from a file
-    -blockAtHV <H V>              encodes a single block that contains the
-                                  pixel specified by the H V coordinates
-    -compare <comparison_image>   compares source_image to comparison_image
-    -effort <amount>              number between 0 and 100 to specify the encoding quality 
-                                  (100 is the highest quality)
-    -errormetric <error_metric>   specify the error metric, the options are
-                                  rgba, rgbx, rec709, numeric and normalxyz
-    -format <etc_format>          ETC1, RGB8, SRGB8, RGBA8, SRGB8, RGB8A1,
-                                  SRGB8A1 or R11
-    -help                         prints this message
-    -jobs or -j <thread_count>    specifies the number of threads (default=1)
-    -normalizexyz                 normalize RGB to have a length of 1
-    -verbose or -v                shows status information during the encoding
-                                  process
-	-mipmaps or -m <mip_count>    sets the maximum number of mipaps to generate (default=1)
-	-mipwrap or -w <x|y|xy>       sets the mipmap filter wrap mode (default=clamp)
-
-* -analyze will run an analysis of the encoding and place it in folder 
-"analysis_folder" (e.g. ../analysis/kodim05).  within the analysis_folder, a folder 
-will be created with a name of the current date/time (e.g. 20151204_153306).  this 
-date/time folder is used to compare encodings of the same texture over time.  
-within the date/time folder is a text file with several encoding stats and a 2x png 
-image showing the encoding mode for each 4x4 block.
-
-* -argfile allows additional command line arguments to be placed in a text file
-
-* -blockAtHV selects the 4x4 pixel subset of the source image at position (H,V).  
-This is mainly used for debugging
-
-* -compare compares the source image to the created encoded image. The encoding
-will dictate what error analysis is used in the comparison.
-
-* -effort uses an "amount" between 0 and 100 to determine how much additional effort 
-to apply during the encoding.
-
-* -errormetric selects the fitting algorithm used by the encoder.  "rgba" calculates 
-RMS error using RGB components that are weighted by A.  "rgbx" calculates RMS error 
-using RGBA components, where A is treated as an additional data channel, instead of 
-as alpha.  "rec709" is similar to "rgba", except the RGB components are also weighted 
-according to Rec709.  "numeric" calculates RMS error using unweighted RGBA components.  
-"normalize" calculates error based on dot product and vector length for RGB and RMS 
-error for A.
-
-* -help prints out the usage message
-
-* -jobs enables multi-threading to speed up image encoding
-
-* -normalizexyz normalizes the source RGB to have a length of 1.
-
-* -verbose shows information on the current encoding process. It will then display the 
-PSNR and time time it took to encode the image.
-
-* -mipmaps takes an argument that specifies how many mipmaps to generate from the 
-source image.  The mipmaps are generated with a lanczos3 filter using edge clamping.
-If the mipmaps option is not specified no mipmaps are created.
-
-* -mipwrap takes an argument that specifies the mipmap filter wrap mode.  The options 
-are "x", "y" and "xy" which specify wrapping in x only, y only or x and y respectively.
-The default options are clamping in both x and y.
-
-Note: Path names can use slashes or backslashes.  The tool will convert the 
-slashes to the appropriate polarity for the current platform.
-
-
-## API
-
-The library supports two different APIs - a C-like API that is not heavily 
-class-based and a class-based API.
-
-main() in EtcTool.cpp contains an example of both APIs.
-
-The Encode() method now returns an EncodingStatus that contains bit flags for
-reporting various warnings and flags encountered when encoding.
-
-
-## Copyright
-Copyright 2015 Etc2Comp Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
+# Etc2Comp - Texture to ETC2 compressor
+
+Etc2Comp is a command line tool that converts textures (e.g. bitmaps)
+into the [ETC2](https://en.wikipedia.org/wiki/Ericsson_Texture_Compression)
+format. The tool is built with a focus on encoding performance
+to reduce the amount of time required to compile asset heavy applications as
+well as reduce overall application size.
+
+This repo provides source code that can be compiled into a binary. The
+binary can then be used to convert textures to the ETC2 format.
+
+Important: This is not an official Google product. It is an experimental
+library published as-is. Please see the CONTRIBUTORS.md file for information
+about questions or issues.
+
+## Setup
+This project uses [CMake](https://cmake.org/) to generate platform-specific
+build files:
+ - Linux: make files
+ - OS X: Xcode workspace files
+ - Microsoft Windows: Visual Studio solution files
+ - Note: CMake supports other formats, but this doc only provides steps for
+ one of each platform for brevity.
+
+Refer to each platform's setup section to set up your environment and build
+an Etc2Comp binary. Then skip to the usage section of this page for examples
+of how to use the library.
+
+### Setup for OS X
+ build tested on this config:
+  OS X 10.9.5 i7 16GB RAM
+  Xcode 5.1.1
+  cmake 3.2.3
+  
+Start by downloading and installing the following components if they are not
+already installed on your development machine.
+ - *Xcode* version 5.1.1, or greater
+ - [CMake](https://cmake.org/download/) version 3.2.3, or greater
+
+To build the Etc2Comp binary:
+ 1. Open a *Terminal* window and navigate to the project directory.
+ 1. Run `mkdir build_xcode`
+ 1. Run `cd build_xcode`
+ 1. Run `cmake -G Xcode ../`
+ 1. Open *Xcode* and import the `build_xcode/EtcTest.xcodeproj` file.
+ 1. Open the Product menu and choose Build For -> Running.
+ 1. Once the build succeeds the binary located at `build_xcode/EtcTool/Debug/EtcTool`
+can be executed.
+
+Optional
+Xcode EtcTool ‘Run’ preferences
+note: if the build_xcode/EtcTest.xcodeproj is manually deleted then some Xcode preferences 
+will need to be set by hand after cmake is run (these prefs are retained across 
+cmake updates if the .xcodeproj is not deleted/removed)
+
+1. Set the active scheme to ‘EtcTool’
+1. Edit the scheme
+1. Select option ‘Run EtcTool’, then tab ‘Arguments’. 
+Add this launch argument: ‘-argfile ../../EtcTool/args.txt’
+1. Select tab ‘Options’ and set a custom working directory to: ‘$(SRCROOT)/Build_Xcode/EtcTool’
+
+### Setup for Windows
+
+1. Open a *Terminal* window and navigate to the project directory.
+1. Run `mkdir build_vs`
+1. Run `cd build_vs`
+1. Run CMAKE, noting what build version you need, and pointing to the parent directory as the source root; 
+  For VS 2013 : `cmake -G "Visual Studio 12 2013 Win64" ../`
+  For VS 2015 : `cmake -G "Visual Studio 14 2015 Win64" ../`
+  NOTE: To see what supported Visual Studio outputs there are, run `cmake -G`
+1. open the 'EtcTest' solution
+1. make the 'EtcTool' project the start up project 
+1. (optional) in the project properties, under 'Debugging ->command arguments' 
+add the argfile text file that's included in the EtcTool directory.
+example: -argfile C:\etc2\EtcTool\Args.txt
+
+### Setup For Linux
+The Linux build was tested on this config:
+  Ubuntu desktop 14.04
+  gcc/g++ 4.8
+  cmake 2.8.12.2
+
+1. Verify Linux has cmake and a C++11-capable g++ installed
+1. Open shell
+1. Run `mkdir build_linux`
+1. Run `cd build_linux`
+1. Run `cmake ../`
+1. Run `make`
+1. navigate to the newly created EtcTool directory `cd EtcTool`
+1. run the executable: `./EtcTool -argfile ../../EtcTool/args.txt`
+
+Skip to the <a href="#usage">Usage</a> section for more information about using the
+tool.
+
+## Usage
+
+### Command Line Usage
+EtcTool can be run from the command line with the following usage:
+    etctool.exe source_image [options ...] -output encoded_image
+
+The encoder will use an array of RGBA floats read from the source_image to create 
+an ETC1 or ETC2 encoded image in encoded_image.  The RGBA floats should be in the 
+range [0:1].
+
+Options:
+
+    -analyze <analysis_folder>
+    -argfile <arg_file>           additional command line arguments read from a file
+    -blockAtHV <H V>              encodes a single block that contains the
+                                  pixel specified by the H V coordinates
+    -compare <comparison_image>   compares source_image to comparison_image
+    -effort <amount>              number between 0 and 100 to specify the encoding quality 
+                                  (100 is the highest quality)
+    -errormetric <error_metric>   specify the error metric, the options are
+                                  rgba, rgbx, rec709, numeric and normalxyz
+    -format <etc_format>          ETC1, RGB8, SRGB8, RGBA8, SRGBA8, RGB8A1,
+                                  SRGB8A1 or R11
+    -help                         prints this message
+    -jobs or -j <thread_count>    specifies the number of threads (default=1)
+    -normalizexyz                 normalize RGB to have a length of 1
+    -verbose or -v                shows status information during the encoding
+                                  process
+    -mipmaps or -m <mip_count>    sets the maximum number of mipmaps to generate (default=1)
+    -mipwrap or -w <x|y|xy>       sets the mipmap filter wrap mode (default=clamp)
+
+* -analyze will run an analysis of the encoding and place it in folder 
+"analysis_folder" (e.g. ../analysis/kodim05).  within the analysis_folder, a folder 
+will be created with a name of the current date/time (e.g. 20151204_153306).  this 
+date/time folder is used to compare encodings of the same texture over time.  
+within the date/time folder is a text file with several encoding stats and a 2x png 
+image showing the encoding mode for each 4x4 block.
+
+* -argfile allows additional command line arguments to be placed in a text file
+
+* -blockAtHV selects the 4x4 pixel subset of the source image at position (H,V).  
+This is mainly used for debugging
+
+* -compare compares the source image to the created encoded image. The encoding
+will dictate what error analysis is used in the comparison.
+
+* -effort uses an "amount" between 0 and 100 to determine how much additional effort 
+to apply during the encoding.
+
+* -errormetric selects the fitting algorithm used by the encoder.  "rgba" calculates 
+RMS error using RGB components that are weighted by A.  "rgbx" calculates RMS error 
+using RGBA components, where A is treated as an additional data channel, instead of 
+as alpha.  "rec709" is similar to "rgba", except the RGB components are also weighted 
+according to Rec709.  "numeric" calculates RMS error using unweighted RGBA components.  
+"normalxyz" calculates error based on dot product and vector length for RGB and RMS 
+error for A.
+
+* -help prints out the usage message
+
+* -jobs enables multi-threading to speed up image encoding
+
+* -normalizexyz normalizes the source RGB to have a length of 1.
+
+* -verbose shows information on the current encoding process. It will then display the 
+PSNR and the time it took to encode the image.
+
+* -mipmaps takes an argument that specifies how many mipmaps to generate from the 
+source image.  The mipmaps are generated with a lanczos3 filter using edge clamping.
+If the mipmaps option is not specified no mipmaps are created.
+
+* -mipwrap takes an argument that specifies the mipmap filter wrap mode.  The options 
+are "x", "y" and "xy" which specify wrapping in x only, y only or x and y respectively.
+The default options are clamping in both x and y.
+
+Note: Path names can use slashes or backslashes.  The tool will convert the 
+slashes to the appropriate polarity for the current platform.
+
+
+## API
+
+The library supports two different APIs - a C-like API that is not heavily 
+class-based and a class-based API.
+
+main() in EtcTool.cpp contains an example of both APIs.
+
+The Encode() method now returns an EncodingStatus that contains bit flags for
+reporting various warnings and flags encountered when encoding.
+
+
+## Copyright
+Copyright 2015 Etc2Comp Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c
index ac9dacf3773..94f1d06513e 100644
--- a/thirdparty/libtheora/x86_vc/mmxencfrag.c
+++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c
@@ -1,969 +1,969 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
-
- ********************************************************************/
-#include <stddef.h>
-#include "x86enc.h"
-
-#if defined(OC_X86_ASM)
-
-unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride){
-  ptrdiff_t ret;
-  __asm{
-#define SRC esi
-#define REF edx
-#define YSTRIDE ecx
-#define YSTRIDE3 edi
-    mov YSTRIDE,_ystride
-    mov SRC,_src
-    mov REF,_ref
-    /*Load the first 4 rows of each block.*/
-    movq mm0,[SRC]
-    movq mm1,[REF]
-    movq mm2,[SRC][YSTRIDE]
-    movq mm3,[REF][YSTRIDE]
-    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
-    movq mm4,[SRC+YSTRIDE*2]
-    movq mm5,[REF+YSTRIDE*2]
-    movq mm6,[SRC+YSTRIDE3]
-    movq mm7,[REF+YSTRIDE3]
-    /*Compute their SADs and add them in mm0*/
-    psadbw mm0,mm1
-    psadbw mm2,mm3
-    lea SRC,[SRC+YSTRIDE*4]
-    paddw mm0,mm2
-    lea REF,[REF+YSTRIDE*4]
-    /*Load the next 3 rows as registers become available.*/
-    movq mm2,[SRC]
-    movq mm3,[REF]
-    psadbw mm4,mm5
-    psadbw mm6,mm7
-    paddw mm0,mm4
-    movq mm5,[REF+YSTRIDE]
-    movq mm4,[SRC+YSTRIDE]
-    paddw mm0,mm6
-    movq mm7,[REF+YSTRIDE*2]
-    movq mm6,[SRC+YSTRIDE*2]
-    /*Start adding their SADs to mm0*/
-    psadbw mm2,mm3
-    psadbw mm4,mm5
-    paddw mm0,mm2
-    psadbw mm6,mm7
-    /*Load last row as registers become available.*/
-    movq mm2,[SRC+YSTRIDE3]
-    movq mm3,[REF+YSTRIDE3]
-    /*And finish adding up their SADs.*/
-    paddw mm0,mm4
-    psadbw mm2,mm3
-    paddw mm0,mm6
-    paddw mm0,mm2
-    movd [ret],mm0
-#undef SRC
-#undef REF
-#undef YSTRIDE
-#undef YSTRIDE3
-  }
-  return (unsigned)ret;
-}
-
-unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  /*Early termination is for suckers.*/
-  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
-}
-
-#define OC_SAD2_LOOP __asm{ \
-  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
-     pavgb computes (mm0+mm1+1>>1). \
-   The latter is exactly 1 too large when the low bit of two corresponding \
-    bytes is only set in one of them. \
-   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
-    correct the output of pavgb.*/ \
-  __asm  movq mm6,mm0 \
-  __asm  lea REF1,[REF1+YSTRIDE*2] \
-  __asm  pxor mm0,mm1 \
-  __asm  pavgb mm6,mm1 \
-  __asm  lea REF2,[REF2+YSTRIDE*2] \
-  __asm  movq mm1,mm2 \
-  __asm  pand mm0,mm7 \
-  __asm  pavgb mm2,mm3 \
-  __asm  pxor mm1,mm3 \
-  __asm  movq mm3,[REF2+YSTRIDE] \
-  __asm  psubb mm6,mm0 \
-  __asm  movq mm0,[REF1] \
-  __asm  pand mm1,mm7 \
-  __asm  psadbw mm4,mm6 \
-  __asm  movd mm6,RET \
-  __asm  psubb mm2,mm1 \
-  __asm  movq mm1,[REF2] \
-  __asm  lea SRC,[SRC+YSTRIDE*2] \
-  __asm  psadbw mm5,mm2 \
-  __asm  movq mm2,[REF1+YSTRIDE] \
-  __asm  paddw mm5,mm4 \
-  __asm  movq mm4,[SRC] \
-  __asm  paddw mm6,mm5 \
-  __asm  movq mm5,[SRC+YSTRIDE] \
-  __asm  movd RET,mm6 \
-}
-
-/*Same as above, but does not pre-load the next two rows.*/
-#define OC_SAD2_TAIL __asm{ \
-  __asm  movq mm6,mm0 \
-  __asm  pavgb mm0,mm1 \
-  __asm  pxor mm6,mm1 \
-  __asm  movq mm1,mm2 \
-  __asm  pand mm6,mm7 \
-  __asm  pavgb mm2,mm3 \
-  __asm  pxor mm1,mm3 \
-  __asm  psubb mm0,mm6 \
-  __asm  pand mm1,mm7 \
-  __asm  psadbw mm4,mm0 \
-  __asm  psubb mm2,mm1 \
-  __asm  movd mm6,RET \
-  __asm  psadbw mm5,mm2 \
-  __asm  paddw mm5,mm4 \
-  __asm  paddw mm6,mm5 \
-  __asm  movd RET,mm6 \
-}
-
-unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
-  ptrdiff_t ret;
-  __asm{
-#define REF1 ecx
-#define REF2 edi
-#define YSTRIDE esi
-#define SRC edx
-#define RET eax
-    mov YSTRIDE,_ystride
-    mov SRC,_src
-    mov REF1,_ref1
-    mov REF2,_ref2
-    movq mm0,[REF1]
-    movq mm1,[REF2]
-    movq mm2,[REF1+YSTRIDE]
-    movq mm3,[REF2+YSTRIDE]
-    xor RET,RET
-    movq mm4,[SRC]
-    pxor mm7,mm7
-    pcmpeqb mm6,mm6
-    movq mm5,[SRC+YSTRIDE]
-    psubb mm7,mm6
-    OC_SAD2_LOOP
-    OC_SAD2_LOOP
-    OC_SAD2_LOOP
-    OC_SAD2_TAIL
-    mov [ret],RET
-#undef REF1
-#undef REF2
-#undef YSTRIDE
-#undef SRC
-#undef RET
-  }
-  return (unsigned)ret;
-}
-
-/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
-  16-bit difference in mm0...mm7.*/
-#define OC_LOAD_SUB_8x4(_off) __asm{ \
-  __asm  movd mm0,[_off+SRC] \
-  __asm  movd mm4,[_off+REF] \
-  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  movd mm2,[_off+SRC] \
-  __asm  movd mm7,[_off+REF] \
-  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
-  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
-  __asm  punpcklbw mm0,mm4 \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  punpcklbw mm4,mm4 \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  psubw mm0,mm4 \
-  __asm  movd mm4,[_off+SRC] \
-  __asm  movq [_off*2+BUF],mm0 \
-  __asm  movd mm0,[_off+REF] \
-  __asm  punpcklbw mm1,mm5 \
-  __asm  punpcklbw mm5,mm5 \
-  __asm  psubw mm1,mm5 \
-  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
-  __asm  punpcklbw mm2,mm7 \
-  __asm  punpcklbw mm7,mm7 \
-  __asm  psubw mm2,mm7 \
-  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
-  __asm  punpcklbw mm3,mm6 \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  punpcklbw mm6,mm6 \
-  __asm  psubw mm3,mm6 \
-  __asm  movd mm6,[_off+SRC] \
-  __asm  punpcklbw mm4,mm0 \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  punpcklbw mm0,mm0 \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-  __asm  psubw mm4,mm0 \
-  __asm  movd mm0,[_off+REF] \
-  __asm  punpcklbw mm5,mm7 \
-  __asm  neg SRC_YSTRIDE \
-  __asm  punpcklbw mm7,mm7 \
-  __asm  psubw mm5,mm7 \
-  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
-  __asm  punpcklbw mm6,mm0 \
-  __asm  lea REF,[REF+REF_YSTRIDE*2] \
-  __asm  punpcklbw mm0,mm0 \
-  __asm  neg REF_YSTRIDE \
-  __asm  psubw mm6,mm0 \
-  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
-  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
-  __asm  punpcklbw mm7,mm0 \
-  __asm  neg SRC_YSTRIDE \
-  __asm  punpcklbw mm0,mm0 \
-  __asm  lea REF,[REF+REF_YSTRIDE*8] \
-  __asm  psubw mm7,mm0 \
-  __asm  neg REF_YSTRIDE \
-  __asm  movq mm0,[_off*2+BUF] \
-}
-
-/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
-#define OC_LOAD_8x4(_off) __asm{ \
-  __asm  movd mm0,[_off+SRC] \
-  __asm  movd mm1,[_off+SRC+YSTRIDE] \
-  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
-  __asm  pxor mm7,mm7 \
-  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
-  __asm  punpcklbw mm0,mm7 \
-  __asm  movd mm4,[_off+SRC4] \
-  __asm  punpcklbw mm1,mm7 \
-  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
-  __asm  punpcklbw mm2,mm7 \
-  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
-  __asm  punpcklbw mm3,mm7 \
-  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
-  __asm  punpcklbw mm4,mm4 \
-  __asm  punpcklbw mm5,mm5 \
-  __asm  psrlw mm4,8 \
-  __asm  psrlw mm5,8 \
-  __asm  punpcklbw mm6,mm6 \
-  __asm  punpcklbw mm7,mm7 \
-  __asm  psrlw mm6,8 \
-  __asm  psrlw mm7,8 \
-}
-
-/*Performs the first two stages of an 8-point 1-D Hadamard transform.
-  The transform is performed in place, except that outputs 0-3 are swapped with
-   outputs 4-7.
-  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
-   perform this stage in place with no temporary registers).*/
-#define OC_HADAMARD_AB_8x4 __asm{ \
-  /*Stage A: \
-    Outputs 0-3 are swapped with 4-7 here.*/ \
-  __asm  paddw mm5,mm1 \
-  __asm  paddw mm6,mm2 \
-  __asm  paddw mm1,mm1 \
-  __asm  paddw mm2,mm2 \
-  __asm  psubw mm1,mm5 \
-  __asm  psubw mm2,mm6 \
-  __asm  paddw mm7,mm3 \
-  __asm  paddw mm4,mm0 \
-  __asm  paddw mm3,mm3 \
-  __asm  paddw mm0,mm0 \
-  __asm  psubw mm3,mm7 \
-  __asm  psubw mm0,mm4 \
-   /*Stage B:*/ \
-  __asm  paddw mm0,mm2 \
-  __asm  paddw mm1,mm3 \
-  __asm  paddw mm4,mm6 \
-  __asm  paddw mm5,mm7 \
-  __asm  paddw mm2,mm2 \
-  __asm  paddw mm3,mm3 \
-  __asm  paddw mm6,mm6 \
-  __asm  paddw mm7,mm7 \
-  __asm  psubw mm2,mm0 \
-  __asm  psubw mm3,mm1 \
-  __asm  psubw mm6,mm4 \
-  __asm  psubw mm7,mm5 \
-}
-
-/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
-  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
-   place with no temporary registers).*/
-#define OC_HADAMARD_C_8x4 __asm{ \
-  /*Stage C:*/ \
-  __asm  paddw mm0,mm1 \
-  __asm  paddw mm2,mm3 \
-  __asm  paddw mm4,mm5 \
-  __asm  paddw mm6,mm7 \
-  __asm  paddw mm1,mm1 \
-  __asm  paddw mm3,mm3 \
-  __asm  paddw mm5,mm5 \
-  __asm  paddw mm7,mm7 \
-  __asm  psubw mm1,mm0 \
-  __asm  psubw mm3,mm2 \
-  __asm  psubw mm5,mm4 \
-  __asm  psubw mm7,mm6 \
-}
-
-/*Performs an 8-point 1-D Hadamard transform.
-  The transform is performed in place, except that outputs 0-3 are swapped with
-   outputs 4-7.
-  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
-   in place with no temporary registers).*/
-#define OC_HADAMARD_8x4 __asm{ \
-  OC_HADAMARD_AB_8x4 \
-  OC_HADAMARD_C_8x4 \
-}
-
-/*Performs the first part of the final stage of the Hadamard transform and
-   summing of absolute values.
-  At the end of this part, mm1 will contain the DC coefficient of the
-   transform.*/
-#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
-  /*We use the fact that \
-      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
-     to merge the final butterfly with the abs and the first stage of \
-     accumulation. \
-    Thus we can avoid using pabsw, which is not available until SSSE3. \
-    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
-     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
-     registers). \
-    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
-    This implementation is only 26 (+4 for spilling registers).*/ \
-  __asm  movq [_r7+BUF],mm7 \
-  __asm  movq [_r6+BUF],mm6 \
-  /*mm7={0x7FFF}x4 \
-    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
-  __asm  pcmpeqb mm7,mm7 \
-  __asm  movq mm6,mm0 \
-  __asm  psrlw mm7,1 \
-  __asm  paddw mm6,mm1 \
-  __asm  pmaxsw mm0,mm1 \
-  __asm  paddsw mm6,mm7 \
-  __asm  psubw mm0,mm6 \
-  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
-    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
-  __asm  movq mm6,mm2 \
-  __asm  movq mm1,mm4 \
-  __asm  pmaxsw mm2,mm3 \
-  __asm  pmaxsw mm4,mm5 \
-  __asm  paddw mm6,mm3 \
-  __asm  paddw mm1,mm5 \
-  __asm  movq mm3,[_r7+BUF] \
-}
-
-/*Performs the second part of the final stage of the Hadamard transform and
-   summing of absolute values.*/
-#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
-  __asm  paddsw mm6,mm7 \
-  __asm  movq mm5,[_r6+BUF] \
-  __asm  paddsw mm1,mm7 \
-  __asm  psubw mm2,mm6 \
-  __asm  psubw mm4,mm1 \
-  /*mm7={1}x4 (needed for the horizontal add that follows) \
-    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
-  __asm  movq mm6,mm3 \
-  __asm  pmaxsw mm3,mm5 \
-  __asm  paddw mm0,mm2 \
-  __asm  paddw mm6,mm5 \
-  __asm  paddw mm0,mm4 \
-  __asm  paddsw mm6,mm7 \
-  __asm  paddw mm0,mm3 \
-  __asm  psrlw mm7,14 \
-  __asm  psubw mm0,mm6 \
-}
-
-/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
-   absolute value of each component, and accumulates everything into mm0.
-  This is the only portion of SATD which requires MMXEXT (we could use plain
-   MMX, but it takes 4 instructions and an extra register to work around the
-   lack of a pmaxsw, which is a pretty serious penalty).*/
-#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
-  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
-  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
-}
-
-/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
-   component, and accumulates everything into mm0.
-  Note that mm0 will have an extra 4 added to each column, and that after
-   removing this value, the remainder will be half the conventional value.*/
-#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
-  OC_HADAMARD_AB_8x4 \
-  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
-}
-
-/*Performs two 4x4 transposes (mostly) in place.
-  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
-   contains rows {a,b,c,d}.
-  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
-   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
-#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
-  /*First 4x4 transpose:*/ \
-  __asm  movq [0x10+_off+BUF],mm5 \
-  /*mm0 = e3 e2 e1 e0 \
-    mm1 = f3 f2 f1 f0 \
-    mm2 = g3 g2 g1 g0 \
-    mm3 = h3 h2 h1 h0*/ \
-  __asm  movq mm5,mm2 \
-  __asm  punpcklwd mm2,mm3 \
-  __asm  punpckhwd mm5,mm3 \
-  __asm  movq mm3,mm0 \
-  __asm  punpcklwd mm0,mm1 \
-  __asm  punpckhwd mm3,mm1 \
-  /*mm0 = f1 e1 f0 e0 \
-    mm3 = f3 e3 f2 e2 \
-    mm2 = h1 g1 h0 g0 \
-    mm5 = h3 g3 h2 g2*/ \
-  __asm  movq mm1,mm0 \
-  __asm  punpckldq mm0,mm2 \
-  __asm  punpckhdq mm1,mm2 \
-  __asm  movq mm2,mm3 \
-  __asm  punpckhdq mm3,mm5 \
-  __asm  movq [0x40+_off+BUF],mm0 \
-  __asm  punpckldq mm2,mm5 \
-  /*mm0 = h0 g0 f0 e0 \
-    mm1 = h1 g1 f1 e1 \
-    mm2 = h2 g2 f2 e2 \
-    mm3 = h3 g3 f3 e3*/ \
-  __asm  movq mm5,[0x10+_off+BUF] \
-  /*Second 4x4 transpose:*/ \
-  /*mm4 = a3 a2 a1 a0 \
-    mm5 = b3 b2 b1 b0 \
-    mm6 = c3 c2 c1 c0 \
-    mm7 = d3 d2 d1 d0*/ \
-  __asm  movq mm0,mm6 \
-  __asm  punpcklwd mm6,mm7 \
-  __asm  movq [0x50+_off+BUF],mm1 \
-  __asm  punpckhwd mm0,mm7 \
-  __asm  movq mm7,mm4 \
-  __asm  punpcklwd mm4,mm5 \
-  __asm  movq [0x60+_off+BUF],mm2 \
-  __asm  punpckhwd mm7,mm5 \
-  /*mm4 = b1 a1 b0 a0 \
-    mm7 = b3 a3 b2 a2 \
-    mm6 = d1 c1 d0 c0 \
-    mm0 = d3 c3 d2 c2*/ \
-  __asm  movq mm5,mm4 \
-  __asm  punpckldq mm4,mm6 \
-  __asm  movq [0x70+_off+BUF],mm3 \
-  __asm  punpckhdq mm5,mm6 \
-  __asm  movq mm6,mm7 \
-  __asm  punpckhdq mm7,mm0 \
-  __asm  punpckldq mm6,mm0 \
-  /*mm4 = d0 c0 b0 a0 \
-    mm5 = d1 c1 b1 a1 \
-    mm6 = d2 c2 b2 a2 \
-    mm7 = d3 c3 b3 a3*/ \
-}
-
-static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
-  bufp=buf;
-  __asm{
-#define SRC esi
-#define REF eax
-#define SRC_YSTRIDE ecx
-#define REF_YSTRIDE edx
-#define BUF edi
-#define RET eax
-#define RET2 edx
-    mov SRC,_src
-    mov SRC_YSTRIDE,_src_ystride
-    mov REF,_ref
-    mov REF_YSTRIDE,_ref_ystride
-    mov BUF,bufp
-    OC_LOAD_SUB_8x4(0x00)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x00)
-    /*Finish swapping out this 8x4 block to make room for the next one.
-      mm0...mm3 have been swapped out already.*/
-    movq [0x00+BUF],mm4
-    movq [0x10+BUF],mm5
-    movq [0x20+BUF],mm6
-    movq [0x30+BUF],mm7
-    OC_LOAD_SUB_8x4(0x04)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x08)
-    /*Here the first 4x4 block of output from the last transpose is the second
-       4x4 block of input for the next transform.
-      We have cleverly arranged that it already be in the appropriate place, so
-       we only have to do half the loads.*/
-    movq mm1,[0x10+BUF]
-    movq mm2,[0x20+BUF]
-    movq mm3,[0x30+BUF]
-    movq mm0,[0x00+BUF]
-    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
-    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
-       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
-       for the factor of two we dropped + 3 for the vertical accumulation).
-      Now we finally have to promote things to dwords.
-      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
-       latency of pmaddwd by starting the next series of loads now.*/
-    mov RET2,_thresh
-    pmaddwd mm0,mm7
-    movq mm1,[0x50+BUF]
-    movq mm5,[0x58+BUF]
-    movq mm4,mm0
-    movq mm2,[0x60+BUF]
-    punpckhdq mm0,mm0
-    movq mm6,[0x68+BUF]
-    paddd mm4,mm0
-    movq mm3,[0x70+BUF]
-    movd RET,mm4
-    movq mm7,[0x78+BUF]
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    lea RET,[RET+RET-32]
-    movq mm0,[0x40+BUF]
-    cmp RET,RET2
-    movq mm4,[0x48+BUF]
-    jae at_end
-    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
-    pmaddwd mm0,mm7
-    /*There isn't much to stick in here to hide the latency this time, but the
-       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
-       latency is even worse.*/
-    sub RET,32
-    movq mm4,mm0
-    punpckhdq mm0,mm0
-    paddd mm4,mm0
-    movd RET2,mm4
-    lea RET,[RET+RET2*2]
-    align 16
-at_end:
-    mov ret1,RET
-#undef SRC
-#undef REF
-#undef SRC_YSTRIDE
-#undef REF_YSTRIDE
-#undef BUF
-#undef RET
-#undef RET2
-  }
-  return ret1;
-}
-
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
-}
-
-
-/*Our internal implementation of frag_copy2 takes an extra stride parameter so
-   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
-static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
-  __asm{
-    /*Load the first 3 rows.*/
-#define DST_YSTRIDE edi
-#define SRC_YSTRIDE esi
-#define DST eax
-#define SRC1 edx
-#define SRC2 ecx
-    mov DST_YSTRIDE,_dst_ystride
-    mov SRC_YSTRIDE,_src_ystride
-    mov DST,_dst
-    mov SRC1,_src1
-    mov SRC2,_src2
-    movq mm0,[SRC1]
-    movq mm1,[SRC2]
-    movq mm2,[SRC1+SRC_YSTRIDE]
-    lea SRC1,[SRC1+SRC_YSTRIDE*2]
-    movq mm3,[SRC2+SRC_YSTRIDE]
-    lea SRC2,[SRC2+SRC_YSTRIDE*2]
-    pxor mm7,mm7
-    movq mm4,[SRC1]
-    pcmpeqb mm6,mm6
-    movq mm5,[SRC2]
-    /*mm7={1}x8.*/
-    psubb mm7,mm6
-    /*Start averaging mm0 and mm1 into mm6.*/
-    movq mm6,mm0
-    pxor mm0,mm1
-    pavgb mm6,mm1
-    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
-    movq mm1,mm2
-    pand mm0,mm7
-    pavgb mm2,mm3
-    pxor mm1,mm3
-    /*mm3 is free.*/
-    psubb mm6,mm0
-    /*mm0 is free, start loading the next row.*/
-    movq mm0,[SRC1+SRC_YSTRIDE]
-    /*Start averaging mm5 and mm4 using mm3.*/
-    movq mm3,mm4
-    /*mm6 [row 0] is done; write it out.*/
-    movq [DST],mm6
-    pand mm1,mm7
-    pavgb mm4,mm5
-    psubb mm2,mm1
-    /*mm1 is free, continue loading the next row.*/
-    movq mm1,[SRC2+SRC_YSTRIDE]
-    pxor mm3,mm5
-    lea SRC1,[SRC1+SRC_YSTRIDE*2]
-    /*mm2 [row 1] is done; write it out.*/
-    movq [DST+DST_YSTRIDE],mm2
-    pand mm3,mm7
-    /*Start loading the next row.*/
-    movq mm2,[SRC1]
-    lea DST,[DST+DST_YSTRIDE*2]
-    psubb mm4,mm3
-    lea SRC2,[SRC2+SRC_YSTRIDE*2]
-    /*mm4 [row 2] is done; write it out.*/
-    movq [DST],mm4
-    /*Continue loading the next row.*/
-    movq mm3,[SRC2]
-    /*Start averaging mm0 and mm1 into mm6.*/
-    movq mm6,mm0
-    pxor mm0,mm1
-    /*Start loading the next row.*/
-    movq mm4,[SRC1+SRC_YSTRIDE]
-    pavgb mm6,mm1
-    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
-    movq mm1,mm2
-    pand mm0,mm7
-    /*Continue loading the next row.*/
-    movq mm5,[SRC2+SRC_YSTRIDE]
-    pavgb mm2,mm3
-    lea SRC1,[SRC1+SRC_YSTRIDE*2]
-    pxor mm1,mm3
-    /*mm3 is free.*/
-    psubb mm6,mm0
-    /*mm0 is free, start loading the next row.*/
-    movq mm0,[SRC1]
-    /*Start averaging mm5 into mm4 using mm3.*/
-    movq mm3,mm4
-    /*mm6 [row 3] is done; write it out.*/
-    movq [DST+DST_YSTRIDE],mm6
-    pand mm1,mm7
-    lea SRC2,[SRC2+SRC_YSTRIDE*2]
-    pavgb mm4,mm5
-    lea DST,[DST+DST_YSTRIDE*2]
-    psubb mm2,mm1
-    /*mm1 is free; continue loading the next row.*/
-    movq mm1,[SRC2]
-    pxor mm3,mm5
-    /*mm2 [row 4] is done; write it out.*/
-    movq [DST],mm2
-    pand mm3,mm7
-    /*Start loading the next row.*/
-    movq mm2,[SRC1+SRC_YSTRIDE]
-    psubb mm4,mm3
-    /*Start averaging mm0 and mm1 into mm6.*/
-    movq mm6,mm0
-    /*Continue loading the next row.*/
-    movq mm3,[SRC2+SRC_YSTRIDE]
-    /*mm4 [row 5] is done; write it out.*/
-    movq [DST+DST_YSTRIDE],mm4
-    pxor mm0,mm1
-    pavgb mm6,mm1
-    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
-    movq mm4,mm2
-    pand mm0,mm7
-    pavgb mm2,mm3
-    pxor mm4,mm3
-    lea DST,[DST+DST_YSTRIDE*2]
-    psubb mm6,mm0
-    pand mm4,mm7
-    /*mm6 [row 6] is done, write it out.*/
-    movq [DST],mm6
-    psubb mm2,mm4
-    /*mm2 [row 7] is done, write it out.*/
-    movq [DST+DST_YSTRIDE],mm2
-#undef SRC1
-#undef SRC2
-#undef SRC_YSTRIDE
-#undef DST_YSTRIDE
-#undef DST
-  }
-}
-
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
-  OC_ALIGN8(unsigned char ref[64]);
-  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
-}
-
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
- int _ystride){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
-  bufp=buf;
-  __asm{
-#define SRC eax
-#define SRC4 esi
-#define BUF edi
-#define RET eax
-#define RET_WORD ax
-#define RET2 ecx
-#define YSTRIDE edx
-#define YSTRIDE3 ecx
-    mov SRC,_src
-    mov BUF,bufp
-    mov YSTRIDE,_ystride
-    /* src4 = src+4*ystride */
-    lea SRC4,[SRC+YSTRIDE*4]
-    /* ystride3 = 3*ystride */
-    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
-    OC_LOAD_8x4(0x00)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x00)
-    /*Finish swapping out this 8x4 block to make room for the next one.
-      mm0...mm3 have been swapped out already.*/
-    movq [0x00+BUF],mm4
-    movq [0x10+BUF],mm5
-    movq [0x20+BUF],mm6
-    movq [0x30+BUF],mm7
-    OC_LOAD_8x4(0x04)
-    OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2(0x08)
-    /*Here the first 4x4 block of output from the last transpose is the second
-      4x4 block of input for the next transform.
-      We have cleverly arranged that it already be in the appropriate place, so
-      we only have to do half the loads.*/
-    movq mm1,[0x10+BUF]
-    movq mm2,[0x20+BUF]
-    movq mm3,[0x30+BUF]
-    movq mm0,[0x00+BUF]
-    /*We split out the stages here so we can save the DC coefficient in the
-      middle.*/
-    OC_HADAMARD_AB_8x4
-    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
-    movd RET,mm1
-    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
-    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
-      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
-      for the factor of two we dropped + 3 for the vertical accumulation).
-      Now we finally have to promote things to dwords.
-      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
-      latency of pmaddwd by starting the next series of loads now.*/
-    pmaddwd mm0,mm7
-    movq mm1,[0x50+BUF]
-    movq mm5,[0x58+BUF]
-    movq mm2,[0x60+BUF]
-    movq mm4,mm0
-    movq mm6,[0x68+BUF]
-    punpckhdq mm0,mm0
-    movq mm3,[0x70+BUF]
-    paddd mm4,mm0
-    movq mm7,[0x78+BUF]
-    movd RET2,mm4
-    movq mm0,[0x40+BUF]
-    movq mm4,[0x48+BUF]
-    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
-    pmaddwd mm0,mm7
-    /*We assume that the DC coefficient is always positive (which is true,
-    because the input to the INTRA transform was not a difference).*/
-    movzx RET,RET_WORD
-    add RET2,RET2
-    sub RET2,RET
-    movq mm4,mm0
-    punpckhdq mm0,mm0
-    paddd mm4,mm0
-    movd RET,mm4
-    lea RET,[-64+RET2+RET*2]
-    mov [ret1],RET
-#undef SRC
-#undef SRC4
-#undef BUF
-#undef RET
-#undef RET_WORD
-#undef RET2
-#undef YSTRIDE
-#undef YSTRIDE3
-  }
-  return ret1;
-}
-
-void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
- const unsigned char *_src, const unsigned char *_ref,int _ystride){
-  int i;
-  __asm  pxor mm7,mm7
-  for(i=4;i-->0;){
-    __asm{
-#define SRC edx
-#define YSTRIDE esi
-#define RESIDUE eax
-#define REF ecx
-      mov YSTRIDE,_ystride
-      mov RESIDUE,_residue
-      mov SRC,_src
-      mov REF,_ref
-      /*mm0=[src]*/
-      movq mm0,[SRC]
-      /*mm1=[ref]*/
-      movq mm1,[REF]
-      /*mm4=[src+ystride]*/
-      movq mm4,[SRC+YSTRIDE]
-      /*mm5=[ref+ystride]*/
-      movq mm5,[REF+YSTRIDE]
-      /*Compute [src]-[ref].*/
-      movq mm2,mm0
-      punpcklbw mm0,mm7
-      movq mm3,mm1
-      punpckhbw mm2,mm7
-      punpcklbw mm1,mm7
-      punpckhbw mm3,mm7
-      psubw mm0,mm1
-      psubw mm2,mm3
-      /*Compute [src+ystride]-[ref+ystride].*/
-      movq mm1,mm4
-      punpcklbw mm4,mm7
-      movq mm3,mm5
-      punpckhbw mm1,mm7
-      lea SRC,[SRC+YSTRIDE*2]
-      punpcklbw mm5,mm7
-      lea REF,[REF+YSTRIDE*2]
-      punpckhbw mm3,mm7
-      psubw mm4,mm5
-      psubw mm1,mm3
-      /*Write the answer out.*/
-      movq [RESIDUE+0x00],mm0
-      movq [RESIDUE+0x08],mm2
-      movq [RESIDUE+0x10],mm4
-      movq [RESIDUE+0x18],mm1
-      lea RESIDUE,[RESIDUE+0x20]
-      mov _residue,RESIDUE
-      mov _src,SRC
-      mov _ref,REF
-#undef SRC
-#undef YSTRIDE
-#undef RESIDUE
-#undef REF
-    }
-  }
-}
-
-void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
- const unsigned char *_src,int _ystride){
-   __asm{
-#define YSTRIDE edx
-#define YSTRIDE3 edi
-#define RESIDUE ecx
-#define SRC eax
-    mov YSTRIDE,_ystride
-    mov RESIDUE,_residue
-    mov SRC,_src
-    /*mm0=[src]*/
-    movq mm0,[SRC]
-    /*mm1=[src+ystride]*/
-    movq mm1,[SRC+YSTRIDE]
-    /*mm6={-1}x4*/
-    pcmpeqw mm6,mm6
-    /*mm2=[src+2*ystride]*/
-    movq mm2,[SRC+YSTRIDE*2]
-    /*[ystride3]=3*[ystride]*/
-    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
-    /*mm6={1}x4*/
-    psllw mm6,15
-    /*mm3=[src+3*ystride]*/
-    movq mm3,[SRC+YSTRIDE3]
-    /*mm6={128}x4*/
-    psrlw mm6,8
-    /*mm7=0*/ 
-    pxor mm7,mm7
-    /*[src]=[src]+4*[ystride]*/
-    lea SRC,[SRC+YSTRIDE*4]
-    /*Compute [src]-128 and [src+ystride]-128*/
-    movq mm4,mm0
-    punpcklbw mm0,mm7
-    movq mm5,mm1
-    punpckhbw mm4,mm7
-    psubw mm0,mm6
-    punpcklbw mm1,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm1,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x00],mm0
-    movq [RESIDUE+0x08],mm4
-    movq [RESIDUE+0x10],mm1
-    movq [RESIDUE+0x18],mm5
-    /*mm0=[src+4*ystride]*/
-    movq mm0,[SRC]
-    /*mm1=[src+5*ystride]*/
-    movq mm1,[SRC+YSTRIDE]
-    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
-    movq mm4,mm2
-    punpcklbw mm2,mm7
-    movq mm5,mm3
-    punpckhbw mm4,mm7
-    psubw mm2,mm6
-    punpcklbw mm3,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm3,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x20],mm2
-    movq [RESIDUE+0x28],mm4
-    movq [RESIDUE+0x30],mm3
-    movq [RESIDUE+0x38],mm5
-    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
-    movq mm2,[SRC+YSTRIDE*2]
-    movq mm3,[SRC+YSTRIDE3]
-    movq mm4,mm0
-    punpcklbw mm0,mm7
-    movq mm5,mm1
-    punpckhbw mm4,mm7
-    psubw mm0,mm6
-    punpcklbw mm1,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm1,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x40],mm0
-    movq [RESIDUE+0x48],mm4
-    movq [RESIDUE+0x50],mm1
-    movq [RESIDUE+0x58],mm5
-    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
-    movq mm4,mm2
-    punpcklbw mm2,mm7
-    movq mm5,mm3
-    punpckhbw mm4,mm7
-    psubw mm2,mm6
-    punpcklbw mm3,mm7
-    psubw mm4,mm6
-    punpckhbw mm5,mm7
-    psubw mm3,mm6
-    psubw mm5,mm6
-    /*Write the answer out.*/
-    movq [RESIDUE+0x60],mm2
-    movq [RESIDUE+0x68],mm4
-    movq [RESIDUE+0x70],mm3
-    movq [RESIDUE+0x78],mm5
-#undef YSTRIDE
-#undef YSTRIDE3
-#undef RESIDUE
-#undef SRC
-  }
-}
-
-void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride){
-  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
-}
-
-#endif
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ptrdiff_t ret;
+  __asm{
+#define SRC esi
+#define REF edx
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF,_ref
+    /*Load the first 4 rows of each block.*/
+    movq mm0,[SRC]
+    movq mm1,[REF]
+    movq mm2,[SRC][YSTRIDE]
+    movq mm3,[REF][YSTRIDE]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    movq mm4,[SRC+YSTRIDE*2]
+    movq mm5,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE3]
+    movq mm7,[REF+YSTRIDE3]
+    /*Compute their SADs and add them in mm0*/
+    psadbw mm0,mm1
+    psadbw mm2,mm3
+    lea SRC,[SRC+YSTRIDE*4]
+    paddw mm0,mm2
+    lea REF,[REF+YSTRIDE*4]
+    /*Load the next 3 rows as registers become available.*/
+    movq mm2,[SRC]
+    movq mm3,[REF]
+    psadbw mm4,mm5
+    psadbw mm6,mm7
+    paddw mm0,mm4
+    movq mm5,[REF+YSTRIDE]
+    movq mm4,[SRC+YSTRIDE]
+    paddw mm0,mm6
+    movq mm7,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE*2]
+    /*Start adding their SADs to mm0*/
+    psadbw mm2,mm3
+    psadbw mm4,mm5
+    paddw mm0,mm2
+    psadbw mm6,mm7
+    /*Load last row as registers become available.*/
+    movq mm2,[SRC+YSTRIDE3]
+    movq mm3,[REF+YSTRIDE3]
+    /*And finish adding up their SADs.*/
+    paddw mm0,mm4
+    psadbw mm2,mm3
+    paddw mm0,mm6
+    paddw mm0,mm2
+    movd [ret],mm0
+#undef SRC
+#undef REF
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+  return (unsigned)ret;
+}
+
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  /*Early termination is for suckers.*/
+  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
+}
+
+#define OC_SAD2_LOOP __asm{ \
+  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
+     pavgb computes (mm0+mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
+    correct the output of pavgb.*/ \
+  __asm  movq mm6,mm0 \
+  __asm  lea REF1,[REF1+YSTRIDE*2] \
+  __asm  pxor mm0,mm1 \
+  __asm  pavgb mm6,mm1 \
+  __asm  lea REF2,[REF2+YSTRIDE*2] \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm0,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  movq mm3,[REF2+YSTRIDE] \
+  __asm  psubb mm6,mm0 \
+  __asm  movq mm0,[REF1] \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm6 \
+  __asm  movd mm6,RET \
+  __asm  psubb mm2,mm1 \
+  __asm  movq mm1,[REF2] \
+  __asm  lea SRC,[SRC+YSTRIDE*2] \
+  __asm  psadbw mm5,mm2 \
+  __asm  movq mm2,[REF1+YSTRIDE] \
+  __asm  paddw mm5,mm4 \
+  __asm  movq mm4,[SRC] \
+  __asm  paddw mm6,mm5 \
+  __asm  movq mm5,[SRC+YSTRIDE] \
+  __asm  movd RET,mm6 \
+}
+
+/*Same as above, but does not pre-load the next two rows.*/
+#define OC_SAD2_TAIL __asm{ \
+  __asm  movq mm6,mm0 \
+  __asm  pavgb mm0,mm1 \
+  __asm  pxor mm6,mm1 \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm6,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  psubb mm0,mm6 \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm0 \
+  __asm  psubb mm2,mm1 \
+  __asm  movd mm6,RET \
+  __asm  psadbw mm5,mm2 \
+  __asm  paddw mm5,mm4 \
+  __asm  paddw mm6,mm5 \
+  __asm  movd RET,mm6 \
+}
+
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  ptrdiff_t ret;
+  __asm{
+#define REF1 ecx
+#define REF2 edi
+#define YSTRIDE esi
+#define SRC edx
+#define RET eax
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF1,_ref1
+    mov REF2,_ref2
+    movq mm0,[REF1]
+    movq mm1,[REF2]
+    movq mm2,[REF1+YSTRIDE]
+    movq mm3,[REF2+YSTRIDE]
+    xor RET,RET
+    movq mm4,[SRC]
+    pxor mm7,mm7
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC+YSTRIDE]
+    psubb mm7,mm6
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_TAIL
+    mov [ret],RET
+#undef REF1
+#undef REF2
+#undef YSTRIDE
+#undef SRC
+#undef RET
+  }
+  return (unsigned)ret;
+}
+
+/*Load an 8x4 array of pixel values from SRC and REF and compute their
+  16-bit difference in mm0...mm7.*/
+#define OC_LOAD_SUB_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm4,[_off+REF] \
+  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  movd mm2,[_off+SRC] \
+  __asm  movd mm7,[_off+REF] \
+  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
+  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm0,mm4 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  psubw mm0,mm4 \
+  __asm  movd mm4,[_off+SRC] \
+  __asm  movq [_off*2+BUF],mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm1,mm5 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psubw mm1,mm5 \
+  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm2,mm7 \
+  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm3,mm6 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  psubw mm3,mm6 \
+  __asm  movd mm6,[_off+SRC] \
+  __asm  punpcklbw mm4,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  psubw mm4,mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm5,mm7 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm5,mm7 \
+  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm6,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  psubw mm6,mm0 \
+  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
+  __asm  punpcklbw mm7,mm0 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*8] \
+  __asm  psubw mm7,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  movq mm0,[_off*2+BUF] \
+}
+
+/*Load an 8x4 array of pixel values from SRC and SRC4 into mm0...mm7.*/
+#define OC_LOAD_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm1,[_off+SRC+YSTRIDE] \
+  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
+  __asm  pxor mm7,mm7 \
+  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
+  __asm  punpcklbw mm0,mm7 \
+  __asm  movd mm4,[_off+SRC4] \
+  __asm  punpcklbw mm1,mm7 \
+  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
+  __asm  punpcklbw mm3,mm7 \
+  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psrlw mm4,8 \
+  __asm  psrlw mm5,8 \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psrlw mm6,8 \
+  __asm  psrlw mm7,8 \
+}
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x4 __asm{ \
+  /*Stage A: \
+    Outputs 0-3 are swapped with 4-7 here.*/ \
+  __asm  paddw mm5,mm1 \
+  __asm  paddw mm6,mm2 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm2,mm2 \
+  __asm  psubw mm1,mm5 \
+  __asm  psubw mm2,mm6 \
+  __asm  paddw mm7,mm3 \
+  __asm  paddw mm4,mm0 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm0,mm0 \
+  __asm  psubw mm3,mm7 \
+  __asm  psubw mm0,mm4 \
+   /*Stage B:*/ \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm1,mm3 \
+  __asm  paddw mm4,mm6 \
+  __asm  paddw mm5,mm7 \
+  __asm  paddw mm2,mm2 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm6,mm6 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm2,mm0 \
+  __asm  psubw mm3,mm1 \
+  __asm  psubw mm6,mm4 \
+  __asm  psubw mm7,mm5 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x4 __asm{ \
+  /*Stage C:*/ \
+  __asm  paddw mm0,mm1 \
+  __asm  paddw mm2,mm3 \
+  __asm  paddw mm4,mm5 \
+  __asm  paddw mm6,mm7 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm5,mm5 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm1,mm0 \
+  __asm  psubw mm3,mm2 \
+  __asm  psubw mm5,mm4 \
+  __asm  psubw mm7,mm6 \
+}
+
+/*Performs an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x4 __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_8x4 \
+}
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values.
+  At the end of this part, mm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
+  /*We use the fact that \
+      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+     to merge the final butterfly with the abs and the first stage of \
+     accumulation. \
+    Thus we can avoid using pabsw, which is not available until SSSE3. \
+    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
+     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+     registers). \
+    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+    This implementation is only 26 (+4 for spilling registers).*/ \
+  __asm  movq [_r7+BUF],mm7 \
+  __asm  movq [_r6+BUF],mm6 \
+  /*mm7={0x7FFF}x4 \
+    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
+  __asm  pcmpeqb mm7,mm7 \
+  __asm  movq mm6,mm0 \
+  __asm  psrlw mm7,1 \
+  __asm  paddw mm6,mm1 \
+  __asm  pmaxsw mm0,mm1 \
+  __asm  paddsw mm6,mm7 \
+  __asm  psubw mm0,mm6 \
+  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
+    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm1,mm4 \
+  __asm  pmaxsw mm2,mm3 \
+  __asm  pmaxsw mm4,mm5 \
+  __asm  paddw mm6,mm3 \
+  __asm  paddw mm1,mm5 \
+  __asm  movq mm3,[_r7+BUF] \
+}
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
+  __asm  paddsw mm6,mm7 \
+  __asm  movq mm5,[_r6+BUF] \
+  __asm  paddsw mm1,mm7 \
+  __asm  psubw mm2,mm6 \
+  __asm  psubw mm4,mm1 \
+  /*mm7={1}x4 (needed for the horizontal add that follows) \
+    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm3 \
+  __asm  pmaxsw mm3,mm5 \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm6,mm5 \
+  __asm  paddw mm0,mm4 \
+  __asm  paddsw mm6,mm7 \
+  __asm  paddw mm0,mm3 \
+  __asm  psrlw mm7,14 \
+  __asm  psubw mm0,mm6 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into mm0.
+  This is the only portion of SATD which requires MMXEXT (we could use plain
+   MMX, but it takes 4 instructions and an extra register to work around the
+   lack of a pmaxsw, which is a pretty serious penalty).*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+}
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into mm0.
+  Note that mm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
+}
+
+/*Performs two 4x4 transposes (mostly) in place.
+  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
+   contains rows {a,b,c,d}.
+  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
+   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
+#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
+  /*First 4x4 transpose:*/ \
+  __asm  movq [0x10+_off+BUF],mm5 \
+  /*mm0 = e3 e2 e1 e0 \
+    mm1 = f3 f2 f1 f0 \
+    mm2 = g3 g2 g1 g0 \
+    mm3 = h3 h2 h1 h0*/ \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm2,mm3 \
+  __asm  punpckhwd mm5,mm3 \
+  __asm  movq mm3,mm0 \
+  __asm  punpcklwd mm0,mm1 \
+  __asm  punpckhwd mm3,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm3 = f3 e3 f2 e2 \
+    mm2 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm2 \
+  __asm  punpckhdq mm1,mm2 \
+  __asm  movq mm2,mm3 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq [0x40+_off+BUF],mm0 \
+  __asm  punpckldq mm2,mm5 \
+  /*mm0 = h0 g0 f0 e0 \
+    mm1 = h1 g1 f1 e1 \
+    mm2 = h2 g2 f2 e2 \
+    mm3 = h3 g3 f3 e3*/ \
+  __asm  movq mm5,[0x10+_off+BUF] \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm5 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm7 = d3 d2 d1 d0*/ \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm7 \
+  __asm  movq [0x50+_off+BUF],mm1 \
+  __asm  punpckhwd mm0,mm7 \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm5 \
+  __asm  movq [0x60+_off+BUF],mm2 \
+  __asm  punpckhwd mm7,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  movq [0x70+_off+BUF],mm3 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
+}
+
+static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
+ int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
+  OC_ALIGN8(ogg_int16_t  buf[64]); /*Scratch rows spilled between passes.*/
+  ogg_int16_t           *bufp;
+  unsigned               ret1; /*Result; stored by the asm at at_end.*/
+  unsigned               ret2; /*NOTE(review): never used in this function.*/
+  bufp=buf; /*The asm loads the scratch address from this variable.*/
+  __asm{ /*Thresholded SATD: the second half is skipped once RET>=_thresh.*/
+#define SRC esi
+#define REF eax
+#define SRC_YSTRIDE ecx
+#define REF_YSTRIDE edx
+#define BUF edi
+#define RET eax
+#define RET2 edx
+    mov SRC,_src
+    mov SRC_YSTRIDE,_src_ystride
+    mov REF,_ref
+    mov REF_YSTRIDE,_ref_ystride
+    mov BUF,bufp
+    OC_LOAD_SUB_8x4(0x00) /*Defined outside this view; presumably loads src-ref.*/
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_SUB_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.*/
+    mov RET2,_thresh /*RET2 shares edx with REF_YSTRIDE.*/
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm4,mm0
+    movq mm2,[0x60+BUF]
+    punpckhdq mm0,mm0
+    movq mm6,[0x68+BUF]
+    paddd mm4,mm0 /*Low dword of mm4=sum of the two dwords of the pmaddwd result.*/
+    movq mm3,[0x70+BUF]
+    movd RET,mm4 /*RET shares eax with REF.*/
+    movq mm7,[0x78+BUF]
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, and a factor of two removed; correct the final sum here.*/
+    lea RET,[RET+RET-32]
+    movq mm0,[0x40+BUF]
+    cmp RET,RET2
+    movq mm4,[0x48+BUF]
+    jae at_end /*Early out: return the first-half sum if it is >= _thresh.*/
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*There isn't much to stick in here to hide the latency this time, but the
+       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
+       latency is even worse.*/
+    sub RET,32 /*Same -32 correction as above; the doubling is in the lea below.*/
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET2,mm4 /*_thresh is no longer needed, so RET2 is reused for the sum.*/
+    lea RET,[RET+RET2*2]
+    align 16
+at_end:
+    mov ret1,RET
+#undef SRC
+#undef REF
+#undef SRC_YSTRIDE
+#undef REF_YSTRIDE
+#undef BUF
+#undef RET
+#undef RET2
+  }
+  return ret1;
+}
+
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh); /*Same stride for _src and _ref.*/
+}
+
+
+/*Our internal implementation of frag_copy2 takes an extra stride parameter so
+   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
+static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
+  __asm{ /*Writes 8 rows of 8 bytes: the truncating average of _src1 and _src2.*/
+    /*Load the first 3 rows.*/
+#define DST_YSTRIDE edi
+#define SRC_YSTRIDE esi
+#define DST eax
+#define SRC1 edx
+#define SRC2 ecx
+    mov DST_YSTRIDE,_dst_ystride
+    mov SRC_YSTRIDE,_src_ystride
+    mov DST,_dst
+    mov SRC1,_src1
+    mov SRC2,_src2
+    movq mm0,[SRC1]
+    movq mm1,[SRC2]
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pxor mm7,mm7
+    movq mm4,[SRC1]
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC2]
+    /*mm7={1}x8 (0-(-1) in every byte); it masks the low bit of a^b below.*/
+    psubb mm7,mm6
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm1,mm3
+    /*mm3 is free.  pavgb rounds up; subtracting (a^b)&1 truncates instead.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1+SRC_YSTRIDE]
+    /*Start averaging mm5 and mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 0] is done; write it out.*/
+    movq [DST],mm6
+    pand mm1,mm7
+    pavgb mm4,mm5
+    psubb mm2,mm1
+    /*mm1 is free, continue loading the next row.*/
+    movq mm1,[SRC2+SRC_YSTRIDE]
+    pxor mm3,mm5
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    /*mm2 [row 1] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1]
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm4,mm3
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    /*mm4 [row 2] is done; write it out.*/
+    movq [DST],mm4
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2]
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    /*Start loading the next row.*/
+    movq mm4,[SRC1+SRC_YSTRIDE]
+    pavgb mm6,mm1
+    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    /*Continue loading the next row.*/
+    movq mm5,[SRC2+SRC_YSTRIDE]
+    pavgb mm2,mm3
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    pxor mm1,mm3
+    /*mm3 is free.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1]
+    /*Start averaging mm5 into mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 3] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm6
+    pand mm1,mm7
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pavgb mm4,mm5
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm2,mm1
+    /*mm1 is free; continue loading the next row.*/
+    movq mm1,[SRC2]
+    pxor mm3,mm5
+    /*mm2 [row 4] is done; write it out.*/
+    movq [DST],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    psubb mm4,mm3
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    /*mm4 [row 5] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm4
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
+    movq mm4,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm4,mm3
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm6,mm0
+    pand mm4,mm7
+    /*mm6 [row 6] is done, write it out.*/
+    movq [DST],mm6
+    psubb mm2,mm4
+    /*mm2 [row 7] is done, write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+#undef SRC1
+#undef SRC2
+#undef SRC_YSTRIDE
+#undef DST_YSTRIDE
+#undef DST
+  }
+}
+
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  OC_ALIGN8(unsigned char ref[64]); /*Temporary 8x8 reference block with a stride of 8.*/
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); /*ref=average of _ref1 and _ref2.*/
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh); /*_src vs. the averaged block.*/
+}
+
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+ int _ystride){
+  OC_ALIGN8(ogg_int16_t  buf[64]); /*Scratch rows spilled between passes.*/
+  ogg_int16_t           *bufp;
+  unsigned               ret1; /*Result; stored by the asm just before it ends.*/
+  unsigned               ret2; /*NOTE(review): never used in this function.*/
+  bufp=buf; /*The asm loads the scratch address from this variable.*/
+  __asm{ /*SATD of the source block itself, with the DC term subtracted out.*/
+#define SRC eax
+#define SRC4 esi
+#define BUF edi
+#define RET eax
+#define RET_WORD ax
+#define RET2 ecx
+#define YSTRIDE edx
+#define YSTRIDE3 ecx
+    mov SRC,_src
+    mov BUF,bufp
+    mov YSTRIDE,_ystride
+    /* src4 = src+4*ystride */
+    lea SRC4,[SRC+YSTRIDE*4]
+    /* ystride3 = 3*ystride */
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    OC_LOAD_8x4(0x00) /*Defined outside this view.*/
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+      4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+      we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    /*We split out the stages here so we can save the DC coefficient in the
+      middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    movd RET,mm1 /*Low dword of mm1; its low word is kept as the DC below.*/
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+      for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+      latency of pmaddwd by starting the next series of loads now.*/
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm2,[0x60+BUF]
+    movq mm4,mm0
+    movq mm6,[0x68+BUF]
+    punpckhdq mm0,mm0
+    movq mm3,[0x70+BUF]
+    paddd mm4,mm0
+    movq mm7,[0x78+BUF]
+    movd RET2,mm4 /*First-half sum; RET2 shares ecx with YSTRIDE3.*/
+    movq mm0,[0x40+BUF]
+    movq mm4,[0x48+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*We assume that the DC coefficient is always positive (which is true,
+      because the input to the INTRA transform was not a difference).*/
+    movzx RET,RET_WORD /*Zero-extend the 16-bit DC value saved earlier.*/
+    add RET2,RET2 /*Double the first-half sum.*/
+    sub RET2,RET /*Remove the DC term.*/
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET,mm4 /*Second-half sum.*/
+    lea RET,[-64+RET2+RET*2] /*2*first-DC+2*second-64.*/
+    mov [ret1],RET
+#undef SRC
+#undef SRC4
+#undef BUF
+#undef RET
+#undef RET_WORD
+#undef RET2
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+  return ret1;
+}
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src, const unsigned char *_ref,int _ystride){
+  int i;
+  __asm  pxor mm7,mm7 /*mm7=0: the unpack partner that widens bytes to words.*/
+  for(i=4;i-->0;){ /*4 iterations; each handles 2 rows (0x20 bytes of residue).*/
+    __asm{
+#define SRC edx
+#define YSTRIDE esi
+#define RESIDUE eax
+#define REF ecx
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
+      mov SRC,_src
+      mov REF,_ref
+      /*mm0=[src]*/
+      movq mm0,[SRC]
+      /*mm1=[ref]*/
+      movq mm1,[REF]
+      /*mm4=[src+ystride]*/
+      movq mm4,[SRC+YSTRIDE]
+      /*mm5=[ref+ystride]*/
+      movq mm5,[REF+YSTRIDE]
+      /*Compute [src]-[ref].*/
+      movq mm2,mm0
+      punpcklbw mm0,mm7
+      movq mm3,mm1
+      punpckhbw mm2,mm7
+      punpcklbw mm1,mm7
+      punpckhbw mm3,mm7
+      psubw mm0,mm1
+      psubw mm2,mm3
+      /*Compute [src+ystride]-[ref+ystride].*/
+      movq mm1,mm4
+      punpcklbw mm4,mm7
+      movq mm3,mm5
+      punpckhbw mm1,mm7
+      lea SRC,[SRC+YSTRIDE*2]
+      punpcklbw mm5,mm7
+      lea REF,[REF+YSTRIDE*2]
+      punpckhbw mm3,mm7
+      psubw mm4,mm5
+      psubw mm1,mm3
+      /*Write the answer out.*/
+      movq [RESIDUE+0x00],mm0
+      movq [RESIDUE+0x08],mm2
+      movq [RESIDUE+0x10],mm4
+      movq [RESIDUE+0x18],mm1
+      lea RESIDUE,[RESIDUE+0x20]
+      mov _residue,RESIDUE /*Store the advanced pointers for the next iteration.*/
+      mov _src,SRC
+      mov _ref,REF
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+#undef REF
+    }
+  }
+}
+
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,int _ystride){
+   __asm{ /*residue=src-128 for 8 rows of 8 pixels, widened to 16 bits.*/
+#define YSTRIDE edx
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+#define SRC eax
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    mov SRC,_src
+    /*mm0=[src]*/
+    movq mm0,[SRC]
+    /*mm1=[src+ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*mm6={-1}x4*/
+    pcmpeqw mm6,mm6
+    /*mm2=[src+2*ystride]*/
+    movq mm2,[SRC+YSTRIDE*2]
+    /*[ystride3]=3*[ystride]*/
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*mm6={0x8000}x4*/
+    psllw mm6,15
+    /*mm3=[src+3*ystride]*/
+    movq mm3,[SRC+YSTRIDE3]
+    /*mm6={128}x4*/
+    psrlw mm6,8
+    /*mm7=0*/ 
+    pxor mm7,mm7
+    /*[src]=[src]+4*[ystride]*/
+    lea SRC,[SRC+YSTRIDE*4]
+    /*Compute [src]-128 and [src+ystride]-128*/
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x00],mm0
+    movq [RESIDUE+0x08],mm4
+    movq [RESIDUE+0x10],mm1
+    movq [RESIDUE+0x18],mm5
+    /*mm0=[src+4*ystride]*/
+    movq mm0,[SRC]
+    /*mm1=[src+5*ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x20],mm2
+    movq [RESIDUE+0x28],mm4
+    movq [RESIDUE+0x30],mm3
+    movq [RESIDUE+0x38],mm5
+    /*Load rows 6 and 7; compute [src+4*ystride]-128 and [src+5*ystride]-128*/
+    movq mm2,[SRC+YSTRIDE*2]
+    movq mm3,[SRC+YSTRIDE3]
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x40],mm0
+    movq [RESIDUE+0x48],mm4
+    movq [RESIDUE+0x50],mm1
+    movq [RESIDUE+0x58],mm5
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x60],mm2
+    movq [RESIDUE+0x68],mm4
+    movq [RESIDUE+0x70],mm3
+    movq [RESIDUE+0x78],mm5
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+#undef SRC
+  }
+}
+
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride); /*_dst and both sources share one stride.*/
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c
index dcf17c9fa79..d908ce2413a 100644
--- a/thirdparty/libtheora/x86_vc/mmxfdct.c
+++ b/thirdparty/libtheora/x86_vc/mmxfdct.c
@@ -1,670 +1,670 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************/ 
- /*MMX fDCT implementation for x86_32*/
-/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
-#include "x86enc.h"
-
-#if defined(OC_X86_ASM)
-
-#define OC_FDCT_STAGE1_8x4  __asm{ \
-  /*Stage 1:*/ \
-  /*mm0=t7'=t0-t7*/ \
-  __asm  psubw mm0,mm7 \
-  __asm  paddw mm7,mm7 \
-  /*mm1=t6'=t1-t6*/ \
-  __asm  psubw mm1, mm6 \
-  __asm  paddw mm6,mm6 \
-  /*mm2=t5'=t2-t5*/ \
-  __asm  psubw mm2,mm5 \
-  __asm  paddw mm5,mm5 \
-  /*mm3=t4'=t3-t4*/ \
-  __asm  psubw mm3,mm4 \
-  __asm  paddw mm4,mm4 \
-  /*mm7=t0'=t0+t7*/ \
-  __asm  paddw mm7,mm0 \
-  /*mm6=t1'=t1+t6*/  \
-  __asm  paddw mm6,mm1 \
-  /*mm5=t2'=t2+t5*/ \
-  __asm  paddw mm5,mm2 \
-  /*mm4=t3'=t3+t4*/ \
-  __asm  paddw mm4,mm3\
-}
-
-#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
-  /*Stage 2:*/ \
-  /*mm7=t3''=t0'-t3'*/ \
-  __asm  psubw mm7,mm4 \
-  __asm  paddw mm4,mm4 \
-  /*mm6=t2''=t1'-t2'*/ \
-  __asm  psubw mm6,mm5 \
-  __asm  movq [Y+_r6],mm7 \
-  __asm  paddw mm5,mm5 \
-  /*mm1=t5''=t6'-t5'*/ \
-  __asm  psubw mm1,mm2 \
-  __asm  movq [Y+_r2],mm6 \
-  /*mm4=t0''=t0'+t3'*/ \
-  __asm  paddw mm4,mm7 \
-  __asm  paddw mm2,mm2 \
-  /*mm5=t1''=t1'+t2'*/ \
-  __asm  movq [Y+_r0],mm4 \
-  __asm  paddw mm5,mm6 \
-  /*mm2=t6''=t6'+t5'*/ \
-  __asm  paddw mm2,mm1 \
-  __asm  movq [Y+_r4],mm5 \
-  /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
-  /*mm4, mm5, mm6, mm7 are free.*/ \
-  /*Stage 3:*/ \
-  /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
-  __asm  mov A,0x5A806A0A \
-  __asm  pcmpeqb mm6,mm6 \
-  __asm  movd mm7,A \
-  __asm  psrlw mm6,15 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  paddw mm6,mm6 \
-  /*mm0=0, m2={-1}x4 \
-    mm5:mm4=t5''*27146+0xB500*/ \
-  __asm  movq mm4,mm1 \
-  __asm  movq mm5,mm1 \
-  __asm  punpcklwd mm4,mm6 \
-  __asm  movq [Y+_r3],mm2 \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  movq [Y+_r7],mm0 \
-  __asm  punpckhwd mm5,mm6 \
-  __asm  pxor mm0,mm0 \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  pcmpeqb mm2,mm2 \
-  /*mm2=t6'', mm1=t5''+(t5''!=0) \
-    mm4=(t5''*27146+0xB500>>16)*/ \
-  __asm  pcmpeqw mm0,mm1 \
-  __asm  psrad mm4,16 \
-  __asm  psubw mm0,mm2 \
-  __asm  movq mm2, [Y+_r3] \
-  __asm  psrad mm5,16 \
-  __asm  paddw mm1,mm0 \
-  __asm  packssdw mm4,mm5 \
-  /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
-  __asm  paddw mm4,mm1 \
-  __asm  movq mm0, [Y+_r7] \
-  __asm  psraw mm4,1 \
-  __asm  movq mm1,mm3 \
-  /*mm3=t4''=t4'+s*/ \
-  __asm  paddw mm3,mm4 \
-  /*mm1=t5'''=t4'-s*/ \
-  __asm  psubw mm1,mm4 \
-  /*mm1=0, mm3={-1}x4 \
-    mm5:mm4=t6''*27146+0xB500*/ \
-  __asm  movq mm4,mm2 \
-  __asm  movq mm5,mm2 \
-  __asm  punpcklwd mm4,mm6 \
-  __asm  movq [Y+_r5],mm1 \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  movq [Y+_r1],mm3 \
-  __asm  punpckhwd mm5,mm6 \
-  __asm  pxor mm1,mm1 \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  pcmpeqb mm3,mm3 \
-  /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
-  __asm  psrad mm4,16 \
-  __asm  pcmpeqw mm1,mm2 \
-  __asm  psrad mm5,16 \
-  __asm  psubw mm1,mm3 \
-  __asm  packssdw mm4,mm5 \
-  __asm  paddw mm2,mm1 \
-  /*mm1=t1'' \
-    mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
-  __asm  paddw mm4,mm2 \
-  __asm  movq mm1,[Y+_r4] \
-  __asm  psraw mm4,1 \
-  __asm  movq mm2,mm0 \
-  /*mm7={54491-0x7FFF,0x7FFF}x2 \
-    mm0=t7''=t7'+s*/ \
-  __asm  paddw mm0,mm4 \
-  /*mm2=t6'''=t7'-s*/ \
-  __asm  psubw mm2,mm4 \
-  /*Stage 4:*/ \
-  /*mm0=0, mm2=t0'' \
-    mm5:mm4=t1''*27146+0xB500*/ \
-  __asm  movq mm4,mm1 \
-  __asm  movq mm5,mm1 \
-  __asm  punpcklwd mm4,mm6 \
-  __asm  movq [Y+_r3],mm2 \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  movq mm2,[Y+_r0] \
-  __asm  punpckhwd mm5,mm6 \
-  __asm  movq [Y+_r7],mm0 \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  pxor mm0,mm0 \
-  /*mm7={27146,0x4000>>1}x2 \
-    mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
-  __asm  psrad mm4,16 \
-  __asm  mov A,0x20006A0A \
-  __asm  pcmpeqw mm0,mm1 \
-  __asm  movd mm7,A \
-  __asm  psrad mm5,16 \
-  __asm  psubw mm0,mm3 \
-  __asm  packssdw mm4,mm5 \
-  __asm  paddw mm0,mm1 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  paddw mm0,mm4 \
-  /*mm6={0x00000E3D}x2 \
-    mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
-  __asm  movq mm4,mm2 \
-  __asm  movq mm5,mm2 \
-  __asm  punpcklwd mm4,mm6 \
-  __asm  mov A,0x0E3D \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  punpckhwd mm5,mm6 \
-  __asm  movd mm6,A \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  pxor mm1,mm1 \
-  __asm  punpckldq mm6,mm6 \
-  __asm  pcmpeqw mm1,mm2 \
-  /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
-  __asm  psrad mm4,16 \
-  __asm  psubw mm1,mm3 \
-  __asm  psrad mm5,16 \
-  __asm  paddw mm2,mm1 \
-  __asm  packssdw mm4,mm5 \
-  __asm  movq mm1,[Y+_r5] \
-  __asm  paddw mm4,mm2 \
-  /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
-    The naive implementation could cause overflow, so we use \
-     u=(r&s)+((r^s)>>1).*/ \
-  __asm  movq mm2,[Y+_r3] \
-  __asm  movq mm7,mm0 \
-  __asm  pxor mm0,mm4 \
-  __asm  pand mm7,mm4 \
-  __asm  psraw mm0,1 \
-  __asm  mov A,0x7FFF54DC \
-  __asm  paddw mm0,mm7 \
-  __asm  movd mm7,A \
-  /*mm7={54491-0x7FFF,0x7FFF}x2 \
-    mm4=_y[4]=v=r-u*/ \
-  __asm  psubw mm4,mm0 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  movq [Y+_r4],mm4 \
-  /*mm0=0, mm7={36410}x4 \
-    mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
-  __asm  movq mm4,mm1 \
-  __asm  movq mm5,mm1 \
-  __asm  punpcklwd mm4,mm1 \
-  __asm  mov A,0x8E3A8E3A \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  movq [Y+_r0],mm0 \
-  __asm  punpckhwd mm5,mm1 \
-  __asm  pxor mm0,mm0 \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  pcmpeqw mm1,mm0 \
-  __asm  movd mm7,A \
-  __asm  psubw mm1,mm3 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  paddd mm4,mm6 \
-  __asm  paddd mm5,mm6 \
-  /*mm0=0 \
-    mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
-  __asm  movq mm6,mm2 \
-  __asm  movq mm3,mm2 \
-  __asm  pmulhw mm6,mm7 \
-  __asm  paddw mm1,mm2 \
-  __asm  pmullw mm3,mm7 \
-  __asm  pxor mm0,mm0 \
-  __asm  paddw mm6,mm1 \
-  __asm  movq mm1,mm3 \
-  __asm  punpckhwd mm3,mm6 \
-  __asm  punpcklwd mm1,mm6 \
-  /*mm3={-1}x4, mm6={1}x4 \
-    mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
-  __asm  paddd mm5,mm3 \
-  __asm  paddd mm4,mm1 \
-  __asm  psrad mm5,16 \
-  __asm  pxor mm6,mm6 \
-  __asm  psrad mm4,16 \
-  __asm  pcmpeqb mm3,mm3 \
-  __asm  packssdw mm4,mm5 \
-  __asm  psubw mm6,mm3 \
-  /*mm1=t7'', mm7={26568,0x3400}x2 \
-    mm2=s=t6'''-(36410*u>>16)*/ \
-  __asm  movq mm1,mm4 \
-  __asm  mov A,0x340067C8 \
-  __asm  pmulhw mm4,mm7 \
-  __asm  movd mm7,A \
-  __asm  movq [Y+_r5],mm1 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  paddw mm4,mm1 \
-  __asm  movq mm1,[Y+_r7] \
-  __asm  psubw mm2,mm4 \
-  /*mm6={0x00007B1B}x2 \
-    mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
-  __asm  movq mm4,mm2 \
-  __asm  movq mm5,mm2 \
-  __asm  punpcklwd mm4,mm6 \
-  __asm  pcmpeqw mm0,mm2 \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  mov A,0x7B1B \
-  __asm  punpckhwd mm5,mm6 \
-  __asm  movd mm6,A \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  psubw mm0,mm3 \
-  __asm  punpckldq mm6,mm6 \
-  /*mm7={64277-0x7FFF,0x7FFF}x2 \
-    mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
-  __asm  psrad mm4,17 \
-  __asm  paddw mm2,mm0 \
-  __asm  psrad mm5,17 \
-  __asm  mov A,0x7FFF7B16 \
-  __asm  packssdw mm4,mm5 \
-  __asm  movd mm7,A \
-  __asm  paddw mm2,mm4 \
-  __asm  punpckldq mm7,mm7 \
-  /*mm0=0, mm7={12785}x4 \
-    mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
-  __asm  movq mm4,mm1 \
-  __asm  movq mm5,mm1 \
-  __asm  movq [Y+_r3],mm2 \
-  __asm  punpcklwd mm4,mm1 \
-  __asm  movq mm2,[Y+_r1] \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  mov A,0x31F131F1 \
-  __asm  punpckhwd mm5,mm1 \
-  __asm  pxor mm0,mm0 \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  pcmpeqw mm1,mm0 \
-  __asm  movd mm7,A \
-  __asm  psubw mm1,mm3 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  paddd mm4,mm6 \
-  __asm  paddd mm5,mm6 \
-  /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
-  __asm  movq mm6,mm2 \
-  __asm  movq mm3,mm2 \
-  __asm  pmulhw mm6,mm7 \
-  __asm  pmullw mm3,mm7 \
-  __asm  paddw mm6,mm1 \
-  __asm  movq mm1,mm3 \
-  __asm  punpckhwd mm3,mm6 \
-  __asm  punpcklwd mm1,mm6 \
-  /*mm3={-1}x4, mm6={1}x4 \
-    mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
-  __asm  paddd mm5,mm3 \
-  __asm  paddd mm4,mm1 \
-  __asm  psrad mm5,16 \
-  __asm  pxor mm6,mm6 \
-  __asm  psrad mm4,16 \
-  __asm  pcmpeqb mm3,mm3 \
-  __asm  packssdw mm4,mm5 \
-  __asm  psubw mm6,mm3 \
-  /*mm1=t3'', mm7={20539,0x3000}x2 \
-    mm4=s=(12785*u>>16)-t4''*/ \
-  __asm  movq [Y+_r1],mm4 \
-  __asm  pmulhw mm4,mm7 \
-  __asm  mov A,0x3000503B \
-  __asm  movq mm1,[Y+_r6] \
-  __asm  movd mm7,A \
-  __asm  psubw mm4,mm2 \
-  __asm  punpckldq mm7,mm7 \
-  /*mm6={0x00006CB7}x2 \
-    mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
-  __asm  movq mm5,mm4 \
-  __asm  movq mm2,mm4 \
-  __asm  punpcklwd mm4,mm6 \
-  __asm  pcmpeqw mm0,mm2 \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  mov A,0x6CB7 \
-  __asm  punpckhwd mm5,mm6 \
-  __asm  movd mm6,A \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  psubw mm0,mm3 \
-  __asm  punpckldq mm6,mm6 \
-  /*mm7={60547-0x7FFF,0x7FFF}x2 \
-    mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
-  __asm  psrad mm4,20 \
-  __asm  paddw mm2,mm0 \
-  __asm  psrad mm5,20 \
-  __asm  mov A,0x7FFF6C84 \
-  __asm  packssdw mm4,mm5 \
-  __asm  movd mm7,A \
-  __asm  paddw mm2,mm4 \
-  __asm  punpckldq mm7,mm7 \
-  /*mm0=0, mm7={25080}x4 \
-    mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
-  __asm  movq mm4,mm1 \
-  __asm  movq mm5,mm1 \
-  __asm  movq [Y+_r7],mm2 \
-  __asm  punpcklwd mm4,mm1 \
-  __asm  movq mm2,[Y+_r2] \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  mov A,0x61F861F8 \
-  __asm  punpckhwd mm5,mm1 \
-  __asm  pxor mm0,mm0 \
-  __asm  pmaddwd mm5,mm7 \
-  __asm  movd mm7,A \
-  __asm  pcmpeqw mm1,mm0 \
-  __asm  psubw mm1,mm3 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  paddd mm4,mm6 \
-  __asm  paddd mm5,mm6 \
-  /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
-  __asm  movq mm6,mm2 \
-  __asm  movq mm3,mm2 \
-  __asm  pmulhw mm6,mm7 \
-  __asm  pmullw mm3,mm7 \
-  __asm  paddw mm6,mm1 \
-  __asm  movq mm1,mm3 \
-  __asm  punpckhwd mm3,mm6 \
-  __asm  punpcklwd mm1,mm6 \
-  /*mm1={-1}x4 \
-    mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
-  __asm  paddd mm5,mm3 \
-  __asm  paddd mm4,mm1 \
-  __asm  psrad mm5,16 \
-  __asm  mov A,0x28005460 \
-  __asm  psrad mm4,16 \
-  __asm  pcmpeqb mm1,mm1 \
-  __asm  packssdw mm4,mm5 \
-  /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
-    mm4=s=(25080*u>>16)-t2''*/ \
-  __asm  movq mm6,mm4 \
-  __asm  pmulhw mm4,mm7 \
-  __asm  pxor mm5,mm5 \
-  __asm  movd mm7,A \
-  __asm  psubw mm5,mm1 \
-  __asm  punpckldq mm7,mm7 \
-  __asm  psubw mm4,mm2 \
-  /*mm2=s+(s!=0) \
-    mm4:mm3=s*21600+0x2800*/ \
-  __asm  movq mm3,mm4 \
-  __asm  movq mm2,mm4 \
-  __asm  punpckhwd mm4,mm5 \
-  __asm  pcmpeqw mm0,mm2 \
-  __asm  pmaddwd mm4,mm7 \
-  __asm  psubw mm0,mm1 \
-  __asm  punpcklwd mm3,mm5 \
-  __asm  paddw mm2,mm0 \
-  __asm  pmaddwd mm3,mm7 \
-  /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
-    mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
-  __asm  movq mm0,[Y+_r4] \
-  __asm  psrad mm4,18 \
-  __asm  movq mm5,[Y+_r5] \
-  __asm  psrad mm3,18 \
-  __asm  movq mm1,[Y+_r7] \
-  __asm  packssdw mm3,mm4 \
-  __asm  movq mm4,[Y+_r0] \
-  __asm  paddw mm3,mm2 \
-}
-
-/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
-  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
-   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
-#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
-  /*First 4x4 transpose:*/ \
-  /*mm0 = e3 e2 e1 e0 \
-    mm5 = f3 f2 f1 f0 \
-    mm3 = g3 g2 g1 g0 \
-    mm1 = h3 h2 h1 h0*/ \
-  __asm  movq mm2,mm0 \
-  __asm  punpcklwd mm0,mm5 \
-  __asm  punpckhwd mm2,mm5 \
-  __asm  movq mm5,mm3 \
-  __asm  punpcklwd mm3,mm1 \
-  __asm  punpckhwd mm5,mm1 \
-  /*mm0 = f1 e1 f0 e0 \
-    mm2 = f3 e3 f2 e2 \
-    mm3 = h1 g1 h0 g0 \
-    mm5 = h3 g3 h2 g2*/ \
-  __asm  movq mm1,mm0 \
-  __asm  punpckldq mm0,mm3 \
-  __asm  movq [Y+_r4],mm0 \
-  __asm  punpckhdq mm1,mm3 \
-  __asm  movq mm0,[Y+_r1] \
-  __asm  movq mm3,mm2 \
-  __asm  punpckldq mm2,mm5 \
-  __asm  punpckhdq mm3,mm5 \
-  __asm  movq mm5,[Y+_r3] \
-  /*_y[4] = h0 g0 f0 e0 \
-   mm1  = h1 g1 f1 e1 \
-   mm2  = h2 g2 f2 e2 \
-   mm3  = h3 g3 f3 e3*/ \
-  /*Second 4x4 transpose:*/ \
-  /*mm4 = a3 a2 a1 a0 \
-    mm0 = b3 b2 b1 b0 \
-    mm6 = c3 c2 c1 c0 \
-    mm5 = d3 d2 d1 d0*/ \
-  __asm  movq mm7,mm4 \
-  __asm  punpcklwd mm4,mm0 \
-  __asm  punpckhwd mm7,mm0 \
-  __asm  movq mm0,mm6 \
-  __asm  punpcklwd mm6,mm5 \
-  __asm  punpckhwd mm0,mm5 \
-  /*mm4 = b1 a1 b0 a0 \
-    mm7 = b3 a3 b2 a2 \
-    mm6 = d1 c1 d0 c0 \
-    mm0 = d3 c3 d2 c2*/ \
-  __asm  movq mm5,mm4 \
-  __asm  punpckldq mm4,mm6 \
-  __asm  punpckhdq mm5,mm6 \
-  __asm  movq mm6,mm7 \
-  __asm  punpckhdq mm7,mm0 \
-  __asm  punpckldq mm6,mm0 \
-  /*mm4 = d0 c0 b0 a0 \
-    mm5 = d1 c1 b1 a1 \
-    mm6 = d2 c2 b2 a2 \
-    mm7 = d3 c3 b3 a3*/ \
-}
-
-/*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  ptrdiff_t a;
-  __asm{
-#define Y eax
-#define A ecx
-#define X edx
-    /*Add two extra bits of working precision to improve accuracy; any more and
-       we could overflow.*/
-    /*We also add biases to correct for some systematic error that remains in
-       the full fDCT->iDCT round trip.*/
-    mov X, _x
-    mov Y, _y
-    movq mm0,[0x00+X]
-    movq mm1,[0x10+X]
-    movq mm2,[0x20+X]
-    movq mm3,[0x30+X]
-    pcmpeqb mm4,mm4
-    pxor mm7,mm7
-    movq mm5,mm0
-    psllw mm0,2
-    pcmpeqw mm5,mm7
-    movq mm7,[0x70+X]
-    psllw mm1,2
-    psubw mm5,mm4
-    psllw mm2,2
-    mov A,1
-    pslld mm5,16
-    movd mm6,A
-    psllq mm5,16
-    mov A,0x10001
-    psllw mm3,2
-    movd mm4,A
-    punpckhwd mm5,mm6
-    psubw mm1,mm6
-    movq mm6,[0x60+X]
-    paddw mm0,mm5
-    movq mm5,[0x50+X]
-    paddw mm0,mm4
-    movq mm4,[0x40+X]
-    /*We inline stage1 of the transform here so we can get better instruction
-       scheduling with the shifts.*/
-    /*mm0=t7'=t0-t7*/
-    psllw mm7,2
-    psubw mm0,mm7
-    psllw mm6,2
-    paddw mm7,mm7
-    /*mm1=t6'=t1-t6*/
-    psllw mm5,2
-    psubw mm1,mm6
-    psllw mm4,2
-    paddw mm6,mm6
-    /*mm2=t5'=t2-t5*/
-    psubw mm2,mm5
-    paddw mm5,mm5
-    /*mm3=t4'=t3-t4*/
-    psubw mm3,mm4
-    paddw mm4,mm4
-    /*mm7=t0'=t0+t7*/
-    paddw mm7,mm0
-    /*mm6=t1'=t1+t6*/
-    paddw mm6,mm1
-    /*mm5=t2'=t2+t5*/
-    paddw mm5,mm2
-    /*mm4=t3'=t3+t4*/
-    paddw mm4,mm3
-    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
-    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
-    /*Swap out this 8x4 block for the next one.*/
-    movq mm0,[0x08+X]
-    movq [0x30+Y],mm7
-    movq mm7,[0x78+X]
-    movq [0x50+Y],mm1
-    movq mm1,[0x18+X]
-    movq [0x20+Y],mm6
-    movq mm6,[0x68+X]
-    movq [0x60+Y],mm2
-    movq mm2,[0x28+X]
-    movq [0x10+Y],mm5
-    movq mm5,[0x58+X]
-    movq [0x70+Y],mm3
-    movq mm3,[0x38+X]
-    /*And increase its working precision, too.*/
-    psllw mm0,2
-    movq [0x00+Y],mm4
-    psllw mm7,2
-    movq mm4,[0x48+X]
-    /*We inline stage1 of the transform here so we can get better instruction
-       scheduling with the shifts.*/
-    /*mm0=t7'=t0-t7*/
-    psubw mm0,mm7
-    psllw mm1,2
-    paddw mm7,mm7
-    psllw mm6,2
-    /*mm1=t6'=t1-t6*/
-    psubw mm1,mm6
-    psllw mm2,2
-    paddw mm6,mm6
-    psllw mm5,2
-    /*mm2=t5'=t2-t5*/
-    psubw mm2,mm5
-    psllw mm3,2
-    paddw mm5,mm5
-    psllw mm4,2
-    /*mm3=t4'=t3-t4*/
-    psubw mm3,mm4
-    paddw mm4,mm4
-    /*mm7=t0'=t0+t7*/
-    paddw mm7,mm0
-    /*mm6=t1'=t1+t6*/
-    paddw mm6,mm1
-    /*mm5=t2'=t2+t5*/
-    paddw mm5,mm2
-    /*mm4=t3'=t3+t4*/
-    paddw mm4,mm3
-    OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
-    OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
-    /*Here the first 4x4 block of output from the last transpose is the second
-       4x4 block of input for the next transform.
-      We have cleverly arranged that it already be in the appropriate place,
-       so we only have to do half the stores and loads.*/
-    movq mm0,[0x00+Y]
-    movq [0x58+Y],mm1
-    movq mm1,[0x10+Y]
-    movq [0x68+Y],mm2
-    movq mm2,[0x20+Y]
-    movq [0x78+Y],mm3
-    movq mm3,[0x30+Y]
-    OC_FDCT_STAGE1_8x4
-    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
-    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
-    /*mm0={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x18+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x08+Y]
-    psraw mm3,2
-    psubw mm5,mm0
-    psraw mm4,2
-    psubw mm6,mm0
-    psraw mm5,2
-    psubw mm7,mm0
-    psraw mm6,2
-    psubw mm1,mm0
-    psraw mm7,2
-    movq mm0,[0x40+Y]
-    psraw mm1,2
-    movq [0x30+Y],mm7
-    movq mm7,[0x78+Y]
-    movq [0x08+Y],mm1
-    movq mm1,[0x50+Y]
-    movq [0x20+Y],mm6
-    movq mm6,[0x68+Y]
-    movq [0x28+Y],mm2
-    movq mm2,[0x60+Y]
-    movq [0x10+Y],mm5
-    movq mm5,[0x58+Y]
-    movq [0x38+Y],mm3
-    movq mm3,[0x70+Y]
-    movq [0x00+Y],mm4
-    movq mm4,[0x48+Y]
-    OC_FDCT_STAGE1_8x4
-    OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
-    OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
-    /*mm0={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x58+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x48+Y]
-    psraw mm3,2
-    psubw mm5,mm0
-    movq [0x68+Y],mm2
-    psraw mm4,2
-    psubw mm6,mm0
-    movq [0x78+Y],mm3
-    psraw mm5,2
-    psubw mm7,mm0
-    movq [0x40+Y],mm4
-    psraw mm6,2
-    psubw mm1,mm0
-    movq [0x50+Y],mm5
-    psraw mm7,2
-    movq [0x60+Y],mm6
-    psraw mm1,2
-    movq [0x70+Y],mm7
-    movq [0x48+Y],mm1
-#undef Y
-#undef A
-#undef X
-  }
-}
-
-#endif
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/ 
+ /*MMX fDCT implementation for x86_32*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+#define OC_FDCT_STAGE1_8x4  __asm{ \
+  /*Stage 1: in-place butterflies; each pair does x-=y, y+=y, then y+=x to leave x-y and x+y.*/ \
+  /*mm0=t7'=t0-t7*/ \
+  __asm  psubw mm0,mm7 \
+  __asm  paddw mm7,mm7 \
+  /*mm1=t6'=t1-t6*/ \
+  __asm  psubw mm1, mm6 \
+  __asm  paddw mm6,mm6 \
+  /*mm2=t5'=t2-t5*/ \
+  __asm  psubw mm2,mm5 \
+  __asm  paddw mm5,mm5 \
+  /*mm3=t4'=t3-t4*/ \
+  __asm  psubw mm3,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm7=t0'=t0+t7*/ \
+  __asm  paddw mm7,mm0 \
+  /*mm6=t1'=t1+t6*/  \
+  __asm  paddw mm6,mm1 \
+  /*mm5=t2'=t2+t5*/ \
+  __asm  paddw mm5,mm2 \
+  /*mm4=t3'=t3+t4*/ \
+  __asm  paddw mm4,mm3\
+}
+
+#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*Stages 2-4 of the fDCT, four 16-bit lanes per register; _r0.._r7 are byte offsets from Y for spills and results. Stage 2:*/ \
+  /*mm7=t3''=t0'-t3'*/ \
+  __asm  psubw mm7,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm6=t2''=t1'-t2'*/ \
+  __asm  psubw mm6,mm5 \
+  __asm  movq [Y+_r6],mm7 \
+  __asm  paddw mm5,mm5 \
+  /*mm1=t5''=t6'-t5'*/ \
+  __asm  psubw mm1,mm2 \
+  __asm  movq [Y+_r2],mm6 \
+  /*mm4=t0''=t0'+t3'*/ \
+  __asm  paddw mm4,mm7 \
+  __asm  paddw mm2,mm2 \
+  /*mm5=t1''=t1'+t2'*/ \
+  __asm  movq [Y+_r0],mm4 \
+  __asm  paddw mm5,mm6 \
+  /*mm2=t6''=t6'+t5'*/ \
+  __asm  paddw mm2,mm1 \
+  __asm  movq [Y+_r4],mm5 \
+  /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
+  /*mm4, mm5, mm6, mm7 are free.*/ \
+  /*Stage 3:*/ \
+  /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
+  __asm  mov A,0x5A806A0A \
+  __asm  pcmpeqb mm6,mm6 \
+  __asm  movd mm7,A \
+  __asm  psrlw mm6,15 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm6,mm6 \
+  /*mm0=0, mm2={-1}x4 \
+    mm5:mm4=t5''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm2,mm2 \
+  /*mm2=t6'', mm1=t5''+(t5''!=0) \
+    mm4=(t5''*27146+0xB500>>16)*/ \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm0,mm2 \
+  __asm  movq mm2, [Y+_r3] \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm1,mm0 \
+  __asm  packssdw mm4,mm5 \
+  /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm0, [Y+_r7] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm1,mm3 \
+  /*mm3=t4''=t4'+s*/ \
+  __asm  paddw mm3,mm4 \
+  /*mm1=t5'''=t4'-s*/ \
+  __asm  psubw mm1,mm4 \
+  /*mm1=0, mm3={-1}x4 \
+    mm5:mm4=t6''*27146+0xB500*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r1],mm3 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm1,mm1 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm3,mm3 \
+  /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqw mm1,mm2 \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm2,mm1 \
+  /*mm1=t1'' \
+    mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+  __asm  paddw mm4,mm2 \
+  __asm  movq mm1,[Y+_r4] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm2,mm0 \
+  /*mm7={54491-0x7FFF,0x7FFF}x2 \
+    mm0=t7''=t7'+s*/ \
+  __asm  paddw mm0,mm4 \
+  /*mm2=t6'''=t7'-s*/ \
+  __asm  psubw mm2,mm4 \
+  /*Stage 4:*/ \
+  /*mm0=0, mm2=t0'' \
+    mm5:mm4=t1''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq mm2,[Y+_r0] \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm0,mm0 \
+  /*mm7={27146,0x4000>>1}x2 \
+    mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  mov A,0x20006A0A \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  movd mm7,A \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm0,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm0,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm0,mm4 \
+  /*mm6={0x00000E3D}x2 \
+    mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  mov A,0x0E3D \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm1,mm1 \
+  __asm  punpckldq mm6,mm6 \
+  __asm  pcmpeqw mm1,mm2 \
+  /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm2,mm1 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movq mm1,[Y+_r5] \
+  __asm  paddw mm4,mm2 \
+  /*mm2=t6''', mm0=_y[0]=u=r+s>>1 \
+    The naive implementation could cause overflow, so we use \
+     u=(r&s)+((r^s)>>1).*/ \
+  __asm  movq mm2,[Y+_r3] \
+  __asm  movq mm7,mm0 \
+  __asm  pxor mm0,mm4 \
+  __asm  pand mm7,mm4 \
+  __asm  psraw mm0,1 \
+  __asm  mov A,0x7FFF54DC \
+  __asm  paddw mm0,mm7 \
+  __asm  movd mm7,A \
+  /*mm7={54491-0x7FFF,0x7FFF}x2 \
+    mm4=_y[4]=v=r-u*/ \
+  __asm  psubw mm4,mm0 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  movq [Y+_r4],mm4 \
+  /*mm0=0, mm7={36410}x4 \
+    mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  mov A,0x8E3A8E3A \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r0],mm0 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm0=0 \
+    mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  paddw mm1,mm2 \
+  __asm  pmullw mm3,mm7 \
+  __asm  pxor mm0,mm0 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t7'', mm7={26568,0x3400}x2 \
+    mm2=s=t6'''-(36410*u>>16)*/ \
+  __asm  movq mm1,mm4 \
+  __asm  mov A,0x340067C8 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  movd mm7,A \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  psubw mm2,mm4 \
+  /*mm6={0x00007B1B}x2 \
+    mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x7B1B \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={64277-0x7FFF,0x7FFF}x2 \
+    mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+  __asm  psrad mm4,17 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,17 \
+  __asm  mov A,0x7FFF7B16 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={12785}x4 \
+    mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r1] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x31F131F1 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=12785*t4''+((t7''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[1]=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t3'', mm7={20539,0x3000}x2 \
+    mm4=s=(12785*u>>16)-t4''*/ \
+  __asm  movq [Y+_r1],mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  mov A,0x3000503B \
+  __asm  movq mm1,[Y+_r6] \
+  __asm  movd mm7,A \
+  __asm  psubw mm4,mm2 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm6={0x00006CB7}x2 \
+    mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
+  __asm  movq mm5,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x6CB7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={60547-0x7FFF,0x7FFF}x2 \
+    mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+  __asm  psrad mm4,20 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,20 \
+  __asm  mov A,0x7FFF6C84 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={25080}x4 \
+    mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r7],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r2] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x61F861F8 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  movd mm7,A \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm1={-1}x4 \
+    mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  mov A,0x28005460 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm1,mm1 \
+  __asm  packssdw mm4,mm5 \
+  /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
+    mm4=s=(25080*u>>16)-t2''*/ \
+  __asm  movq mm6,mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  pxor mm5,mm5 \
+  __asm  movd mm7,A \
+  __asm  psubw mm5,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  psubw mm4,mm2 \
+  /*mm2=s+(s!=0) \
+    mm4:mm3=s*21600+0x2800*/ \
+  __asm  movq mm3,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpckhwd mm4,mm5 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  psubw mm0,mm1 \
+  __asm  punpcklwd mm3,mm5 \
+  __asm  paddw mm2,mm0 \
+  __asm  pmaddwd mm3,mm7 \
+  /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
+    mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+  __asm  movq mm0,[Y+_r4] \
+  __asm  psrad mm4,18 \
+  __asm  movq mm5,[Y+_r5] \
+  __asm  psrad mm3,18 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  packssdw mm3,mm4 \
+  __asm  movq mm4,[Y+_r0] \
+  __asm  paddw mm3,mm2 \
+}
+
+/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7];
+   _y[1] and _y[3] are read back from [Y+_r1] and [Y+_r3].
+  On output, {[Y+_r4],mm1,mm2,mm3} is the transpose of _y[4...7] and {mm4,mm5,mm6,mm7} that of _y[0...3].*/
+#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*First 4x4 transpose:*/ \
+  /*mm0 = e3 e2 e1 e0 \
+    mm5 = f3 f2 f1 f0 \
+    mm3 = g3 g2 g1 g0 \
+    mm1 = h3 h2 h1 h0*/ \
+  __asm  movq mm2,mm0 \
+  __asm  punpcklwd mm0,mm5 \
+  __asm  punpckhwd mm2,mm5 \
+  __asm  movq mm5,mm3 \
+  __asm  punpcklwd mm3,mm1 \
+  __asm  punpckhwd mm5,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm2 = f3 e3 f2 e2 \
+    mm3 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm3 \
+  __asm  movq [Y+_r4],mm0 \
+  __asm  punpckhdq mm1,mm3 \
+  __asm  movq mm0,[Y+_r1] \
+  __asm  movq mm3,mm2 \
+  __asm  punpckldq mm2,mm5 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq mm5,[Y+_r3] \
+  /*_y[4] = h0 g0 f0 e0 \
+   mm1  = h1 g1 f1 e1 \
+   mm2  = h2 g2 f2 e2 \
+   mm3  = h3 g3 f3 e3*/ \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm0 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm5 = d3 d2 d1 d0*/ \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm0 \
+  __asm  punpckhwd mm7,mm0 \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm5 \
+  __asm  punpckhwd mm0,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
+}
+
+/*MMX implementation of the 8x8 fDCT: reads the 64 values at _x and stores the 64 results at _y (also used as scratch).*/
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  /*Uses only eax, ecx, edx and mm0-mm7; no emms is issued here (NOTE(review): presumably left to the caller - confirm).*/
+  __asm{
+#define Y eax
+#define A ecx
+#define X edx
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add biases to correct for some systematic error that remains in
+       the full fDCT->iDCT round trip.*/
+    mov X, _x
+    mov Y, _y
+    movq mm0,[0x00+X]
+    movq mm1,[0x10+X]
+    movq mm2,[0x20+X]
+    movq mm3,[0x30+X]
+    pcmpeqb mm4,mm4
+    pxor mm7,mm7
+    movq mm5,mm0
+    psllw mm0,2
+    pcmpeqw mm5,mm7
+    movq mm7,[0x70+X]
+    psllw mm1,2
+    psubw mm5,mm4
+    psllw mm2,2
+    mov A,1
+    pslld mm5,16
+    movd mm6,A
+    psllq mm5,16
+    mov A,0x10001
+    psllw mm3,2
+    movd mm4,A
+    punpckhwd mm5,mm6
+    psubw mm1,mm6
+    movq mm6,[0x60+X]
+    paddw mm0,mm5
+    movq mm5,[0x50+X]
+    paddw mm0,mm4
+    movq mm4,[0x40+X]
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    psllw mm7,2
+    psubw mm0,mm7
+    psllw mm6,2
+    paddw mm7,mm7
+    /*mm1=t6'=t1-t6*/
+    psllw mm5,2
+    psubw mm1,mm6
+    psllw mm4,2
+    paddw mm6,mm6
+    /*mm2=t5'=t2-t5*/
+    psubw mm2,mm5
+    paddw mm5,mm5
+    /*mm3=t4'=t3-t4*/
+    psubw mm3,mm4
+    paddw mm4,mm4
+    /*mm7=t0'=t0+t7*/
+    paddw mm7,mm0
+    /*mm6=t1'=t1+t6*/
+    paddw mm6,mm1
+    /*mm5=t2'=t2+t5*/
+    paddw mm5,mm2
+    /*mm4=t3'=t3+t4*/
+    paddw mm4,mm3
+    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
+    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
+    /*Swap out this 8x4 block for the next one.*/
+    movq mm0,[0x08+X]
+    movq [0x30+Y],mm7
+    movq mm7,[0x78+X]
+    movq [0x50+Y],mm1
+    movq mm1,[0x18+X]
+    movq [0x20+Y],mm6
+    movq mm6,[0x68+X]
+    movq [0x60+Y],mm2
+    movq mm2,[0x28+X]
+    movq [0x10+Y],mm5
+    movq mm5,[0x58+X]
+    movq [0x70+Y],mm3
+    movq mm3,[0x38+X]
+    /*And increase its working precision, too.*/
+    psllw mm0,2
+    movq [0x00+Y],mm4
+    psllw mm7,2
+    movq mm4,[0x48+X]
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    psubw mm0,mm7
+    psllw mm1,2
+    paddw mm7,mm7
+    psllw mm6,2
+    /*mm1=t6'=t1-t6*/
+    psubw mm1,mm6
+    psllw mm2,2
+    paddw mm6,mm6
+    psllw mm5,2
+    /*mm2=t5'=t2-t5*/
+    psubw mm2,mm5
+    psllw mm3,2
+    paddw mm5,mm5
+    psllw mm4,2
+    /*mm3=t4'=t3-t4*/
+    psubw mm3,mm4
+    paddw mm4,mm4
+    /*mm7=t0'=t0+t7*/
+    paddw mm7,mm0
+    /*mm6=t1'=t1+t6*/
+    paddw mm6,mm1
+    /*mm5=t2'=t2+t5*/
+    paddw mm5,mm2
+    /*mm4=t3'=t3+t4*/
+    paddw mm4,mm3
+    OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
+    OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place,
+       so we only have to do half the stores and loads.*/
+    movq mm0,[0x00+Y]
+    movq [0x58+Y],mm1
+    movq mm1,[0x10+Y]
+    movq [0x68+Y],mm2
+    movq mm2,[0x20+Y]
+    movq [0x78+Y],mm3
+    movq mm3,[0x30+Y]
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
+    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
+    /*mm0={-2}x4*/
+    pcmpeqw mm0,mm0
+    paddw mm0,mm0
+    /*Round the results: subtracting -2 adds 2, then >>2 drops the two extra bits of precision.*/
+    psubw mm1,mm0
+    psubw mm2,mm0
+    psraw mm1,2
+    psubw mm3,mm0
+    movq [0x18+Y],mm1
+    psraw mm2,2
+    psubw mm4,mm0
+    movq mm1,[0x08+Y]
+    psraw mm3,2
+    psubw mm5,mm0
+    psraw mm4,2
+    psubw mm6,mm0
+    psraw mm5,2
+    psubw mm7,mm0
+    psraw mm6,2
+    psubw mm1,mm0
+    psraw mm7,2
+    movq mm0,[0x40+Y]
+    psraw mm1,2
+    movq [0x30+Y],mm7
+    movq mm7,[0x78+Y]
+    movq [0x08+Y],mm1
+    movq mm1,[0x50+Y]
+    movq [0x20+Y],mm6
+    movq mm6,[0x68+Y]
+    movq [0x28+Y],mm2
+    movq mm2,[0x60+Y]
+    movq [0x10+Y],mm5
+    movq mm5,[0x58+Y]
+    movq [0x38+Y],mm3
+    movq mm3,[0x70+Y]
+    movq [0x00+Y],mm4
+    movq mm4,[0x48+Y]
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
+    OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
+    /*mm0={-2}x4*/
+    pcmpeqw mm0,mm0
+    paddw mm0,mm0
+    /*Round the results: subtracting -2 adds 2, then >>2 drops the two extra bits of precision.*/
+    psubw mm1,mm0
+    psubw mm2,mm0
+    psraw mm1,2
+    psubw mm3,mm0
+    movq [0x58+Y],mm1
+    psraw mm2,2
+    psubw mm4,mm0
+    movq mm1,[0x48+Y]
+    psraw mm3,2
+    psubw mm5,mm0
+    movq [0x68+Y],mm2
+    psraw mm4,2
+    psubw mm6,mm0
+    movq [0x78+Y],mm3
+    psraw mm5,2
+    psubw mm7,mm0
+    movq [0x40+Y],mm4
+    psraw mm6,2
+    psubw mm1,mm0
+    movq [0x50+Y],mm5
+    psraw mm7,2
+    movq [0x60+Y],mm6
+    psraw mm1,2
+    movq [0x70+Y],mm7
+    movq [0x48+Y],mm1
+#undef Y
+#undef A
+#undef X
+  }
+}
+
+#endif
diff --git a/thirdparty/nanosvg/LICENSE.txt b/thirdparty/nanosvg/LICENSE.txt
index 6fde401cb28..f896f2eb0f0 100644
--- a/thirdparty/nanosvg/LICENSE.txt
+++ b/thirdparty/nanosvg/LICENSE.txt
@@ -1,18 +1,18 @@
-Copyright (c) 2013-14 Mikko Mononen memon@inside.org
-
-This software is provided 'as-is', without any express or implied
-warranty.  In no event will the authors be held liable for any damages
-arising from the use of this software.
-
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it
-freely, subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not
-claim that you wrote the original software. If you use this software
-in a product, an acknowledgment in the product documentation would be
-appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be
-misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
+Copyright (c) 2013-14 Mikko Mononen memon@inside.org
+
+This software is provided 'as-is', without any express or implied
+warranty.  In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+