diff --git a/src/dec/dsp.c b/src/dec/dsp.c
index 59ba1a3f..0dea42a4 100644
--- a/src/dec/dsp.c
+++ b/src/dec/dsp.c
@@ -62,7 +62,7 @@ static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
 
-static void Transform(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
   int C[4 * 4], *tmp;
   int i;
   tmp = C;
@@ -102,11 +102,16 @@ static void Transform(const int16_t* in, uint8_t* dst) {
 }
 #undef MUL
 
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);  // first block: coeffs in[0..15], output at dst
+  if (do_two) {  // caller asked for a second, adjacent transform
+    TransformOne(in + 16, dst + 4);  // coeffs in[16..31], output at dst + 4
+  }
+}
+
 static void TransformUV(const int16_t* in, uint8_t* dst) {
-  VP8Transform(in + 0 * 16, dst);
-  VP8Transform(in + 1 * 16, dst + 4);
-  VP8Transform(in + 2 * 16, dst + 4 * BPS);
-  VP8Transform(in + 3 * 16, dst + 4 * BPS + 4);
+  VP8Transform(in + 0 * 16, dst, 1);  // blocks 0 and 1, at dst and dst + 4
+  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);  // blocks 2, 3: 4 lines down
 }
 
 static void TransformDC(const int16_t *in, uint8_t* dst) {
@@ -129,7 +134,7 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
 #undef STORE
 
 // default C implementations:
-VP8Idct VP8Transform = Transform;
+VP8Idct2 VP8Transform = TransformTwo;
 VP8Idct VP8TransformUV = TransformUV;
 VP8Idct VP8TransformDC = TransformDC;
 VP8Idct VP8TransformDCUV = TransformDCUV;
diff --git a/src/dec/dsp_sse2.c b/src/dec/dsp_sse2.c
index c4e08676..b7427fdf 100644
--- a/src/dec/dsp_sse2.c
+++ b/src/dec/dsp_sse2.c
@@ -22,7 +22,7 @@ extern "C" {
 //-----------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
-static void TransformSSE2(const int16_t* in, uint8_t* dst) {
+static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -43,8 +43,9 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
   const __m128i k2 = _mm_set1_epi16(-30068);
   __m128i T0, T1, T2, T3;
 
-  // Load the transform coefficients. The second half of the vectors will just
-  // contain random value we'll never use nor store.
+  // Load and concatenate the transform coefficients (we'll do two transforms
+  // in parallel). In the case of only one transform, the second half of the
+  // vectors will just contain values that we'll neither use nor store.
   __m128i in0, in1, in2, in3;
   {
     in0 = _mm_loadl_epi64((__m128i*)&in[0]);
@@ -55,6 +56,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
     // a01 a11 a21 a31   x x x x
     // a02 a12 a22 a32   x x x x
     // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
   }
 
   // Vertical pass and subsequent transpose.
@@ -179,10 +194,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
   {
     const __m128i zero = _mm_set1_epi16(0);
     // Load the reference(s).
-    __m128i dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
-    __m128i dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
-    __m128i dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
-    __m128i dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+    __m128i dst0, dst1, dst2, dst3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
+      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
+      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
+      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+    } else {
+      // Load four bytes/pixels per line.
+      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
+      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
+      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
+      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+    }
     // Convert to 16b.
     dst0 = _mm_unpacklo_epi8(dst0, zero);
     dst1 = _mm_unpacklo_epi8(dst1, zero);
@@ -198,11 +223,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
     dst1 = _mm_packus_epi16(dst1, dst1);
     dst2 = _mm_packus_epi16(dst2, dst2);
     dst3 = _mm_packus_epi16(dst3, dst3);
-    // Store the results, four bytes/pixels per line.
-    *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
-    *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
-    *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
-    *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+    } else {
+      // Store four bytes/pixels per line.
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+    }
   }
 }
 
diff --git a/src/dec/frame.c b/src/dec/frame.c
index 4a0adfe8..46d735f8 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -379,7 +379,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
         uint8_t* const dst = y_dst + kScan[n];
         VP8PredLuma4[dec->imodes_[n]](dst);
         if (dec->non_zero_ac_ & (1 << n)) {
-          VP8Transform(coeffs + n * 16, dst);
+          VP8Transform(coeffs + n * 16, dst, 0);
         } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
           VP8TransformDC(coeffs + n * 16, dst);
         }
@@ -391,7 +391,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
         for (n = 0; n < 16; n++) {
           uint8_t* const dst = y_dst + kScan[n];
           if (dec->non_zero_ac_ & (1 << n)) {
-            VP8Transform(coeffs + n * 16, dst);
+            VP8Transform(coeffs + n * 16, dst, 0);
           } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
             VP8TransformDC(coeffs + n * 16, dst);
           }
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 3f391976..10ac4912 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -297,7 +297,9 @@ int VP8DecodeLayer(VP8Decoder* const dec);
 
 // in dsp.c
 typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
-extern VP8Idct VP8Transform;
+// When do_two is set, coeffs is actually int16_t[2][16] (two transforms).
+typedef void (*VP8Idct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+extern VP8Idct2 VP8Transform;
 extern VP8Idct VP8TransformUV;
 extern VP8Idct VP8TransformDC;
 extern VP8Idct VP8TransformDCUV;