diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index 9f49744d..53332f7f 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -946,14 +946,14 @@ void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
   // Get the comparison results as a bitmask, consisting of two times 16 bits:
   // two identical bits for each result. Concatenate both bitmasks to get a
   // single 32 bit value. Negate the mask to get the position of entries that
-  // are not equal to zero. Finally, mask out least significant bits according
-  // to res->first.
+  // are not equal to zero. We don't need to mask out least significant bits
+  // according to res->first, since coeffs[0] is 0 if res->first > 0.
   const uint32_t mask =
-      ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0)) &
-      -(1U << (res->first << 1));
+      ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
   // The position of the most significant non-zero bit indicates the position of
   // the last non-zero value. Divide the result by two because __movemask_epi8
   // operates on 8 bit values instead of 16 bit values.
+  assert(res->first == 0 || coeffs[0] == 0);
   res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
   res->coeffs = coeffs;
 }
diff --git a/src/enc/cost.c b/src/enc/cost.c
index 5d83f262..9d2cc017 100644
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -562,7 +562,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
                               VP8Residual* const res) {
   int n;
   res->last = -1;
-  for (n = 15; n >= res->first; --n) {
+  assert(res->first == 0 || coeffs[0] == 0);
+  for (n = 15; n >= 0; --n) {
     if (coeffs[n]) {
       res->last = n;
       break;
diff --git a/src/enc/quant.c b/src/enc/quant.c
index c8cdc160..9130a416 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -741,13 +741,17 @@ static int ReconstructIntra16(VP8EncIterator* const it,
             TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                  &dqm->y1_, dqm->lambda_trellis_i16_);
         it->top_nz_[x] = it->left_nz_[y] = non_zero;
+        rd->y_ac_levels[n][0] = 0;
         nz |= non_zero << n;
       }
     }
   } else {
     for (n = 0; n < 16; ++n) {
-      tmp[n][0] = 0;  // so that nz is correct below
+      // Zero-out the first coeff, so that: a) nz is correct below, and
+      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
+      tmp[n][0] = 0;
       nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+      assert(rd->y_ac_levels[n][0] == 0);
     }
   }