diff --git a/src/enc/analysis.c b/src/enc/analysis.c
index 4ff3edd2..77b17ab8 100644
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@@ -384,32 +384,111 @@ static void ResetAllMBInfo(VP8Encoder* const enc) {
   // Default susceptibilities.
   enc->dqm_[0].alpha_ = 0;
   enc->dqm_[0].beta_ = 0;
-  // Note: we can't compute this alpha_ / uv_alpha_.
+  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
+  enc->alpha_ = 0;
+  enc->uv_alpha_ = 0;
   WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }
 
+// struct used to collect job result
+typedef struct {
+  WebPWorker worker;          // worker that runs DoSegmentsJob() on this job
+  int alphas[MAX_ALPHA + 1];  // updated by MBAnalyze(), fed to AssignSegments()
+  int alpha, uv_alpha;        // updated by MBAnalyze(), later divided by #mb
+  VP8EncIterator it;          // iterator over the macroblocks of this job
+  int delta_progress;         // passed to VP8IteratorProgress(); 0 = no report
+} SegmentJob;
+
+// main work call; takes untyped arguments so it can be installed as a hook
+static int DoSegmentsJob(void* arg1, void* arg2) {
+  SegmentJob* const job = (SegmentJob*)arg1;         // where results go
+  VP8EncIterator* const it = (VP8EncIterator*)arg2;  // macroblocks to visit
+  uint8_t tmp[32 + ALIGN_CST];   // 32 bytes of scratch, plus alignment slack
+  int ok = 1;
+  if (VP8IteratorIsDone(it)) return ok;   // empty job: nothing to analyze
+  do {
+    // Let's pretend we have perfect lossless reconstruction.
+    VP8IteratorImport(it, (uint8_t*)DO_ALIGN(tmp));
+    MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
+    ok = VP8IteratorProgress(it, job->delta_progress);
+  } while (ok && VP8IteratorNext(it));
+  return ok;   // 0 if a progress report failed (e.g. user abort)
+}
+
+static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
+  int k = MAX_ALPHA + 1;   // number of entries in alphas[]
+  dst->alpha += src->alpha;
+  dst->uv_alpha += src->uv_alpha;
+  while (k-- > 0) dst->alphas[k] += src->alphas[k];   // element-wise sum
+}
+
+// Sets up 'job' to analyze the macroblock rows [start_row, end_row).
+static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
+                           int start_row, int end_row) {
+  memset(job->alphas, 0, sizeof(job->alphas));
+  job->alpha = 0;
+  job->uv_alpha = 0;
+  // Only the job starting at row 0 records the progress, since we don't
+  // expect the user's hook to be multi-thread safe.
+  job->delta_progress = (start_row != 0) ? 0 : 20;
+  VP8IteratorInit(enc, &job->it);
+  VP8IteratorSetRow(&job->it, start_row);
+  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
+  WebPWorkerInit(&job->worker);
+  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
+  job->worker.data1 = job;
+  job->worker.data2 = &job->it;
+}
+
+// main entry point
 int VP8EncAnalyze(VP8Encoder* const enc) {
   int ok = 1;
   const int do_segments =
       enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
       (enc->segment_hdr_.num_segments_ > 1) ||
       (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
-  enc->alpha_ = 0;
-  enc->uv_alpha_ = 0;
   if (do_segments) {
-    int alphas[MAX_ALPHA + 1] = { 0 };
-    VP8EncIterator it;
-
-    VP8IteratorInit(enc, &it);
-    do {
-      VP8IteratorImport(&it);
-      MBAnalyze(&it, alphas, &enc->alpha_, &enc->uv_alpha_);
-      ok = VP8IteratorProgress(&it, 20);
-      // Let's pretend we have perfect lossless reconstruction.
-    } while (ok && VP8IteratorNext(&it, it.yuv_in_));
-    enc->alpha_ /= enc->mb_w_ * enc->mb_h_;
-    enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
-    if (ok) AssignSegments(enc, alphas);
+    const int last_row = enc->mb_h_;
+    // We give a little more than half of the work to the main thread.
+    const int split_row = (9 * last_row + 15) >> 4;
+    const int total_mb = last_row * enc->mb_w_;
+#ifdef WEBP_USE_THREAD
+    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
+    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
+#else
+    const int do_mt = 0;
+#endif
+    SegmentJob main_job;
+    if (do_mt) {
+      SegmentJob side_job;
+      // Note: statuses are accumulated with '&=' rather than '&&' because we
+      // must call the functions no matter what.
+      InitSegmentJob(enc, &main_job, 0, split_row);
+      InitSegmentJob(enc, &side_job, split_row, last_row);
+      // we don't need to call Reset() on main_job.worker, since we're calling
+      // WebPWorkerExecute() on it
+      ok &= WebPWorkerReset(&side_job.worker);
+      // launch the two jobs in parallel
+      if (ok) {
+        WebPWorkerLaunch(&side_job.worker);
+        WebPWorkerExecute(&main_job.worker);
+        ok &= WebPWorkerSync(&side_job.worker);
+        ok &= WebPWorkerSync(&main_job.worker);
+      }
+      WebPWorkerEnd(&side_job.worker);
+      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
+    } else {
+      // Even for single-thread case, we use the generic Worker tools.
+      InitSegmentJob(enc, &main_job, 0, last_row);
+      WebPWorkerExecute(&main_job.worker);
+      ok &= WebPWorkerSync(&main_job.worker);
+    }
+    WebPWorkerEnd(&main_job.worker);
+    if (ok) {
+      enc->alpha_ = main_job.alpha / total_mb;
+      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
+      AssignSegments(enc, main_job.alphas);
+    }
   } else {   // Use only one default segment.
     ResetAllMBInfo(enc);
   }
diff --git a/src/enc/frame.c b/src/enc/frame.c
index d71f72e8..4624d913 100644
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -721,7 +721,7 @@ static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt,
   VP8IteratorInit(enc, &it);
   do {
     VP8ModeScore info;
-    VP8IteratorImport(&it);
+    VP8IteratorImport(&it, NULL);
     if (VP8Decimate(&it, &info, rd_opt)) {
       // Just record the number of skips and act like skip_proba is not used.
       enc->proba_.nb_skip_++;
@@ -731,7 +731,8 @@ static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt,
     distortion += info.D;
     if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
       return 0;
-  } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
+    VP8IteratorSaveBoundary(&it, it.yuv_out_);
+  } while (VP8IteratorNext(&it) && --nb_mbs > 0);
   size += FinalizeSkipProba(enc);
   size += FinalizeTokenProbas(&enc->proba_);
   size += enc->segment_hdr_.size_;
@@ -877,7 +878,7 @@ int VP8EncLoop(VP8Encoder* const enc) {
     const int dont_use_skip = !enc->proba_.use_skip_proba_;
     const VP8RDLevel rd_opt = enc->rd_opt_level_;
 
-    VP8IteratorImport(&it);
+    VP8IteratorImport(&it, NULL);
     // Warning! order is important: first call VP8Decimate() and
     // *then* decide how to code the skip decision if there's one.
     if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
@@ -894,7 +895,8 @@ int VP8EncLoop(VP8Encoder* const enc) {
     VP8StoreFilterStats(&it);
     VP8IteratorExport(&it);
     ok = VP8IteratorProgress(&it, 20);
-  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
+    VP8IteratorSaveBoundary(&it, it.yuv_out_);
+  } while (ok && VP8IteratorNext(&it));
 
   return PostLoopFinalize(&it, ok);
 }
@@ -937,7 +939,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     VP8TBufferClear(&enc->tokens_);
     do {
       VP8ModeScore info;
-      VP8IteratorImport(&it);
+      VP8IteratorImport(&it, NULL);
       if (--cnt < 0) {
         FinalizeTokenProbas(proba);
         VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
@@ -956,7 +958,8 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
         VP8IteratorExport(&it);
         ok = VP8IteratorProgress(&it, 20);
       }
-    } while (ok && VP8IteratorNext(&it, it.yuv_out_));
+      VP8IteratorSaveBoundary(&it, it.yuv_out_);
+    } while (ok && VP8IteratorNext(&it));
   }
   ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
   if (ok) {
diff --git a/src/enc/iterator.c b/src/enc/iterator.c
index 781ce9fa..6ad14aa5 100644
--- a/src/enc/iterator.c
+++ b/src/enc/iterator.c
@@ -55,13 +55,21 @@ void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
 void VP8IteratorReset(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   VP8IteratorSetRow(it, 0);
-  it->count_down_ = enc->mb_w_ * enc->mb_h_;
+  VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
   InitTop(it);
   InitLeft(it);
   memset(it->bit_count_, 0, sizeof(it->bit_count_));
   it->do_trellis_ = 0;
 }
 
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
+  it->count_down0_ = it->count_down_ = count_down;   // start value is kept
+}
+
+int VP8IteratorIsDone(const VP8EncIterator* const it) {
+  return !(it->count_down_ > 0);   // true once the count-down is exhausted
+}
+
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
   it->y_stride_  = enc->pic_->y_stride;
@@ -81,9 +89,10 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
 int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
   VP8Encoder* const enc = it->enc_;
   if (delta && enc->pic_->progress_hook != NULL) {
-    const int percent = (enc->mb_h_ <= 1)
+    const int done = it->count_down0_ - it->count_down_;
+    const int percent = (it->count_down0_ <= 0)
                       ? it->percent0_
-                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
+                      : it->percent0_ + delta * done / it->count_down0_;
     return WebPReportProgress(enc->pic_, percent, &enc->percent_);
   }
   return 1;
@@ -93,6 +102,8 @@ int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.
 
+static WEBP_INLINE int MinSize(int a, int b) { return (b > a) ? a : b; }
+
 static void ImportBlock(const uint8_t* src, int src_stride,
                         uint8_t* dst, int w, int h, int size) {
   int i;
@@ -110,30 +121,55 @@ static void ImportBlock(const uint8_t* src, int src_stride,
   }
 }
 
-void VP8IteratorImport(const VP8EncIterator* const it) {
+static void ImportLine(const uint8_t* src, int src_stride,
+                       uint8_t* dst, int len, int total_len) {
+  int k;   // copy 'len' strided samples, then replicate the last one
+  for (k = 0; k < len; ++k) dst[k] = src[k * src_stride];
+  while (k < total_len) dst[k++] = dst[len - 1];
+}
+
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
   const VP8Encoder* const enc = it->enc_;
   const int x = it->x_, y = it->y_;
   const WebPPicture* const pic = enc->pic_;
-  const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride  + x) * 16;
   const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
   const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
-  uint8_t* const udst = it->yuv_in_ + U_OFF;
-  uint8_t* const vdst = it->yuv_in_ + V_OFF;
-  int w = (pic->width - x * 16);
-  int h = (pic->height - y * 16);
+  const int w = MinSize(pic->width - x * 16, 16);
+  const int h = MinSize(pic->height - y * 16, 16);
+  const int uv_w = (w + 1) >> 1;
+  const int uv_h = (h + 1) >> 1;
 
-  if (w > 16) w = 16;
-  if (h > 16) h = 16;
+  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
+  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
+  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
 
-  // Luma plane
-  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);
+  if (tmp_32 == NULL) return;
 
-  {   // U/V planes
-    const int uv_w = (w + 1) >> 1;
-    const int uv_h = (h + 1) >> 1;
-    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
-    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
+  // Import source (uncompressed) samples into boundary.
+  if (x == 0) {
+    InitLeft(it);
+  } else {
+    if (y == 0) {
+      it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
+    } else {
+      it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
+      it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
+      it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
+    }
+    ImportLine(ysrc - 1, pic->y_stride,  it->y_left_, h,   16);
+    ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
+    ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
+  }
+
+  it->y_top_  = tmp_32 + 0;
+  it->uv_top_ = tmp_32 + 16;
+  if (y == 0) {
+    memset(tmp_32, 127, 32 * sizeof(*tmp_32));
+  } else {
+    ImportLine(ysrc - pic->y_stride,  1, tmp_32,          w,   16);
+    ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16,     uv_w, 8);
+    ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
   }
 }
 
@@ -251,40 +287,40 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 //------------------------------------------------------------------------------
 // Advance to the next position, doing the bookeeping.
 
-int VP8IteratorNext(VP8EncIterator* const it,
-                    const uint8_t* const block_to_save) {
+void VP8IteratorSaveBoundary(VP8EncIterator* const it,
+                             const uint8_t* const block_to_save) {
   VP8Encoder* const enc = it->enc_;
-  if (block_to_save != NULL) {
-    const int x = it->x_, y = it->y_;
-    const uint8_t* const ysrc = block_to_save + Y_OFF;
-    const uint8_t* const usrc = block_to_save + U_OFF;
-    if (x < enc->mb_w_ - 1) {   // left
-      int i;
-      for (i = 0; i < 16; ++i) {
-        it->y_left_[i] = ysrc[15 + i * BPS];
-      }
-      for (i = 0; i < 8; ++i) {
-        it->u_left_[i] = usrc[7 + i * BPS];
-        it->v_left_[i] = usrc[15 + i * BPS];
-      }
-      // top-left (before 'top'!)
-      it->y_left_[-1] = it->y_top_[15];
-      it->u_left_[-1] = it->uv_top_[0 + 7];
-      it->v_left_[-1] = it->uv_top_[8 + 7];
+  const int x = it->x_, y = it->y_;
+  const uint8_t* const ysrc = block_to_save + Y_OFF;
+  const uint8_t* const usrc = block_to_save + U_OFF;
+  if (x < enc->mb_w_ - 1) {   // left
+    int i;
+    for (i = 0; i < 16; ++i) {
+      it->y_left_[i] = ysrc[15 + i * BPS];
     }
-    if (y < enc->mb_h_ - 1) {  // top
-      memcpy(it->y_top_, ysrc + 15 * BPS, 16);
-      memcpy(it->uv_top_, usrc + 7 * BPS, 8 + 8);
+    for (i = 0; i < 8; ++i) {
+      it->u_left_[i] = usrc[7 + i * BPS];
+      it->v_left_[i] = usrc[15 + i * BPS];
     }
+    // top-left (before 'top'!)
+    it->y_left_[-1] = it->y_top_[15];
+    it->u_left_[-1] = it->uv_top_[0 + 7];
+    it->v_left_[-1] = it->uv_top_[8 + 7];
   }
+  if (y < enc->mb_h_ - 1) {  // top
+    memcpy(it->y_top_, ysrc + 15 * BPS, 16);
+    memcpy(it->uv_top_, usrc + 7 * BPS, 8 + 8);
+  }
+}
 
+int VP8IteratorNext(VP8EncIterator* const it) {
   it->preds_ += 4;
   it->mb_ += 1;
   it->nz_ += 1;
   it->y_top_ += 16;
   it->uv_top_ += 16;
   it->x_ += 1;
-  if (it->x_ == enc->mb_w_) {
+  if (it->x_ == it->enc_->mb_w_) {
     VP8IteratorSetRow(it, ++it->y_);
   }
   return (0 < --it->count_down_);
diff --git a/src/enc/quant.c b/src/enc/quant.c
index ffeaec30..f26fafcd 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -367,16 +367,14 @@ const int VP8I4ModeOffsets[NUM_BMODES] = {
 };
 
 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
-  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
+  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
   VP8EncPredLuma16(it->yuv_p_, left, top);
 }
 
 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
-  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
+  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
   VP8EncPredChroma8(it->yuv_p_, left, top);
 }
 
diff --git a/src/enc/tree.c b/src/enc/tree.c
index 5c204bdd..27f0babe 100644
--- a/src/enc/tree.c
+++ b/src/enc/tree.c
@@ -344,7 +344,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
       }
     }
     PutUVMode(bw, mb->uv_mode_);
-  } while (VP8IteratorNext(&it, NULL));
+  } while (VP8IteratorNext(&it));
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h
index a43e17c9..9f4cf27f 100644
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -292,6 +292,7 @@ typedef struct {
   LFStats*      lf_stats_;         // filter stats (borrowed from enc_)
   int           do_trellis_;       // if true, perform extra level optimisation
   int           count_down_;       // number of mb still to be processed
+  int           count_down0_;      // starting counter value (for progress)
   int           percent0_;         // saved initial progress percent
 
   uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
@@ -313,15 +314,22 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
 void VP8IteratorReset(VP8EncIterator* const it);
 // reset iterator position to row 'y'
 void VP8IteratorSetRow(VP8EncIterator* const it, int y);
-// import samples from source
-void VP8IteratorImport(const VP8EncIterator* const it);
+// set count down (=number of iterations to go)
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down);
+// return true if iteration is finished
+int VP8IteratorIsDone(const VP8EncIterator* const it);
+// Import uncompressed samples from source.
+// If tmp_32 is not NULL, import boundary samples too.
+// tmp_32 is a 32-bytes scratch buffer that must be aligned in memory.
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32);
 // export decimated samples
 void VP8IteratorExport(const VP8EncIterator* const it);
-// go to next macroblock. Returns false if not finished. If *block_to_save is
-// non-null, will save the boundary values to top_/left_ arrays. block_to_save
-// can be it->yuv_out_ or it->yuv_in_.
-int VP8IteratorNext(VP8EncIterator* const it,
-                    const uint8_t* const block_to_save);
+// go to next macroblock. Returns false when the iteration is finished.
+int VP8IteratorNext(VP8EncIterator* const it);
+// save the boundary values to top_/left_ arrays for next iterations.
+// block_to_save can be it->yuv_out_ or it->yuv_in_.
+void VP8IteratorSaveBoundary(VP8EncIterator* const it,
+                             const uint8_t* const block_to_save);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
 int VP8IteratorProgress(const VP8EncIterator* const it,
                         int final_delta_percent);