feat(Blogger): support blog posts TF-IDF analyse

2025-07-18 23:09:54 +02:00 · 2023-04-24 16:41:26 +02:00
parent 545f630b0e
commit f9f27321ab
18 changed files with 1264 additions and 6 deletions
--- a/Blogger/README.md
+++ b/Blogger/README.md
@ -6,6 +6,7 @@ Blackend for my blog at https://blog.iohub.dev
 ## Change logs

 ### v0.2.x-a
+* Patch 11: Add TF-IDF analysis functionality
 * Patch 10: Migrate code to typescript, use SQLiteDB lib for database access
 * Patch 9: Update to use the new MDE library
 * Patch 8: Support for antOS 2.0.x
--- a/Blogger/api/ai/analyse.lua
+++ b/Blogger/api/ai/analyse.lua
@ -0,0 +1,66 @@
+-- API entry point: builds TF-IDF vectors for the published posts of a blog
+-- database and stores pairwise similarity scores in the "st_similarity" table.
+-- The chunk arguments arrive as a single table; this script reads
+-- args.dbpath and args.top from it.
+local args = ...
+
+-- Response returned to the caller: `error` is false or an error message,
+-- `result` is a status string on success.
+local ret = { 
+    error = false,
+    result = nil
+}
+-- Directory of this script (including the trailing slash), taken from the chunk source.
+local __dir__ = debug.getinfo(1).source:match("@?(.*/)")
+LOG_DEBUG("CURRENT PATH:%s", __dir__)
+-- Load the sibling clustering helpers from the same directory.
+local cluster = loadfile(__dir__.."/cluster.lua")()
+-- NOTE(review): presumably converts a VFS path into an OS path -- confirm against the vfs module.
+local dbpath = require("vfs").ospath(args.dbpath)
+LOG_DEBUG("DB PATH:%s", dbpath)
+
+local gettext = {}
+--- Fetch the id and content of every published post.
+-- @param file OS path of the database file
+-- @return array of records with `id` and `content` fields, or nil when the
+--         model cannot be created or no published post is found
+gettext.get = function(file)
+    local db = DBModel:new{db=file}
+    -- validate the handle before using it: calling db:open() on nil would
+    -- raise instead of reaching the nil return
+    if not db then return nil end
+    db:open()
+    local data = db:find("blogs", {
+        where = { publish = 1 },
+        fields = {"id", "content"}
+    })
+    db:close()
+    if not data or #data == 0 then return nil end
+    return data
+end
+
+--- Load the stop-word list, one word per line.
+-- Surrounding whitespace (including a trailing "\r" from CRLF files) is
+-- stripped and blank lines are skipped, so every entry can compare equal to
+-- a whitespace-free token.
+-- @param ospath OS path of the stop-word file
+-- @return set-like table mapping each word to true
+gettext.stopwords = function(ospath)
+    local words = {}
+    for line in io.lines(ospath) do
+        local word = line:match("^%s*(.-)%s*$")
+        if word ~= "" then
+            words[word] = true
+        end
+    end
+    return words
+end
+
+-- Main flow: tokenize every published post, weight the terms with TF-IDF,
+-- then store the most similar posts of each post in "st_similarity".
+local data = gettext.get(dbpath)
+local documents = {}
+if data then
+    local sw = gettext.stopwords(__dir__.."/stopwords.txt")
+    for _, post in pairs(data) do
+        documents[post.id] = cluster.bow(post.content, sw)
+    end
+
+    cluster.tfidf(documents)
+    local vectors = cluster.get_vectors(documents)
+    -- args.top comes from the client and may be missing or not a number
+    -- (e.g. NaN serialized as null); fall back to a sane default
+    local top_n = tonumber(args.top)
+    if not top_n or top_n ~= top_n or top_n < 1 then top_n = 5 end
+    local analytical =  DBModel:new{db=dbpath}
+    analytical:open()
+    -- purge the table
+    analytical:delete("st_similarity", nil)
+    -- get similarity and put to the table
+    for id in pairs(vectors) do
+        -- guard against a nil result (no other post to compare with)
+        local top = cluster.top_similarity(id, vectors, top_n, 0.1) or {}
+        for sid, score in pairs(top) do
+            analytical:insert("st_similarity", {pid = id, sid = sid, score = score})
+        end
+    end
+    analytical:close()
+    ret.result = "Analyse complete"
+else
+    ret.error = "Unable to query database for post"
+end
+
+return ret
--- a/Blogger/api/ai/cluster.lua
+++ b/Blogger/api/ai/cluster.lua
@ -0,0 +1,346 @@
+local doclassify = {}
+local st = require("stmr")
+--- Build the bag of words of a text.
+-- The text is split on runs of alphanumeric characters (%w+); each token is
+-- lower-cased, filtered against the stop words and then passed to `st.stmr`.
+-- @param data text to tokenize
+-- @param stopwords set-like table; words present as keys are ignored
+-- @return table mapping each resulting word to {count, tf, tfidf}, where
+--         tf = log(1 + count) and tfidf is initialised to 0.0
+doclassify.bow = function(data, stopwords)
+    local bag = {}
+    for token in data:gmatch('%w+') do
+        local lowered = token:lower()
+        if not stopwords[lowered] then
+            local stem = st.stmr(lowered)
+            local entry = bag[stem]
+            if entry then
+                entry.count = entry.count + 1
+            else
+                bag[stem] = {count=1, tf=0, tfidf=0.0}
+            end
+        end
+    end
+    -- log-scaled term frequency
+    for _, entry in pairs(bag) do
+        entry.tf = math.log(1 + entry.count)
+    end
+    return bag
+end
+--- Count the entries of a table (array and hash part alike).
+-- @param table any table
+-- @return number of key/value pairs visited by pairs()
+doclassify.len = function(table)
+    local total = 0
+    for _ in pairs(table) do
+        total = total + 1
+    end
+    return total
+end
+--- Compute the TF-IDF weight of every term of every document, in place.
+-- idf(term) = log(N / df(term)), where N is the number of documents and
+-- df(term) the number of documents containing the term; each bag entry gets
+-- tfidf = tf * idf.
+-- @param documents table mapping a document id to its bag of words
+doclassify.tfidf = function(documents)
+    -- one pass to count the documents and the document frequency of each
+    -- term, instead of rescanning every document for every term
+    local ndoc = 0
+    local df = {}
+    for _, bag in pairs(documents) do
+        ndoc = ndoc + 1
+        for term in pairs(bag) do
+            df[term] = (df[term] or 0) + 1
+        end
+    end
+    for _, bag in pairs(documents) do
+        for term, b in pairs(bag) do
+            b.tfidf = b.tf*math.log(ndoc/df[term])
+        end
+    end
+end
+
+--- Look up the TF-IDF weight of a term in every document.
+-- The term is lower-cased before the lookup; it is not stemmed here, so the
+-- caller has to pass the form that is used as key in the bags.
+-- @param term word to search for
+-- @param documents table mapping a document id to its bag of words
+-- @return table mapping the id of each document containing the term to the
+--         term's tfidf value in that document
+doclassify.search = function(term, documents)
+    local r = {}
+    -- use one normalized key for both the test and the read; mixing the raw
+    -- and the lower-cased term would index a missing entry
+    local key = term:lower()
+    for id, doc in pairs(documents) do
+        local entry = doc[key]
+        if entry then
+            r[id] = entry.tfidf
+        end
+    end
+    return r
+end
+
+--- Turn the bags of words into dense TF-IDF vectors sharing one term index.
+-- @param documents table mapping a document id to its bag of words (with
+--        the tfidf field already computed)
+-- @return vectors: table mapping a document id to an array of weights, one
+--         slot per known term (0 when the document lacks the term)
+-- @return maxv: largest tfidf value found (0 if none is positive)
+-- @return index: number of distinct terms, i.e. the vector size
+-- @return terms: table mapping each term to its position in the vectors
+doclassify.get_vectors = function(documents)
+    local index = 0
+    local terms = {}
+    local maxv = 0
+    -- first pass: give every distinct term a position and track the max weight
+    for _, doc in pairs(documents) do
+        for term, entry in pairs(doc) do
+            if entry.tfidf > maxv then
+                maxv = entry.tfidf
+            end
+            if not terms[term] then
+                index = index + 1
+                terms[term] = index
+            end
+        end
+    end
+    -- second pass: zero-fill each vector, then drop in the document's own
+    -- weights; no need to visit every document for every term occurrence
+    local vectors = {}
+    for id, doc in pairs(documents) do
+        local vector = {}
+        for i = 1, index do vector[i] = 0 end
+        for term, entry in pairs(doc) do
+            vector[terms[term]] = entry.tfidf
+        end
+        vectors[id] = vector
+    end
+    return vectors, maxv, index, terms
+end
+
+--- Cosine similarity of two vectors.
+-- Iterates over the length of `va`; `vb` is read at the same positions.
+-- @return dot(va, vb) / (|va| * |vb|), or 0 when either magnitude is zero
+doclassify.similarity = function(va, vb)
+    local dot, sqa, sqb = 0, 0, 0
+    for i = 1, #va do
+        local a, b = va[i], vb[i]
+        dot = dot + a*b
+        sqa = sqa + a*a
+        sqb = sqb + b*b
+    end
+    local norm_a = math.sqrt(sqa)
+    local norm_b = math.sqrt(sqb)
+    if norm_a == 0 or norm_b == 0 then
+        return 0
+    end
+    return dot/ (norm_b*norm_a)
+end
+--- Similarity of one vector against each vector of a sequence.
+-- @param v1 reference vector
+-- @param collection array of vectors; only the first one is checked to
+--        have the same length as v1
+-- @return array whose i-th slot is the cosine similarity of v1 and collection[i]
+doclassify.similarities = function(v1, collection)
+    assert(#v1 == #(collection[1]), "Incorrect vectors size")
+    local scores = {}
+    local total = #collection
+    local i = 1
+    while i <= total do
+        scores[i] = doclassify.similarity(v1, collection[i])
+        i = i + 1
+    end
+    return scores
+end
+
+--- Average of the position-wise similarities of two vector lists.
+-- @param v1 array of vectors
+-- @param v2 array of vectors, same length as v1 (asserted)
+-- @return mean of similarity(v1[i], v2[i]) over all positions
+doclassify.mean_similarity = function(v1, v2)
+    assert(#v1 == #v2, "Incorrect vectors size")
+    local pairwise = {}
+    local total = #v1
+    for pos = 1, total do
+        pairwise[pos] = doclassify.similarity(v1[pos], v2[pos])
+    end
+    return doclassify.mean(pairwise)
+end
+--- Similarity of one document against all the others.
+-- @param id id of the reference document
+-- @param vectors table mapping document ids to vectors
+-- @return array of similarities, plus a lookup array giving the document id
+--         behind each position; a single empty table (no lookup) when
+--         there is no other document
+doclassify.similarity_chart = function(id, vectors)
+    local others = {}
+    local lut = {}
+    local count = 0
+    for key, vector in pairs(vectors) do
+        if key ~= id then
+            count = count + 1
+            others[count] = vector
+            lut[count] = key
+        end
+    end
+    if not others[1] then
+        return {}
+    end
+    return doclassify.similarities(vectors[id], others), lut
+end
+
+--- Find the documents most similar to a given one.
+-- Candidates are ranked by decreasing similarity; at most `n` of them are
+-- kept and those scoring below the threshold are dropped. Each selected
+-- document keeps its real score (repeatedly zeroing the chart and taking
+-- the maximum again could overwrite an already selected score with 0).
+-- @param id id of the reference document
+-- @param vectors table mapping document ids to equally sized vectors
+-- @param n maximum number of results; when it is not a number every
+--        candidate is considered
+-- @param th optional minimum similarity (defaults to 0, i.e. no filtering)
+-- @return table mapping the id of each selected document to its similarity
+--         with `id`; empty when there is nothing to compare with, so the
+--         result can always be iterated with pairs()
+doclassify.top_similarity = function(id, vectors, n, th)
+    local chart,lut = doclassify.similarity_chart(id,vectors)
+    if not lut or #lut <= 0 then return {} end
+    th = th or 0
+    n = tonumber(n) or #lut
+
+    -- sort the positions rather than the scores so that lut stays aligned
+    local order = {}
+    for i = 1,#chart do order[i] = i end
+    table.sort(order, function(a, b) return chart[a] > chart[b] end)
+
+    local top = {}
+    for rank = 1, math.min(n, #order) do
+        local i = order[rank]
+        -- scores are sorted, so everything after this one is below th too
+        if chart[i] < th then break end
+        top[lut[i]] = chart[i]
+    end
+    return top
+end
+--- Write the vectors to a text file, one "id,v1,v2,..." line per document.
+-- @param vectors table mapping document ids to arrays of numbers
+-- @param name path of the file to (over)write
+-- @return true on success, false when the file cannot be opened
+doclassify.save_vectors = function(vectors, name)
+    local out = io.open(name,"w")
+    if out == nil then
+        return false
+    end
+    for id, vector in pairs(vectors) do
+        out:write(id)
+        local size = #vector
+        for pos = 1, size do
+            out:write(","..vector[pos])
+        end
+        out:write("\n")
+    end
+    out:close()
+    return true
+end
+--- Write the top similarities of every document to a text file.
+-- Each line has the form "<id> <similar id> <score>"; the similarity
+-- threshold is fixed to 0.1.
+-- @param vectors table mapping document ids to vectors
+-- @param name path of the file to (over)write
+-- @param n number of similar documents requested per document
+-- @return true on success, false when the file cannot be opened
+doclassify.save_topchart = function(vectors, name,n)
+    local f = io.open(name,"w")
+    if f == nil then return false end
+    for k in pairs(vectors) do
+        -- guard against a nil result (no other vector to compare with),
+        -- which would make pairs() raise and leave the file open
+        local top = doclassify.top_similarity(k,vectors,n, 0.1) or {}
+        for a,b in pairs(top) do
+            f:write(k.." "..a.." "..b.."\n")
+        end
+    end
+    f:close()
+    return true
+end
+--- Cluster the documents with a k-means style loop driven by cosine similarity.
+-- @param nclass number of clusters
+-- @param documents table mapping a document id to its bag of words
+-- @param maxstep maximum number of iterations
+-- @param ids optional array of document ids whose vectors seed the
+--        centroids; when nil the centroids are drawn at random
+-- @return results: array (1..nclass) of arrays of document ids
+-- @return clusters: table mapping a document id to its cluster index
+-- @return centroids: array of the final centroid vectors
+doclassify.kmean = function(nclass, documents, maxstep, ids)
+    -- vectorize the documents
+    local vectors, maxv, size = doclassify.get_vectors(documents)
+    -- random centroids
+    local centroids = {}
+    local old_centroids = {}
+    local clusters = {}
+    --for pid in pairs(documents) do clusters[pid] = 0 end
+    -- seed the centroids; old_centroids start as zero vectors so the first
+    -- similarity below is 0 and the loop is entered
+    for i = 1,nclass do
+        if ids == nil then
+            centroids[i] = doclassify.random(size,math.floor(maxv))
+        else
+            centroids[i] = vectors[ids[i]]
+        end
+        old_centroids[i] = doclassify.zeros(size)
+    end
+    
+    -- loop until convergence or maxstep reached
+    local similarity = doclassify.mean_similarity(centroids, old_centroids)
+    local step = maxstep
+    while 1.0-similarity > 1e-9 and step > 0 do
+        clusters = {}
+        --echo(JSON.encode(centroids))
+        -- assignment: each document joins the centroid it is most similar to
+        for id,v in pairs(vectors) do
+            local similarities = doclassify.similarities(v, centroids)
+            --echo(JSON.encode(similarities))
+            local cluster, maxvalue = doclassify.argmax(similarities)
+            --echo("doc #"..id.." is in clusters #"..cluster.." max value is "..maxvalue)
+            clusters[id] = cluster
+        end
+        -- storing the old centroids
+        old_centroids = centroids
+        -- calculate new centroids
+        local new_centroids = {}
+        for class in pairs(centroids) do
+            local cnt = 0
+            local cvectors = {}
+            -- collect the vectors currently assigned to this class
+            for id,v in pairs(vectors) do
+                if clusters[id] == class then
+                    cnt = cnt + 1
+                    cvectors[cnt] = v
+                end
+            end
+            -- mean_vector returns a zero vector for a class without members
+            new_centroids[class] = doclassify.mean_vector(cvectors, size)
+        end
+        centroids = new_centroids
+        --echo(JSON.encode(centroids))
+        --echo(JSON.encode(old_centroids))
+        -- convergence measure: mean similarity between new and previous centroids
+        similarity = doclassify.mean_similarity(centroids, old_centroids)
+        echo("step #"..step..", similarity "..similarity)
+        step = step - 1
+    end
+    -- group the document ids by cluster index
+    local results = {}
+    for i = 1,nclass do
+        local list = {}
+        local cnt = 0
+        for id,c in pairs(clusters) do
+            if c == i then
+                cnt = cnt + 1
+                list[cnt] = id
+            end
+        end
+        results[i] = list
+    end
+    return results, clusters, centroids
+end
+
+--- Create a vector of `n` zeros (0.0).
+doclassify.zeros = function(n)
+    local out = {}
+    local pos = 1
+    while pos <= n do
+        out[pos] = 0.0
+        pos = pos + 1
+    end
+    return out
+end
+
+--- Create a vector of `n` random values.
+-- Each value is the sum of a random fraction (math.random()) and a random
+-- integer drawn from 0..maxv.
+doclassify.random = function(n,maxv)
+    local out = {}
+    for pos = 1, n do
+        local fraction = math.random()
+        local whole = math.random(0, maxv)
+        out[pos] = fraction + whole
+    end
+    return out
+end
+
+--- Sum of the elements of an array of numbers (0.0 for an empty array).
+doclassify.sum = function(v)
+    local total = 0.0
+    local size = #v
+    for pos = 1, size do
+        total = total + v[pos]
+    end
+    return total
+end
+
+--- Arithmetic mean of an array of numbers (sum divided by #v).
+doclassify.mean = function(v)
+    local total = doclassify.sum(v)
+    return total/#v
+end
+
+--- Component-wise mean of a list of vectors.
+-- @param vectors array of vectors
+-- @param size length of the resulting vector
+-- @return vector of `size` means; all zeros when `vectors` is nil or empty
+doclassify.mean_vector = function(vectors, size)
+    local means = doclassify.zeros(size)
+    if not vectors or #vectors == 0 then
+        return means
+    end
+    -- accumulate every vector, counting how many were added
+    local count = 0
+    for _, vector in pairs(vectors) do
+        for pos = 1, #vector do
+            means[pos] = means[pos] + vector[pos]
+        end
+        count = count + 1
+    end
+    for pos = 1, size do
+        means[pos] = means[pos]/count
+    end
+    return means
+end
+
+--- Position and value of the smallest element of an array.
+-- The search is seeded with the first element rather than with 0, so
+-- arrays holding only positive values are handled correctly. On ties the
+-- last position wins.
+-- @param v array of numbers
+-- @return index, value of the minimum; 0.0, 0.0 for an empty array
+doclassify.argmin = function(v)
+    local minv = 0.0
+    local mini = 0.0
+    for i = 1,#v do
+        if i == 1 or v[i] <= minv then
+            mini = i
+            minv = v[i]
+        end
+    end
+    return mini, minv
+end
+
+--- Position and value of the largest element of an array.
+-- The search is seeded with the first element rather than with 0, so
+-- arrays holding only negative values are handled correctly. On ties the
+-- last position wins.
+-- @param v array of numbers
+-- @return index, value of the maximum; 0.0, 0.0 for an empty array
+doclassify.argmax = function(v)
+    local maxv = 0.0
+    local maxi = 0.0
+    for i = 1,#v do
+        if i == 1 or v[i] >= maxv then
+            maxi = i
+            maxv = v[i]
+        end
+    end
+    return maxi,maxv
+end
+
+return doclassify
--- a/Blogger/api/ai/stopwords.txt
+++ b/Blogger/api/ai/stopwords.txt
@ -0,0 +1,151 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
--- a/Blogger/api/ai/test.lua
+++ b/Blogger/api/ai/test.lua
@ -0,0 +1,50 @@
+-- Scratch script for the clustering helpers: with `refresh` set it rebuilds
+-- the similarity chart and saves it to `file`; otherwise it reads the saved
+-- chart back and prints it as JSON.
+local path = require("fs/vfs").ospath("home://aiws/blog-clustering")
+local gettext = loadfile(path.."/gettext.lua")()
+local cluster = loadfile(path.."/cluster.lua")()
+
+local refresh = false
+
+local file = "/home/mrsang/test.csv"
+if refresh then
+    local data = gettext.get({publish=1})
+    local documents = {}
+    if data then
+        local sw = gettext.stopwords("home://aiws/blog-clustering/stopwords.txt")
+        for k,v in pairs(data) do
+            local bag = cluster.bow(data[k].content, sw)
+            documents[data[k].id] = bag
+        end
+        cluster.tfidf(documents)
+        local vectors = cluster.get_vectors(documents)
+        local s = cluster.save_topchart(vectors,file, 3)
+        if s then echo("file saved") else echo("error save file") end
+    else
+        echo("Data missing")
+    end
+else
+    local f = io.open(file,"r")
+    if not f then
+        -- the chart has not been saved yet, or the file is unreadable
+        echo("Unable to open "..file)
+    else
+        local result = {}
+        for line in f:lines() do
+            -- each line is "<post id> <similar post id> <score>"; lines with
+            -- fewer than three fields are skipped instead of indexing with nil
+            local pid, sid, score = line:match("^%s*(%S+)%s+(%S+)%s+(%S+)")
+            if pid then
+                if not result[pid] then result[pid] = {} end
+                result[pid][sid] = tonumber(score)
+            end
+        end
+        f:close()
+        echo(JSON.encode(result))
+    end
+end
--- a/Blogger/build.json
+++ b/Blogger/build.json
@ -55,7 +55,7 @@
                    "data": {
                        "src": [
                            "scheme.html",
-                            "api/sendmail.lua",
+                            "api",
                            "package.json",
                            "README.md",
                            "main.css"
--- a/Blogger/build/debug/README.md
+++ b/Blogger/build/debug/README.md
@ -6,6 +6,7 @@ Blackend for my blog at https://blog.iohub.dev
 ## Change logs

 ### v0.2.x-a
+* Patch 11: Add TF-IDF analysis functionality
 * Patch 10: Migrate code to typescript, use SQLiteDB lib for database access
 * Patch 9: Update to use the new MDE library
 * Patch 8: Support for antOS 2.0.x
--- a/Blogger/build/debug/api/ai/analyse.lua
+++ b/Blogger/build/debug/api/ai/analyse.lua
@ -0,0 +1,66 @@
+-- API entry point: builds TF-IDF vectors for the published posts of a blog
+-- database and stores pairwise similarity scores in the "st_similarity" table.
+-- The chunk arguments arrive as a single table; this script reads
+-- args.dbpath and args.top from it.
+local args = ...
+
+-- Response returned to the caller: `error` is false or an error message,
+-- `result` is a status string on success.
+local ret = { 
+    error = false,
+    result = nil
+}
+-- Directory of this script (including the trailing slash), taken from the chunk source.
+local __dir__ = debug.getinfo(1).source:match("@?(.*/)")
+LOG_DEBUG("CURRENT PATH:%s", __dir__)
+-- Load the sibling clustering helpers from the same directory.
+local cluster = loadfile(__dir__.."/cluster.lua")()
+-- NOTE(review): presumably converts a VFS path into an OS path -- confirm against the vfs module.
+local dbpath = require("vfs").ospath(args.dbpath)
+LOG_DEBUG("DB PATH:%s", dbpath)
+
+local gettext = {}
+--- Fetch the id and content of every published post.
+-- @param file OS path of the database file
+-- @return array of records with `id` and `content` fields, or nil when the
+--         model cannot be created or no published post is found
+gettext.get = function(file)
+    local db = DBModel:new{db=file}
+    -- validate the handle before using it: calling db:open() on nil would
+    -- raise instead of reaching the nil return
+    if not db then return nil end
+    db:open()
+    local data = db:find("blogs", {
+        where = { publish = 1 },
+        fields = {"id", "content"}
+    })
+    db:close()
+    if not data or #data == 0 then return nil end
+    return data
+end
+
+--- Load the stop-word list, one word per line.
+-- Surrounding whitespace (including a trailing "\r" from CRLF files) is
+-- stripped and blank lines are skipped, so every entry can compare equal to
+-- a whitespace-free token.
+-- @param ospath OS path of the stop-word file
+-- @return set-like table mapping each word to true
+gettext.stopwords = function(ospath)
+    local words = {}
+    for line in io.lines(ospath) do
+        local word = line:match("^%s*(.-)%s*$")
+        if word ~= "" then
+            words[word] = true
+        end
+    end
+    return words
+end
+
+-- Main flow: tokenize every published post, weight the terms with TF-IDF,
+-- then store the most similar posts of each post in "st_similarity".
+local data = gettext.get(dbpath)
+local documents = {}
+if data then
+    local sw = gettext.stopwords(__dir__.."/stopwords.txt")
+    for _, post in pairs(data) do
+        documents[post.id] = cluster.bow(post.content, sw)
+    end
+
+    cluster.tfidf(documents)
+    local vectors = cluster.get_vectors(documents)
+    -- args.top comes from the client and may be missing or not a number
+    -- (e.g. NaN serialized as null); fall back to a sane default
+    local top_n = tonumber(args.top)
+    if not top_n or top_n ~= top_n or top_n < 1 then top_n = 5 end
+    local analytical =  DBModel:new{db=dbpath}
+    analytical:open()
+    -- purge the table
+    analytical:delete("st_similarity", nil)
+    -- get similarity and put to the table
+    for id in pairs(vectors) do
+        -- guard against a nil result (no other post to compare with)
+        local top = cluster.top_similarity(id, vectors, top_n, 0.1) or {}
+        for sid, score in pairs(top) do
+            analytical:insert("st_similarity", {pid = id, sid = sid, score = score})
+        end
+    end
+    analytical:close()
+    ret.result = "Analyse complete"
+else
+    ret.error = "Unable to query database for post"
+end
+
+return ret
--- a/Blogger/build/debug/api/ai/cluster.lua
+++ b/Blogger/build/debug/api/ai/cluster.lua
@ -0,0 +1,346 @@
+local doclassify = {}
+local st = require("stmr")
+--- Build the bag of words of a text.
+-- The text is split on runs of alphanumeric characters (%w+); each token is
+-- lower-cased, filtered against the stop words and then passed to `st.stmr`.
+-- @param data text to tokenize
+-- @param stopwords set-like table; words present as keys are ignored
+-- @return table mapping each resulting word to {count, tf, tfidf}, where
+--         tf = log(1 + count) and tfidf is initialised to 0.0
+doclassify.bow = function(data, stopwords)
+    local bag = {}
+    for token in data:gmatch('%w+') do
+        local lowered = token:lower()
+        if not stopwords[lowered] then
+            local stem = st.stmr(lowered)
+            local entry = bag[stem]
+            if entry then
+                entry.count = entry.count + 1
+            else
+                bag[stem] = {count=1, tf=0, tfidf=0.0}
+            end
+        end
+    end
+    -- log-scaled term frequency
+    for _, entry in pairs(bag) do
+        entry.tf = math.log(1 + entry.count)
+    end
+    return bag
+end
+--- Count the entries of a table (array and hash part alike).
+-- @param table any table
+-- @return number of key/value pairs visited by pairs()
+doclassify.len = function(table)
+    local total = 0
+    for _ in pairs(table) do
+        total = total + 1
+    end
+    return total
+end
+--- Compute the TF-IDF weight of every term of every document, in place.
+-- idf(term) = log(N / df(term)), where N is the number of documents and
+-- df(term) the number of documents containing the term; each bag entry gets
+-- tfidf = tf * idf.
+-- @param documents table mapping a document id to its bag of words
+doclassify.tfidf = function(documents)
+    -- one pass to count the documents and the document frequency of each
+    -- term, instead of rescanning every document for every term
+    local ndoc = 0
+    local df = {}
+    for _, bag in pairs(documents) do
+        ndoc = ndoc + 1
+        for term in pairs(bag) do
+            df[term] = (df[term] or 0) + 1
+        end
+    end
+    for _, bag in pairs(documents) do
+        for term, b in pairs(bag) do
+            b.tfidf = b.tf*math.log(ndoc/df[term])
+        end
+    end
+end
+
+--- Look up the TF-IDF weight of a term in every document.
+-- The term is lower-cased before the lookup; it is not stemmed here, so the
+-- caller has to pass the form that is used as key in the bags.
+-- @param term word to search for
+-- @param documents table mapping a document id to its bag of words
+-- @return table mapping the id of each document containing the term to the
+--         term's tfidf value in that document
+doclassify.search = function(term, documents)
+    local r = {}
+    -- use one normalized key for both the test and the read; mixing the raw
+    -- and the lower-cased term would index a missing entry
+    local key = term:lower()
+    for id, doc in pairs(documents) do
+        local entry = doc[key]
+        if entry then
+            r[id] = entry.tfidf
+        end
+    end
+    return r
+end
+
+--- Turn the bags of words into dense TF-IDF vectors sharing one term index.
+-- @param documents table mapping a document id to its bag of words (with
+--        the tfidf field already computed)
+-- @return vectors: table mapping a document id to an array of weights, one
+--         slot per known term (0 when the document lacks the term)
+-- @return maxv: largest tfidf value found (0 if none is positive)
+-- @return index: number of distinct terms, i.e. the vector size
+-- @return terms: table mapping each term to its position in the vectors
+doclassify.get_vectors = function(documents)
+    local index = 0
+    local terms = {}
+    local maxv = 0
+    -- first pass: give every distinct term a position and track the max weight
+    for _, doc in pairs(documents) do
+        for term, entry in pairs(doc) do
+            if entry.tfidf > maxv then
+                maxv = entry.tfidf
+            end
+            if not terms[term] then
+                index = index + 1
+                terms[term] = index
+            end
+        end
+    end
+    -- second pass: zero-fill each vector, then drop in the document's own
+    -- weights; no need to visit every document for every term occurrence
+    local vectors = {}
+    for id, doc in pairs(documents) do
+        local vector = {}
+        for i = 1, index do vector[i] = 0 end
+        for term, entry in pairs(doc) do
+            vector[terms[term]] = entry.tfidf
+        end
+        vectors[id] = vector
+    end
+    return vectors, maxv, index, terms
+end
+
+--- Cosine similarity of two vectors.
+-- Iterates over the length of `va`; `vb` is read at the same positions.
+-- @return dot(va, vb) / (|va| * |vb|), or 0 when either magnitude is zero
+doclassify.similarity = function(va, vb)
+    local dot, sqa, sqb = 0, 0, 0
+    for i = 1, #va do
+        local a, b = va[i], vb[i]
+        dot = dot + a*b
+        sqa = sqa + a*a
+        sqb = sqb + b*b
+    end
+    local norm_a = math.sqrt(sqa)
+    local norm_b = math.sqrt(sqb)
+    if norm_a == 0 or norm_b == 0 then
+        return 0
+    end
+    return dot/ (norm_b*norm_a)
+end
+--- Similarity of one vector against each vector of a sequence.
+-- @param v1 reference vector
+-- @param collection array of vectors; only the first one is checked to
+--        have the same length as v1
+-- @return array whose i-th slot is the cosine similarity of v1 and collection[i]
+doclassify.similarities = function(v1, collection)
+    assert(#v1 == #(collection[1]), "Incorrect vectors size")
+    local scores = {}
+    local total = #collection
+    local i = 1
+    while i <= total do
+        scores[i] = doclassify.similarity(v1, collection[i])
+        i = i + 1
+    end
+    return scores
+end
+
+--- Average of the position-wise similarities of two vector lists.
+-- @param v1 array of vectors
+-- @param v2 array of vectors, same length as v1 (asserted)
+-- @return mean of similarity(v1[i], v2[i]) over all positions
+doclassify.mean_similarity = function(v1, v2)
+    assert(#v1 == #v2, "Incorrect vectors size")
+    local pairwise = {}
+    local total = #v1
+    for pos = 1, total do
+        pairwise[pos] = doclassify.similarity(v1[pos], v2[pos])
+    end
+    return doclassify.mean(pairwise)
+end
+--- Similarity of one document against all the others.
+-- @param id id of the reference document
+-- @param vectors table mapping document ids to vectors
+-- @return array of similarities, plus a lookup array giving the document id
+--         behind each position; a single empty table (no lookup) when
+--         there is no other document
+doclassify.similarity_chart = function(id, vectors)
+    local others = {}
+    local lut = {}
+    local count = 0
+    for key, vector in pairs(vectors) do
+        if key ~= id then
+            count = count + 1
+            others[count] = vector
+            lut[count] = key
+        end
+    end
+    if not others[1] then
+        return {}
+    end
+    return doclassify.similarities(vectors[id], others), lut
+end
+
+--- Find the documents most similar to a given one.
+-- Candidates are ranked by decreasing similarity; at most `n` of them are
+-- kept and those scoring below the threshold are dropped. Each selected
+-- document keeps its real score (repeatedly zeroing the chart and taking
+-- the maximum again could overwrite an already selected score with 0).
+-- @param id id of the reference document
+-- @param vectors table mapping document ids to equally sized vectors
+-- @param n maximum number of results; when it is not a number every
+--        candidate is considered
+-- @param th optional minimum similarity (defaults to 0, i.e. no filtering)
+-- @return table mapping the id of each selected document to its similarity
+--         with `id`; empty when there is nothing to compare with, so the
+--         result can always be iterated with pairs()
+doclassify.top_similarity = function(id, vectors, n, th)
+    local chart,lut = doclassify.similarity_chart(id,vectors)
+    if not lut or #lut <= 0 then return {} end
+    th = th or 0
+    n = tonumber(n) or #lut
+
+    -- sort the positions rather than the scores so that lut stays aligned
+    local order = {}
+    for i = 1,#chart do order[i] = i end
+    table.sort(order, function(a, b) return chart[a] > chart[b] end)
+
+    local top = {}
+    for rank = 1, math.min(n, #order) do
+        local i = order[rank]
+        -- scores are sorted, so everything after this one is below th too
+        if chart[i] < th then break end
+        top[lut[i]] = chart[i]
+    end
+    return top
+end
+--- Write the vectors to a text file, one "id,v1,v2,..." line per document.
+-- @param vectors table mapping document ids to arrays of numbers
+-- @param name path of the file to (over)write
+-- @return true on success, false when the file cannot be opened
+doclassify.save_vectors = function(vectors, name)
+    local out = io.open(name,"w")
+    if out == nil then
+        return false
+    end
+    for id, vector in pairs(vectors) do
+        out:write(id)
+        local size = #vector
+        for pos = 1, size do
+            out:write(","..vector[pos])
+        end
+        out:write("\n")
+    end
+    out:close()
+    return true
+end
+--- Write the top similarities of every document to a text file.
+-- Each line has the form "<id> <similar id> <score>"; the similarity
+-- threshold is fixed to 0.1.
+-- @param vectors table mapping document ids to vectors
+-- @param name path of the file to (over)write
+-- @param n number of similar documents requested per document
+-- @return true on success, false when the file cannot be opened
+doclassify.save_topchart = function(vectors, name,n)
+    local f = io.open(name,"w")
+    if f == nil then return false end
+    for k in pairs(vectors) do
+        -- guard against a nil result (no other vector to compare with),
+        -- which would make pairs() raise and leave the file open
+        local top = doclassify.top_similarity(k,vectors,n, 0.1) or {}
+        for a,b in pairs(top) do
+            f:write(k.." "..a.." "..b.."\n")
+        end
+    end
+    f:close()
+    return true
+end
+--- Cluster the documents with a k-means style loop driven by cosine similarity.
+-- @param nclass number of clusters
+-- @param documents table mapping a document id to its bag of words
+-- @param maxstep maximum number of iterations
+-- @param ids optional array of document ids whose vectors seed the
+--        centroids; when nil the centroids are drawn at random
+-- @return results: array (1..nclass) of arrays of document ids
+-- @return clusters: table mapping a document id to its cluster index
+-- @return centroids: array of the final centroid vectors
+doclassify.kmean = function(nclass, documents, maxstep, ids)
+    -- vectorize the documents
+    local vectors, maxv, size = doclassify.get_vectors(documents)
+    -- random centroids
+    local centroids = {}
+    local old_centroids = {}
+    local clusters = {}
+    --for pid in pairs(documents) do clusters[pid] = 0 end
+    -- seed the centroids; old_centroids start as zero vectors so the first
+    -- similarity below is 0 and the loop is entered
+    for i = 1,nclass do
+        if ids == nil then
+            centroids[i] = doclassify.random(size,math.floor(maxv))
+        else
+            centroids[i] = vectors[ids[i]]
+        end
+        old_centroids[i] = doclassify.zeros(size)
+    end
+    
+    -- loop until convergence or maxstep reached
+    local similarity = doclassify.mean_similarity(centroids, old_centroids)
+    local step = maxstep
+    while 1.0-similarity > 1e-9 and step > 0 do
+        clusters = {}
+        --echo(JSON.encode(centroids))
+        -- assignment: each document joins the centroid it is most similar to
+        for id,v in pairs(vectors) do
+            local similarities = doclassify.similarities(v, centroids)
+            --echo(JSON.encode(similarities))
+            local cluster, maxvalue = doclassify.argmax(similarities)
+            --echo("doc #"..id.." is in clusters #"..cluster.." max value is "..maxvalue)
+            clusters[id] = cluster
+        end
+        -- storing the old centroids
+        old_centroids = centroids
+        -- calculate new centroids
+        local new_centroids = {}
+        for class in pairs(centroids) do
+            local cnt = 0
+            local cvectors = {}
+            -- collect the vectors currently assigned to this class
+            for id,v in pairs(vectors) do
+                if clusters[id] == class then
+                    cnt = cnt + 1
+                    cvectors[cnt] = v
+                end
+            end
+            -- mean_vector returns a zero vector for a class without members
+            new_centroids[class] = doclassify.mean_vector(cvectors, size)
+        end
+        centroids = new_centroids
+        --echo(JSON.encode(centroids))
+        --echo(JSON.encode(old_centroids))
+        -- convergence measure: mean similarity between new and previous centroids
+        similarity = doclassify.mean_similarity(centroids, old_centroids)
+        echo("step #"..step..", similarity "..similarity)
+        step = step - 1
+    end
+    -- group the document ids by cluster index
+    local results = {}
+    for i = 1,nclass do
+        local list = {}
+        local cnt = 0
+        for id,c in pairs(clusters) do
+            if c == i then
+                cnt = cnt + 1
+                list[cnt] = id
+            end
+        end
+        results[i] = list
+    end
+    return results, clusters, centroids
+end
+
+--- Create a vector of `n` zeros (0.0).
+doclassify.zeros = function(n)
+    local out = {}
+    local pos = 1
+    while pos <= n do
+        out[pos] = 0.0
+        pos = pos + 1
+    end
+    return out
+end
+
+--- Create a vector of `n` random values.
+-- Each value is the sum of a random fraction (math.random()) and a random
+-- integer drawn from 0..maxv.
+doclassify.random = function(n,maxv)
+    local out = {}
+    for pos = 1, n do
+        local fraction = math.random()
+        local whole = math.random(0, maxv)
+        out[pos] = fraction + whole
+    end
+    return out
+end
+
+--- Sum of the elements of an array of numbers (0.0 for an empty array).
+doclassify.sum = function(v)
+    local total = 0.0
+    local size = #v
+    for pos = 1, size do
+        total = total + v[pos]
+    end
+    return total
+end
+
+--- Arithmetic mean of an array of numbers (sum divided by #v).
+doclassify.mean = function(v)
+    local total = doclassify.sum(v)
+    return total/#v
+end
+
+--- Component-wise mean of a list of vectors.
+-- @param vectors array of vectors
+-- @param size length of the resulting vector
+-- @return vector of `size` means; all zeros when `vectors` is nil or empty
+doclassify.mean_vector = function(vectors, size)
+    local means = doclassify.zeros(size)
+    if not vectors or #vectors == 0 then
+        return means
+    end
+    -- accumulate every vector, counting how many were added
+    local count = 0
+    for _, vector in pairs(vectors) do
+        for pos = 1, #vector do
+            means[pos] = means[pos] + vector[pos]
+        end
+        count = count + 1
+    end
+    for pos = 1, size do
+        means[pos] = means[pos]/count
+    end
+    return means
+end
+
+--- Position and value of the smallest element of an array.
+-- The search is seeded with the first element rather than with 0, so
+-- arrays holding only positive values are handled correctly. On ties the
+-- last position wins.
+-- @param v array of numbers
+-- @return index, value of the minimum; 0.0, 0.0 for an empty array
+doclassify.argmin = function(v)
+    local minv = 0.0
+    local mini = 0.0
+    for i = 1,#v do
+        if i == 1 or v[i] <= minv then
+            mini = i
+            minv = v[i]
+        end
+    end
+    return mini, minv
+end
+
+--- Position and value of the largest element of an array.
+-- The search is seeded with the first element rather than with 0, so
+-- arrays holding only negative values are handled correctly. On ties the
+-- last position wins.
+-- @param v array of numbers
+-- @return index, value of the maximum; 0.0, 0.0 for an empty array
+doclassify.argmax = function(v)
+    local maxv = 0.0
+    local maxi = 0.0
+    for i = 1,#v do
+        if i == 1 or v[i] >= maxv then
+            maxi = i
+            maxv = v[i]
+        end
+    end
+    return maxi,maxv
+end
+
+return doclassify
--- a/Blogger/build/debug/api/ai/stopwords.txt
+++ b/Blogger/build/debug/api/ai/stopwords.txt
@ -0,0 +1,151 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
--- a/Blogger/build/debug/api/ai/test.lua
+++ b/Blogger/build/debug/api/ai/test.lua
@ -0,0 +1,50 @@
+-- Scratch script for the clustering helpers: with `refresh` set it rebuilds
+-- the similarity chart and saves it to `file`; otherwise it reads the saved
+-- chart back and prints it as JSON.
+local path = require("fs/vfs").ospath("home://aiws/blog-clustering")
+local gettext = loadfile(path.."/gettext.lua")()
+local cluster = loadfile(path.."/cluster.lua")()
+
+local refresh = false
+
+local file = "/home/mrsang/test.csv"
+if refresh then
+    local data = gettext.get({publish=1})
+    local documents = {}
+    if data then
+        local sw = gettext.stopwords("home://aiws/blog-clustering/stopwords.txt")
+        for k,v in pairs(data) do
+            local bag = cluster.bow(data[k].content, sw)
+            documents[data[k].id] = bag
+        end
+        cluster.tfidf(documents)
+        local vectors = cluster.get_vectors(documents)
+        local s = cluster.save_topchart(vectors,file, 3)
+        if s then echo("file saved") else echo("error save file") end
+    else
+        echo("Data missing")
+    end
+else
+    local f = io.open(file,"r")
+    if not f then
+        -- the chart has not been saved yet, or the file is unreadable
+        echo("Unable to open "..file)
+    else
+        local result = {}
+        for line in f:lines() do
+            -- each line is "<post id> <similar post id> <score>"; lines with
+            -- fewer than three fields are skipped instead of indexing with nil
+            local pid, sid, score = line:match("^%s*(%S+)%s+(%S+)%s+(%S+)")
+            if pid then
+                if not result[pid] then result[pid] = {} end
+                result[pid][sid] = tonumber(score)
+            end
+        end
+        f:close()
+        echo(JSON.encode(result))
+    end
+end
--- a/Blogger/build/debug/api/sendmail.lua
+++ b/Blogger/build/debug/api/sendmail.lua
--- a/Blogger/build/debug/main.js
+++ b/Blogger/build/debug/main.js
--- a/Blogger/build/debug/package.json
+++ b/Blogger/build/debug/package.json
@ -6,7 +6,7 @@
        "author": "Xuan Sang LE",
        "email": "xsang.le@gmail.com"
    },
-    "version": "0.2.10-a",
+    "version": "0.2.11-a",
    "category": "Internet",
    "iconclass": "fa fa-book",
    "dependencies": [
--- a/Blogger/build/release/Blogger.zip
+++ b/Blogger/build/release/Blogger.zip
--- a/Blogger/dialogs.ts
+++ b/Blogger/dialogs.ts
@ -212,7 +212,7 @@ namespace OS {
                        if (emails.length === 0) { return this.notify(__("No email selected")); }
                        // send the email
                        const data = {
-                            path: `${this.meta().path}/sendmail.lua`,
+                            path: `${this.meta().path}/api/sendmail.lua`,
                            parameters: {
                                to: emails,
                                title: (this.find("mail-title") as HTMLInputElement).value,
@ -222,7 +222,7 @@ namespace OS {
                            }
                        };
                        return this._api.apigateway(data, false)
-                            .then((d: { error: any; result: { join: (arg0: string) => any; }; }) => {
+                            .then((d) => {
                                if (d.error) {
                                    const str = d.result.join(',');
                                    return this.notify(__("Unable to send mail to: {0}", str)); }
--- a/Blogger/main.ts
+++ b/Blogger/main.ts
@ -499,6 +499,36 @@ namespace OS {
                                    this.error(__("Error sending mails: {0}", e.toString()), e);
                                }
                            }
+                        },
+                        "|",
+                        {
+                            name: __("TFIDF analyse"),
+                            className: "fa fa-area-chart",
+                            // Ask how many related posts to keep, then run the
+                            // server-side TF-IDF analysis on the current database.
+                            action: async (e: any) => {
+                                try {
+                                    const q = await this.openDialog("PromptDialog",{
+                                        title: __("TFIDF Analyse"),
+                                        text: __("Max number of related posts to keep per post?"),
+                                        value: "5"
+                                    });
+                                    // parse as base 10 and reject anything that is not a
+                                    // positive integer: NaN would be serialized as null
+                                    // and break the server-side script
+                                    const top = parseInt(q, 10);
+                                    if (isNaN(top) || top <= 0) {
+                                        throw new Error(`Invalid number of related posts: ${q}`);
+                                    }
+                                    const data = {
+                                        path: `${this.meta().path}/api/ai/analyse.lua`,
+                                        parameters: {
+                                            dbpath: this.dbhandle.info.file.path,
+                                            top: top
+                                        }
+                                    };
+                                    const d = await this._api.apigateway(data, false);
+                                    if (d.error) {
+                                        throw new Error(d.error);
+                                    }
+                                    this.toast(d.result);
+                                }
+                                catch(e)
+                                {
+                                    this.error(__("Error analysing posts: {0}", e.toString()), e);
+                                }
+                            }
                        }
                    ]
                });
--- a/Blogger/package.json
+++ b/Blogger/package.json
@ -6,7 +6,7 @@
        "author": "Xuan Sang LE",
        "email": "xsang.le@gmail.com"
    },
-    "version": "0.2.10-a",
+    "version": "0.2.11-a",
    "category": "Internet",
    "iconclass": "fa fa-book",
    "dependencies": [