move ai scripts to blog source

2025-07-25 18:19:48 +02:00 · 2022-02-10 09:57:21 +01:00
parent e35e8852b9
commit 87e8ba94b4
6 changed files with 576 additions and 1 deletions
--- a/blog/ai/cluster.lua
+++ b/blog/ai/cluster.lua
@ -0,0 +1,345 @@
+local doclassify = {}
+local st = require("stmr")
+--- Build a bag-of-words for a single document.
+-- Every alphanumeric run (%w+) of `data` is lower-cased, skipped when it is a
+-- key of `stopwords`, and otherwise reduced with st.stmr before being counted.
+-- @param data string holding the document text
+-- @param stopwords table whose keys are the words to ignore
+-- @return table mapping each stem to {count=, tf=, tfidf=}; tf is
+-- math.log(1 + count) and tfidf is left at 0.0 (doclassify.tfidf fills it in)
+doclassify.bow = function(data, stopwords)
+    local bag = {}
+    for token in data:gmatch('%w+') do
+        local lowered = token:lower()
+        if not stopwords[lowered] then
+            local stem = st.stmr(lowered)
+            local entry = bag[stem]
+            if entry == nil then
+                -- first occurrence of this stem
+                bag[stem] = {count=1, tf=0, tfidf=0.0}
+            else
+                entry.count = entry.count + 1
+            end
+        end
+    end
+    -- term frequency: tf = ln(1 + count)
+    for _, entry in pairs(bag) do
+        entry.tf = math.log(1 + entry.count)
+    end
+    return bag
+end
+--- Count the entries of a table.
+-- Every key visited by pairs() is counted, so this also works for tables
+-- that are not sequences (where the # operator is not reliable).
+-- @param t table to measure (renamed from `table`, which shadowed the
+-- standard table library inside this function)
+-- @return number of key/value pairs in t
+doclassify.len = function(t)
+    local cnt = 0
+    for _ in pairs(t) do cnt = cnt + 1 end
+    return cnt
+end
+--- Fill in the tfidf field of every term of every bag, in place.
+-- The inverse document frequency measures how much information a term
+-- provides, i.e. whether it is common or rare across all documents:
+--   tfidf = tf * ln(ndoc / df)
+-- where ndoc is the number of documents and df the number of documents whose
+-- bag contains the term.
+-- @param documents table mapping a document id to a bag as built by
+-- doclassify.bow
+-- @return nothing; the bags are modified in place
+doclassify.tfidf = function(documents)
+    -- count the documents and, in the same pass, the document frequency of
+    -- every term (done once instead of rescanning all documents per term)
+    local ndoc = 0
+    local df = {}
+    for _, bag in pairs(documents) do
+        ndoc = ndoc + 1
+        for term in pairs(bag) do
+            df[term] = (df[term] or 0) + 1
+        end
+    end
+    -- df[term] >= 1 for every term visited here, so the division is safe
+    for _, bag in pairs(documents) do
+        for term, b in pairs(bag) do
+            b.tfidf = b.tf*math.log(ndoc/df[term])
+        end
+    end
+end
+
+--- Look up a term in every document and collect its tfidf score.
+-- The term is lower-cased once and that same key is used both for the
+-- membership test and for reading the score (the previous code tested the
+-- lower-cased key but read the original one, failing on mixed-case input).
+-- NOTE(review): bags built by doclassify.bow are keyed by stemmed words, so
+-- callers presumably have to pass an already stemmed term -- confirm.
+-- @param term string to search for
+-- @param documents table mapping a document id to its bag
+-- @return table mapping the id of each document containing the term to the
+-- term's tfidf in that document
+doclassify.search = function(term, documents)
+    local key = term:lower()
+    local r = {}
+    for id, doc in pairs(documents) do
+        local entry = doc[key]
+        if entry then
+            r[id] = entry.tfidf
+        end
+    end
+    return r
+end
+
+--- Turn the bags of all documents into dense tfidf vectors.
+-- Every distinct term gets a 1-based position; a document's vector holds the
+-- term's tfidf at that position, or 0 when the document lacks the term.
+-- All vectors therefore share the same length.
+-- @param documents table mapping a document id to its bag (tfidf already
+-- computed)
+-- @return vectors table mapping a document id to its vector
+-- @return maxv largest tfidf value met (0 when nothing is larger)
+-- @return index number of distinct terms, i.e. the vector length
+-- @return terms table mapping a term to its position in the vectors
+doclassify.get_vectors = function(documents)
+    local index = 0
+    local terms = {}
+    local maxv = 0
+    -- first pass: number the terms and track the largest tfidf
+    for _, doc in pairs(documents) do
+        for term, v in pairs(doc) do
+            if v.tfidf > maxv then
+                maxv = v.tfidf
+            end
+            if not terms[term] then
+                index = index + 1
+                terms[term] = index
+            end
+        end
+    end
+    -- second pass: one zero-filled vector per document, then overwrite the
+    -- positions of the terms the document actually contains. This replaces
+    -- the former per-term loop over all documents with a linear fill.
+    local vectors = {}
+    for id, doc in pairs(documents) do
+        local vec = {}
+        for i = 1, index do vec[i] = 0 end
+        for term, v in pairs(doc) do
+            vec[terms[term]] = v.tfidf
+        end
+        vectors[id] = vec
+    end
+    return vectors, maxv, index, terms
+end
+
+--- Cosine similarity of two vectors.
+-- Iterates over the positions 1..#va, so vb must provide a number at each
+-- of them.
+-- @param va first vector (sequence of numbers)
+-- @param vb second vector
+-- @return dot(va, vb) / (|va| * |vb|), or 0 when either magnitude is zero
+doclassify.similarity = function(va, vb)
+    local dot, norm_a, norm_b = 0, 0, 0
+    for i = 1,#va do
+        local a, b = va[i], vb[i]
+        dot = dot + a*b
+        norm_a = norm_a + a*a
+        norm_b = norm_b + b*b
+    end
+    norm_a = math.sqrt(norm_a)
+    norm_b = math.sqrt(norm_b)
+    -- a zero vector has no direction: report no similarity
+    if norm_a == 0 or norm_b == 0 then
+        return 0
+    end
+    return dot/ (norm_b*norm_a)
+end
+--- Cosine similarity of one vector against each vector of a collection.
+-- Raises "Incorrect vectors size" when v1 and the first vector of the
+-- collection differ in length.
+-- @param v1 reference vector
+-- @param collection sequence of vectors
+-- @return sequence whose i-th value is the similarity of v1 and collection[i]
+doclassify.similarities = function(v1, collection)
+    assert(#v1 == #(collection[1]), "Incorrect vectors size")
+    local scores = {}
+    for idx = 1,#collection do
+        local other = collection[idx]
+        scores[idx] = doclassify.similarity(v1, other)
+    end
+    return scores
+end
+
+--- Average pairwise cosine similarity of two equally long lists of vectors.
+-- Raises "Incorrect vectors size" when the two lists differ in length.
+-- @param v1 sequence of vectors
+-- @param v2 sequence of vectors, compared position by position with v1
+-- @return mean of similarity(v1[i], v2[i]) over all positions
+doclassify.mean_similarity = function(v1, v2)
+    assert(#v1 == #v2, "Incorrect vectors size")
+    local pairwise = {}
+    for idx = 1,#v1 do
+        pairwise[idx] = doclassify.similarity(v1[idx], v2[idx])
+    end
+    local average = doclassify.mean(pairwise)
+    return average
+end
+--- Similarity of one vector against all the other vectors.
+-- @param id key of the reference vector inside `vectors`
+-- @param vectors table mapping an id to a vector
+-- @return chart sequence of similarities, one per other vector; an empty
+-- table (and no second value) when there is no other vector
+-- @return lut sequence mapping a chart position back to the vector id
+doclassify.similarity_chart = function(id, vectors)
+    local others, lut = {}, {}
+    for key, vec in pairs(vectors) do
+        if key ~= id then
+            local slot = #lut + 1
+            others[slot] = vec
+            lut[slot] = key
+        end
+    end
+    -- nothing to compare with
+    if not others[1] then return {} end
+    return doclassify.similarities(vectors[id], others), lut
+end
+
+--- Pick the vectors most similar to a given one.
+-- The n best candidates are always returned (fewer when fewer exist). When a
+-- threshold is given, further candidates are added for as long as their
+-- similarity is at least `th`.
+--
+-- Differences from the previous loop-with-argmax implementation:
+--  * `th` may be omitted (it used to be compared against nil and crash);
+--  * asking for more entries than there are candidates no longer re-picks an
+--    already consumed slot and overwrites its score with 0;
+--  * a threshold <= 0 can no longer loop forever;
+--  * the first candidate found below the threshold (beyond the n best) is no
+--    longer included in the result.
+-- @param id key of the reference vector inside `vectors`
+-- @param vectors table mapping an id to a vector
+-- @param n number of entries that are returned regardless of their score
+-- @param th optional similarity threshold for entries beyond the n best
+-- @return table mapping a vector id to its similarity with `id`, or nil when
+-- there is no other vector to compare with
+doclassify.top_similarity = function(id, vectors, n, th)
+    local chart,lut = doclassify.similarity_chart(id,vectors)
+    if not lut or #lut <= 0 then return nil end
+    -- rank the chart positions by decreasing similarity; ties are broken by
+    -- position so the comparator stays a strict ordering
+    local order = {}
+    for i = 1,#chart do order[i] = i end
+    table.sort(order, function(a, b)
+        if chart[a] ~= chart[b] then return chart[a] > chart[b] end
+        return a < b
+    end)
+    local top = {}
+    for rank, i in ipairs(order) do
+        local value = chart[i]
+        -- past the n guaranteed entries, only keep going above the threshold
+        if rank > n and (th == nil or value < th) then break end
+        top[lut[i]] = value
+    end
+    return top
+end
+--- Write the vectors to a text file, one line per vector.
+-- Each line is the vector id followed by its components, every component
+-- preceded by a comma.
+-- @param vectors table mapping an id to a vector
+-- @param name path of the file to (over)write
+-- @return true on success, false when the file cannot be opened
+doclassify.save_vectors = function(vectors, name)
+    local out = io.open(name,"w")
+    if not out then return false end
+    for id, vec in pairs(vectors) do
+        out:write(id)
+        local cells = {}
+        for i = 1,#vec do cells[i] = ","..vec[i] end
+        out:write(table.concat(cells))
+        out:write("\n")
+    end
+    out:close()
+    return true
+end
+--- Write, for every vector, its most similar vectors to a text file.
+-- Each output line has the form "<id> <other id> <similarity>".
+-- @param vectors table mapping an id to a vector
+-- @param name path of the file to (over)write
+-- @param n number of best matches requested per vector
+-- @param th optional similarity threshold handed to top_similarity;
+-- defaults to 0.1, the value that used to be hard-coded here
+-- @return true on success, false when the file cannot be opened
+doclassify.save_topchart = function(vectors, name, n, th)
+    if th == nil then th = 0.1 end
+    local f = io.open(name,"w")
+    if f == nil then return false end
+    for k in pairs(vectors) do
+        local top = doclassify.top_similarity(k,vectors,n, th)
+        -- top_similarity returns nil when k has nothing to be compared with
+        -- (e.g. a single vector); iterating nil used to raise an error
+        if top then
+            for a,b in pairs(top) do
+                f:write(k.." "..a.." "..b.."\n")
+            end
+        end
+    end
+    f:close()
+    return true
+end
+--- Cluster documents with a k-means style loop driven by cosine similarity.
+-- Documents are first converted to tfidf vectors with get_vectors. Each step
+-- assigns every document to the centroid it is most similar to, then
+-- replaces every centroid by the mean vector of its members. The loop ends
+-- when the mean similarity between the new and the previous centroids is
+-- within 1e-9 of 1, or after `maxstep` steps.
+-- @param nclass number of clusters
+-- @param documents table mapping a document id to its bag
+-- @param maxstep maximum number of steps
+-- @param ids optional sequence of document ids whose vectors are used as the
+-- initial centroids; random centroids are generated when it is nil
+-- @return results sequence where results[i] lists the ids of cluster i
+-- @return clusters table mapping a document id to its cluster number
+-- @return centroids sequence of the final centroid vectors
+doclassify.kmean = function(nclass, documents, maxstep, ids)
+    -- vectorize the documents; size is the common vector length
+    local vectors, maxv, size = doclassify.get_vectors(documents)
+    -- random centroids
+    local centroids = {}
+    local old_centroids = {}
+    local clusters = {}
+    --for pid in pairs(documents) do clusters[pid] = 0 end
+    -- initial centroids: random vectors, or the vectors of the given ids
+    for i = 1,nclass do
+        if ids == nil then
+            centroids[i] = doclassify.random(size,math.floor(maxv))
+        else
+            centroids[i] = vectors[ids[i]]
+        end
+        -- the previous centroids start as zero vectors; similarity() returns 0
+        -- for a zero vector, so the initial mean similarity is 0
+        old_centroids[i] = doclassify.zeros(size)
+    end
+    
+    -- loop until convergence or maxstep reached
+    local similarity = doclassify.mean_similarity(centroids, old_centroids)
+    local step = maxstep
+    while 1.0-similarity > 1e-9 and step > 0 do
+        clusters = {}
+        --echo(JSON.encode(centroids))
+        -- assignment: each document joins the centroid with the highest
+        -- cosine similarity
+        for id,v in pairs(vectors) do
+            local similarities = doclassify.similarities(v, centroids)
+            --echo(JSON.encode(similarities))
+            local cluster, maxvalue = doclassify.argmax(similarities)
+            --echo("doc #"..id.." is in clusters #"..cluster.." max value is "..maxvalue)
+            clusters[id] = cluster
+        end
+        -- storing the old centroids
+        old_centroids = centroids
+        -- calculate new centroids
+        local new_centroids = {}
+        for class in pairs(centroids) do
+            -- gather the vectors assigned to this class
+            local cnt = 0
+            local cvectors = {}
+            for id,v in pairs(vectors) do
+                if clusters[id] == class then
+                    cnt = cnt + 1
+                    cvectors[cnt] = v
+                end
+            end
+            -- mean_vector returns a zero vector for a class without members
+            -- NOTE(review): such a zero centroid has similarity 0 with its
+            -- predecessor, which appears to keep the mean below the
+            -- convergence bound until maxstep is used up -- confirm
+            new_centroids[class] = doclassify.mean_vector(cvectors, size)
+        end
+        centroids = new_centroids
+        --echo(JSON.encode(centroids))
+        --echo(JSON.encode(old_centroids))
+        similarity = doclassify.mean_similarity(centroids, old_centroids)
+        -- echo is not defined in this file; presumably a global provided by
+        -- the hosting environment. step counts down from maxstep.
+        echo("step #"..step..", similarity "..similarity)
+        step = step - 1
+    end
+    -- regroup the document ids per cluster number
+    local results = {}
+    for i = 1,nclass do
+        local list = {}
+        local cnt = 0
+        for id,c in pairs(clusters) do
+            if c == i then
+                cnt = cnt + 1
+                list[cnt] = id
+            end
+        end
+        results[i] = list
+    end
+    return results, clusters, centroids
+end
+
+--- Create a vector filled with zeros.
+-- @param n length of the vector
+-- @return sequence of n values, each 0.0
+doclassify.zeros = function(n)
+    local out = {}
+    for pos = 1,n do
+        out[pos] = 0.0
+    end
+    return out
+end
+
+--- Create a vector of random numbers.
+-- Each component is the sum of math.random() and math.random(0, maxv).
+-- @param n length of the vector
+-- @param maxv upper bound handed to math.random(0, maxv)
+-- @return sequence of n random numbers
+doclassify.random = function(n,maxv)
+    local out = {}
+    for pos = 1,n do
+        -- fractional part first, then the integer part, as before
+        local frac = math.random()
+        local whole = math.random(0, maxv)
+        out[pos] = frac + whole
+    end
+    return out
+end
+
+--- Sum the components of a vector.
+-- @param v sequence of numbers
+-- @return the total, 0.0 for an empty sequence
+doclassify.sum = function(v)
+    local total = 0.0
+    for pos = 1,#v do
+        total = total + v[pos]
+    end
+    return total
+end
+
+--- Arithmetic mean of the components of a vector.
+-- @param v sequence of numbers
+-- @return doclassify.sum(v) divided by #v
+doclassify.mean = function(v)
+    local total = doclassify.sum(v)
+    return total/#v
+end
+
+--- Component-wise mean of a list of vectors.
+-- @param vectors sequence of vectors; may be nil or empty
+-- @param size length of the resulting vector
+-- @return vector of `size` components; all zeros when `vectors` is nil or
+-- empty
+doclassify.mean_vector = function(vectors, size)
+    local means = doclassify.zeros(size)
+    if not vectors or #vectors == 0 then
+        return means
+    end
+    -- accumulate every vector, counting how many were added
+    local total = 0
+    for _, vec in pairs(vectors) do
+        total = total + 1
+        for pos = 1,#vec do
+            means[pos] = means[pos] + vec[pos]
+        end
+    end
+    for pos = 1,size do
+        means[pos] = means[pos]/total
+    end
+    return means
+end
+
+--- Position and value of the smallest component of a vector.
+-- The running minimum starts at +infinity; it used to start at 0.0, which
+-- made every vector of strictly positive numbers report position 0.
+-- Among equal minima the last position wins (comparison is <=).
+-- @param v sequence of numbers
+-- @return position of the minimum, or 0 when no component qualified (e.g.
+-- empty vector)
+-- @return the minimum value, or 0.0 when no component qualified
+doclassify.argmin = function(v)
+    local mini, minv = 0, math.huge
+    for i = 1,#v do
+        if v[i] <= minv then
+            mini = i
+            minv = v[i]
+        end
+    end
+    -- keep the former "nothing found" result
+    if mini == 0 then return 0, 0.0 end
+    return mini, minv
+end
+
+--- Position and value of the largest component of a vector.
+-- The running maximum starts at -infinity; it used to start at 0.0, which
+-- made every vector of strictly negative numbers report position 0.
+-- Results for vectors of non-negative numbers are unchanged. Among equal
+-- maxima the last position wins (comparison is >=).
+-- @param v sequence of numbers
+-- @return position of the maximum, or 0 when no component qualified (e.g.
+-- empty vector)
+-- @return the maximum value, or 0.0 when no component qualified
+doclassify.argmax = function(v)
+    local maxi, maxv = 0, -math.huge
+    for i = 1,#v do
+        if v[i] >= maxv then
+            maxi = i
+            maxv = v[i]
+        end
+    end
+    -- keep the former "nothing found" result
+    if maxi == 0 then return 0, 0.0 end
+    return maxi,maxv
+end
+
+return doclassify
--- a/blog/ai/gettext.lua
+++ b/blog/ai/gettext.lua
@ -0,0 +1,29 @@
+local gettext = {}
+require("sqlite")
+--- Query the blog records matching a condition.
+-- Opens a model through os.libs.dbmodel with the arguments
+-- ("mrsang", "blogs", nil), asks it for the "id" and "content" fields of the
+-- records matching `q`, then closes it.
+-- NOTE(review): presumably "mrsang" names the database and "blogs" the
+-- table -- confirm against os.libs.dbmodel.
+-- @param q table used as the "=" expression of the query, e.g. {publish=1}
+-- @return the list returned by db:find, or nil when the model cannot be
+-- opened or the list is missing or empty
+gettext.get = function(q)
+    local db = require("os.libs.dbmodel").get("mrsang","blogs",nil)
+    if not db then return nil end
+    -- only the first value returned by find is used
+    local data = db:find({
+        exp = {["="] =q},
+        fields = {"id", "content"}
+    })
+    db:close()
+    if not data or #data == 0 then
+        return nil
+    end
+    --for k,v in pairs(data) do
+    --    data[k].content = bytes.__tostring(std.b64decode(data[k].content)):gsub("%%","%%%%")
+    --end
+    return data
+end
+
+--- Load a stop-word list from a text file holding one word per line.
+-- Surrounding whitespace (including the "\r" left by CRLF line endings) is
+-- stripped and blank lines are skipped, so that the keys match the bare
+-- tokens they are compared with.
+-- @param ospath path of the file as understood by io.lines (an OS path)
+-- @return table whose keys are the stop words, each mapped to true
+-- Raises an error (from io.lines) when the file cannot be opened.
+gettext.stopwords = function(ospath)
+    --local ospath = require("fs/vfs").ospath(path)
+    local words = {}
+    for line in io.lines(ospath) do
+        local word = line:match("^%s*(.-)%s*$")
+        if word ~= "" then
+            words[word] = true
+        end
+    end
+    return words
+end
+
+return gettext
--- a/blog/ai/stopwords.txt
+++ b/blog/ai/stopwords.txt
@ -0,0 +1,151 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+w
+r
+s
+t
+x
+y
+z
--- a/blog/ai/test.lua
+++ b/blog/ai/test.lua
@ -0,0 +1,50 @@
+-- Manual test script for the blog clustering helpers.
+-- With refresh = true the top-similarity chart is recomputed from the
+-- published posts and saved to `file`; otherwise the saved chart is read
+-- back and echoed as JSON.
+-- echo and JSON are not defined here; presumably globals provided by the
+-- hosting environment.
+local path = require("fs/vfs").ospath("home://aiws/blog-clustering")
+-- assert surfaces the loadfile error message instead of "attempt to call nil"
+local gettext = assert(loadfile(path.."/gettext.lua"))()
+local cluster = assert(loadfile(path.."/cluster.lua"))()
+
+local refresh = false
+
+local file = "/home/mrsang/test.csv"
+if refresh then
+    local data = gettext.get({publish=1})
+    local documents = {}
+    if data then
+        -- gettext.stopwords hands its argument to io.lines, so it needs the
+        -- OS path; the "home://" VFS path used here before cannot be opened
+        local sw = gettext.stopwords(path.."/stopwords.txt")
+        for _,post in pairs(data) do
+            documents[post.id] = cluster.bow(post.content, sw)
+        end
+        cluster.tfidf(documents)
+        --local v = cluster.search("arm", documents)
+        --echo(JSON.encode(v))
+        local vectors = cluster.get_vectors(documents)
+        local s = cluster.save_topchart(vectors,file, 3)
+        if s then echo("file saved") else echo("error save file") end
+        --echo(JSON.encode(r))
+        --r = cluster.similarity(vectors["14"],vectors["16"])
+        --echo("Similarity "..r)
+        
+        --local c,l = cluster.kmean(3, documents, 10)
+        --echo(JSON.encode(c))
+        --echo(JSON.encode(l))
+    else
+        echo("Data missing")
+    end
+else
+    local f = io.open(file,"r")
+    if not f then
+        -- the chart has not been generated yet (or cannot be read)
+        echo("cannot open "..file)
+    else
+        local result = {}
+        for line in f:lines() do
+            -- each line reads "<id> <other id> <similarity>"
+            local arr = {}
+            local cnt = 0
+            for i in line:gmatch( "%S+") do
+               cnt = cnt + 1
+               arr[cnt] = i
+            end
+            -- skip blank or truncated lines instead of indexing with nil
+            if arr[1] and arr[2] then
+                if not result[arr[1]] then result[arr[1]] = {} end
+                result[arr[1]][arr[2]] = tonumber(arr[3])
+            end
+        end
+        f:close()
+        echo(JSON.encode(result))
+    end
+    --local r = cluster.top_similarity("2",vectors, 3)
+    --echo(JSON.encode(r))
+end
--- a/blog/controllers/PostController.lua
+++ b/blog/controllers/PostController.lua
@ -217,7 +217,7 @@ function PostController:analyse(n)
    if not n then
        n = 5
    end
-    local path = "/home/mrsang/aiws/blog-clustering"
+    -- directory "ai" under WWW_ROOT; Lua concatenates with "..", a single
+    -- "." followed by a string literal does not compile
+    local path = WWW_ROOT.."/ai"
    local gettext = loadfile(path .. "/gettext.lua")()
    local cluster = loadfile(path .. "/cluster.lua")()
    local data = gettext.get({publish = 1})
--- a/dist/antd_web_apps.tar.gz
+++ b/dist/antd_web_apps.tar.gz