move ai scripts to blog source

2025-07-24 01:29:48 +02:00 · 2022-02-10 09:57:21 +01:00
parent e35e8852b9
commit 87e8ba94b4
6 changed files with 576 additions and 1 deletions
--- a/blog/ai/cluster.lua
+++ b/blog/ai/cluster.lua
@ -0,0 +1,345 @@
 local doclassify = {}
 local st = require("stmr")
 doclassify.bow = function(data, stopwords)
    -- first step get a table of worlds that contain
    -- world: occurences
    local bag = {}
    for w in data:gmatch('%w+') do
        local word = w:lower()
        if not stopwords[word] then
            word = st.stmr(word)
            if bag[word] then
                bag[word].count = bag[word].count + 1
            else
                bag[word] = {count=0, tf=0, tfidf=0.0}
                bag[word].count = 1
            end
        end
    end
    -- now calculate the tf of the bag
    for k,v in pairs(bag) do
        bag[k].tf = math.log(1 + bag[k].count)
    end
    return bag
 end
 doclassify.len = function(table)
    local cnt = 0
    for k,v in pairs(table) do cnt = cnt+1 end
    return cnt
 end
 doclassify.tfidf = function(documents)
    -- now for each term in a bag, calculate 
    -- the inverse document frequency, which 
    -- is a measure of how much information 
    -- the word provides, that is, whether the
    -- term is common or rare across all documents
    local ndoc = doclassify.len(documents)
    for k,bag in pairs(documents) do
        -- for eacht term in bag
        -- calculate its idf across all documents
        for term,b in pairs(bag) do
            local n = 0
            for id,doc in pairs(documents) do
                if doc[term] then n = n+1 end
            end
            --echo("term:"..term.." appears in"..n.." documents")
            b.tfidf = b.tf*math.log(ndoc/n)
        end
    end
 end
 doclassify.search = function(term, documents)
    local r = {}
    for id, doc in pairs(documents) do
        if doc[term:lower()] then
            r[id] = doc[term].tfidf
        end
    end
    return r
 end
 doclassify.get_vectors = function(documents)
    -- get a list of vector from documents
    local index = 0
    local vectors = {}
    local maps = {}
    local terms = {}
    local maxv = 0
    for id in pairs(documents) do
        maps[id] = {}
        vectors[id] = {}
    end 
    -- first loop, get the term
    for id, doc in pairs(documents) do
        for k,v in pairs(doc) do
            -- get max value
            if v.tfidf > maxv then
                maxv = v.tfidf
            end
            -- get the term
            if not terms[k] then
                index = index + 1
                terms[k] = index
            end
            for pid in pairs(documents) do
                if not maps[pid][k] then
                    if id == pid then
                        maps[pid][k] = v.tfidf
                    else
                        maps[pid][k] = 0
                    end
                else
                    if maps[pid][k] == 0 and id == pid then
                        maps[pid][k] = v.tfidf
                    end
                end
            end
        end
    end
    -- reindexing the vectors
    for id in pairs(documents) do
        for k,v in pairs(maps[id]) do
            vectors[id][terms[k]] = v
        end
    end
    --echo("Max tfidf "..maxv.." in document #"..maxid.." of term "..term)
    return vectors, maxv, index, terms
 end
 doclassify.similarity = function(va, vb)
    -- using cosin similarity
    local dotp = 0
    local maga = 0
    local magb = 0
    for k = 1,#va do
        dotp = dotp + va[k]*vb[k]
        maga = maga + va[k]*va[k]
        magb = magb + vb[k]*vb[k]
    end
    maga = math.sqrt(maga)
    magb = math.sqrt(magb)
    local d  = 0
    if maga ~= 0 and magb ~= 0 then
        d = dotp/ (magb*maga)
    end
    return d
 end
 doclassify.similarities = function(v1, collection)
    local similarities = {}
    assert(#v1 == #(collection[1]), "Incorrect vectors size")
    for i=1,#collection do
        similarities[i] = doclassify.similarity(v1, collection[i])
    end
    return similarities
 end
 doclassify.mean_similarity = function(v1, v2)
    assert(#v1 == #v2, "Incorrect vectors size")
    local similarities = {}
    for i = 1,#v1 do similarities[i] = doclassify.similarity(v1[i], v2[i]) end
    return doclassify.mean(similarities)
 end
 doclassify.similarity_chart = function(id, vectors)
    local vs = {}
    local cnt = 0
    local lut = {}
    for k,v in pairs(vectors) do
        if k ~= id then
            cnt = cnt + 1
            vs[cnt] = v
            lut[cnt] = k
        end
    end
    if not vs[1] then return {} end
    return doclassify.similarities(vectors[id], vs), lut
 end
 doclassify.top_similarity = function(id, vectors, n, th)
    local chart,lut = doclassify.similarity_chart(id,vectors)
    --echo(JSON.encode(chart))
    --echo(JSON.encode(lut))
    if not lut or #lut <= 0 then return nil end
    local top = {}
    local j=0
    local goon = true
    if not th then
        goon = false
    end
    while j < n or goon
    do
        local i,maxv = doclassify.argmax(chart)
        top[lut[i]] = maxv
        chart[i] = 0.0
        j=j+1
        if maxv < th and goon then
            goon = false
        end
    end
    --for j=1,n do
    --    local i,maxv = doclassify.argmax(chart)
    --    top[lut[i]] = maxv
    --    chart[i] = 0.0
    --end
    return top
 end
 doclassify.save_vectors = function(vectors, name)
    local f = io.open(name,"w")
    if f == nil then return false end
    for id, v in pairs(vectors) do
        f:write(id)
        for i=1,#v do f:write(","..v[i]) end
        f:write("\n")
    end
    f:close()
    return true
 end
 doclassify.save_topchart = function(vectors, name,n)
    local f = io.open(name,"w")
    if f == nil then return false end
    for k,v in pairs(vectors) do 
        local top = doclassify.top_similarity(k,vectors,n, 0.1)
        for a,b in pairs(top) do
            f:write(k.." "..a.." "..b.."\n")
        end
    end
    f:close()
    return true
 end
 doclassify.kmean = function(nclass, documents, maxstep, ids)
    -- now 
    local vectors, maxv, size = doclassify.get_vectors(documents)
    -- random centroids
    local centroids = {}
    local old_centroids = {}
    local clusters = {}
    --for pid in pairs(documents) do clusters[pid] = 0 end
    -- add noise to mean_vector
    for i = 1,nclass do
        if ids == nil then
            centroids[i] = doclassify.random(size,math.floor(maxv))
        else
            centroids[i] = vectors[ids[i]]
        end
        old_centroids[i] = doclassify.zeros(size)
    end
    -- loop until convergence or maxstep reached
    local similarity = doclassify.mean_similarity(centroids, old_centroids)
    local step = maxstep
    while 1.0-similarity > 1e-9 and step > 0 do
        clusters = {}
        --echo(JSON.encode(centroids))
        for id,v in pairs(vectors) do
            local similarities = doclassify.similarities(v, centroids)
            --echo(JSON.encode(similarities))
            local cluster, maxvalue = doclassify.argmax(similarities)
            --echo("doc #"..id.." is in clusters #"..cluster.." max value is "..maxvalue)
            clusters[id] = cluster
        end
        -- storing the old centroids
        old_centroids = centroids
        -- calculate new centroids
        local new_centroids = {}
        for class in pairs(centroids) do
            local cnt = 0
            local cvectors = {}
            for id,v in pairs(vectors) do
                if clusters[id] == class then
                    cnt = cnt + 1
                    cvectors[cnt] = v
                end
            end
            new_centroids[class] = doclassify.mean_vector(cvectors, size)
        end
        centroids = new_centroids
        --echo(JSON.encode(centroids))
        --echo(JSON.encode(old_centroids))
        similarity = doclassify.mean_similarity(centroids, old_centroids)
        echo("step #"..step..", similarity "..similarity)
        step = step - 1
    end
    local results = {}
    for i = 1,nclass do
        local list = {}
        local cnt = 0
        for id,c in pairs(clusters) do
            if c == i then
                cnt = cnt + 1
                list[cnt] = id
            end
        end
        results[i] = list
    end
    return results, clusters, centroids
 end
 doclassify.zeros = function(n)
    local vector = {}
    for i = 1,n do vector[i] = 0.0 end
    return vector
 end
 doclassify.random = function(n,maxv)
    local vector = {}
    for i=1,n do
        vector[i] = math.random() + math.random(0, maxv)
    end
    return vector
 end
 doclassify.sum = function(v)
    local sum  = 0.0
    for i=1,#v do sum = sum + v[i] end
    return sum
 end
 doclassify.mean = function(v)
    return doclassify.sum(v)/#v
 end
 doclassify.mean_vector = function(vectors, size)
    local means = doclassify.zeros(size)
    if not vectors or #vectors == 0 then return means end
    --local size = #(vectors[1])
    local times = 0
    for k,v in pairs(vectors) do
        for i=1,#v do means[i] = means[i] + v[i] end
        times = times + 1
    end
    for i = 1,size do means[i] = means[i]/times end
    return means
 end
 doclassify.argmin = function(v)
    local minv = 0.0
    local mini = 0.0
    for i = 1,#v do
        if v[i] <= minv then
            mini = i
            minv = v[i]
        end
    end
    --echo("min index"..mini.." val "..minv)
    return mini, minv
 end
 doclassify.argmax = function(v)
    local maxv = 0.0
    local maxi = 0.0
    for i = 1,#v do
        if v[i] >= maxv then
            maxi = i
            maxv = v[i]
        end
    end
    return maxi,maxv
 end
 return doclassify
--- a/blog/ai/gettext.lua
+++ b/blog/ai/gettext.lua
@ -0,0 +1,29 @@
 local gettext = {}
 require("sqlite")
 gettext.get = function(q)
    local db = require("os.libs.dbmodel").get("mrsang","blogs",nil)
    if not db then return nil end
    local exp = {["="] =q}
     local cond = { 
        exp = exp, 
        fields = {"id", "content"}
    }
    local data, sort = db:find(cond)
    db:close()
    if not data or #data == 0 then return nil end
    --for k,v in pairs(data) do
    --    data[k].content = bytes.__tostring(std.b64decode(data[k].content)):gsub("%%","%%%%")
    --end
    return data 
 end
 gettext.stopwords = function(ospath)
    --local ospath = require("fs/vfs").ospath(path)
    local words = {}
    for line in io.lines(ospath) do
        words[line] = true
    end
    return words
 end
 return gettext
--- a/blog/ai/stopwords.txt
+++ b/blog/ai/stopwords.txt
@ -0,0 +1,151 @@
 i
 me
 my
 myself
 we
 our
 ours
 ourselves
 you
 your
 yours
 yourself
 yourselves
 he
 him
 his
 himself
 she
 her
 hers
 herself
 it
 its
 itself
 they
 them
 their
 theirs
 themselves
 what
 which
 who
 whom
 this
 that
 these
 those
 am
 is
 are
 was
 were
 be
 been
 being
 have
 has
 had
 having
 do
 does
 did
 doing
 a
 an
 the
 and
 but
 if
 or
 because
 as
 until
 while
 of
 at
 by
 for
 with
 about
 against
 between
 into
 through
 during
 before
 after
 above
 below
 to
 from
 up
 down
 in
 out
 on
 off
 over
 under
 again
 further
 then
 once
 here
 there
 when
 where
 why
 how
 all
 any
 both
 each
 few
 more
 most
 other
 some
 such
 no
 nor
 not
 only
 own
 same
 so
 than
 too
 very
 s
 t
 can
 will
 just
 don
 should
 now
 a
 b
 c
 d
 e
 f
 g
 h
 i
 j
 k
 l
 m
 n
 o
 p
 q
 w
 r
 s
 t
 x
 y
 z
--- a/blog/ai/test.lua
+++ b/blog/ai/test.lua
@ -0,0 +1,50 @@
 local path = require("fs/vfs").ospath("home://aiws/blog-clustering")
 local gettext = loadfile(path.."/gettext.lua")()
 local cluster = loadfile(path.."/cluster.lua")()
 local refresh = false
 local file = "/home/mrsang/test.csv"
 if refresh then
    local data = gettext.get({publish=1})
    local documents = {}
    if data then
        local sw = gettext.stopwords("home://aiws/blog-clustering/stopwords.txt")
        for k,v in pairs(data) do
            local bag = cluster.bow(data[k].content, sw)
            documents[data[k].id] = bag
        end
        cluster.tfidf(documents)
        --local v = cluster.search("arm", documents)
        --echo(JSON.encode(v))
        local vectors, maxv, size = cluster.get_vectors(documents)
        local s = cluster.save_topchart(vectors,file, 3)
        if s then echo("file saved") else echo("error save file") end
        --echo(JSON.encode(r))
        --r = cluster.similarity(vectors["14"],vectors["16"])
        --echo("Similarity "..r)
        --local c,l = cluster.kmean(3, documents, 10)
        --echo(JSON.encode(c))
        --echo(JSON.encode(l))
    else
        echo("Data missing")
    end
 else
    local f = io.open(file,"r")
    local result = {}
    for line in f:lines() do
        local arr = {}
        local cnt = 0
        for i in line:gmatch( "%S+") do
           cnt = cnt + 1
           arr[cnt] = i
        end
        if not result[arr[1]] then result[arr[1]] = {} end
        result[arr[1]][arr[2]] = tonumber(arr[3])
    end
    f:close()
    echo(JSON.encode(result))
    --local r = cluster.top_similarity("2",vectors, 3)
    --echo(JSON.encode(r))
 end
--- a/blog/controllers/PostController.lua
+++ b/blog/controllers/PostController.lua
@ -217,7 +217,7 @@ function PostController:analyse(n)
    if not n then
        n = 5
    end
-    local path = "/home/mrsang/aiws/blog-clustering"
+    local path = WWW_ROOT."/ai"
    local gettext = loadfile(path .. "/gettext.lua")()
    local cluster = loadfile(path .. "/cluster.lua")()
    local data = gettext.get({publish = 1})
--- a/dist/antd_web_apps.tar.gz
+++ b/dist/antd_web_apps.tar.gz