diff --git a/blog/ai/cluster.lua b/blog/ai/cluster.lua
new file mode 100644
index 0000000..8e7ac33
--- /dev/null
+++ b/blog/ai/cluster.lua
@@ -0,0 +1,416 @@
+--- Bag-of-words / TF-IDF vectors, cosine similarity and k-means
+-- clustering of text documents.
+local doclassify = {}
+local st = require("stmr")
+
+--- Build the bag of words of one document.
+-- Every alphanumeric run is lower-cased, skipped when it is a stop word and
+-- passed through st.stmr (presumably a stemmer) otherwise.
+-- @param data document text
+-- @param stopwords set of words to ignore ({word = true})
+-- @return table mapping word -> {count, tf, tfidf}, with tf = log(1 + count)
+-- and tfidf left at 0.0
+doclassify.bow = function(data, stopwords)
+    local bag = {}
+    for w in data:gmatch('%w+') do
+        local word = w:lower()
+        if not stopwords[word] then
+            word = st.stmr(word)
+            if bag[word] then
+                bag[word].count = bag[word].count + 1
+            else
+                bag[word] = {count = 1, tf = 0, tfidf = 0.0}
+            end
+        end
+    end
+    -- term frequency, dampened logarithmically
+    for _, entry in pairs(bag) do
+        entry.tf = math.log(1 + entry.count)
+    end
+    return bag
+end
+
+--- Count the entries of a table, sequence or not.
+-- @param table table to measure
+-- @return number of key/value pairs
+doclassify.len = function(table)
+    local cnt = 0
+    for _ in pairs(table) do cnt = cnt + 1 end
+    return cnt
+end
+
+--- Fill in the tf-idf weight of every term of every bag, in place.
+-- idf = log(ndoc / df), df being the number of documents that contain the
+-- term, so a term present in every document weighs 0.
+-- @param documents table mapping document id -> bag (see doclassify.bow)
+doclassify.tfidf = function(documents)
+    local ndoc = doclassify.len(documents)
+    -- count the documents containing each term once, instead of rescanning
+    -- the whole collection for every term of every bag
+    local df = {}
+    for _, bag in pairs(documents) do
+        for term in pairs(bag) do
+            df[term] = (df[term] or 0) + 1
+        end
+    end
+    for _, bag in pairs(documents) do
+        for term, entry in pairs(bag) do
+            entry.tfidf = entry.tf * math.log(ndoc / df[term])
+        end
+    end
+end
+
+--- Look a term up in every document.
+-- The term is lower-cased but not passed through st.stmr before the lookup.
+-- @param term word to look for
+-- @param documents table mapping document id -> bag
+-- @return table mapping document id -> tf-idf of the term, restricted to the
+-- documents that contain it
+doclassify.search = function(term, documents)
+    -- read the entry through the same lower-cased key that is tested
+    local key = term:lower()
+    local r = {}
+    for id, doc in pairs(documents) do
+        if doc[key] then
+            r[id] = doc[key].tfidf
+        end
+    end
+    return r
+end
+
+--- Turn the bags into dense tf-idf vectors sharing one term -> column map.
+-- @param documents table mapping document id -> bag with tf-idf filled in
+-- @return vectors table mapping document id -> vector (0 for absent terms)
+-- @return maxv largest tf-idf value met (0 when none is positive)
+-- @return index number of distinct terms, i.e. the vector size
+-- @return terms table mapping term -> column
+doclassify.get_vectors = function(documents)
+    local index = 0
+    local terms = {}
+    local columns = {}
+    local maxv = 0
+    -- first pass: give every distinct term a column, track the largest weight
+    for _, doc in pairs(documents) do
+        for term, entry in pairs(doc) do
+            if entry.tfidf > maxv then
+                maxv = entry.tfidf
+            end
+            if not terms[term] then
+                index = index + 1
+                terms[term] = index
+                columns[index] = term
+            end
+        end
+    end
+    -- second pass: one vector per document, filled column by column
+    local vectors = {}
+    for id, doc in pairs(documents) do
+        local vector = {}
+        for column = 1, index do
+            local entry = doc[columns[column]]
+            vector[column] = entry and entry.tfidf or 0
+        end
+        vectors[id] = vector
+    end
+    return vectors, maxv, index, terms
+end
+
+--- Cosine similarity of two vectors.
+-- @param va first vector; its length drives the loop
+-- @param vb second vector, at least as long as va
+-- @return cosine of the angle between va and vb, or 0 when either vector has
+-- zero magnitude
+doclassify.similarity = function(va, vb)
+    local dotp = 0
+    local maga = 0
+    local magb = 0
+    for k = 1, #va do
+        dotp = dotp + va[k] * vb[k]
+        maga = maga + va[k] * va[k]
+        magb = magb + vb[k] * vb[k]
+    end
+    maga = math.sqrt(maga)
+    magb = math.sqrt(magb)
+    if maga == 0 or magb == 0 then return 0 end
+    return dotp / (magb * maga)
+end
+
+--- Cosine similarity of one vector against each vector of a list.
+-- @param v1 reference vector
+-- @param collection array of vectors of the same size as v1
+-- @return array of similarities, in the order of collection
+doclassify.similarities = function(v1, collection)
+    local similarities = {}
+    assert(#v1 == #(collection[1]), "Incorrect vectors size")
+    for i = 1, #collection do
+        similarities[i] = doclassify.similarity(v1, collection[i])
+    end
+    return similarities
+end
+
+--- Mean of the pairwise similarities of two equally long lists of vectors.
+-- @param v1 array of vectors
+-- @param v2 array of vectors, v2[i] being compared with v1[i]
+-- @return mean cosine similarity
+doclassify.mean_similarity = function(v1, v2)
+    assert(#v1 == #v2, "Incorrect vectors size")
+    local similarities = {}
+    for i = 1, #v1 do
+        similarities[i] = doclassify.similarity(v1[i], v2[i])
+    end
+    return doclassify.mean(similarities)
+end
+
+--- Similarity of one document to every other document.
+-- @param id id of the reference document
+-- @param vectors table mapping document id -> vector
+-- @return array of similarities (an empty table when there is no other
+-- document)
+-- @return array giving the document id of each chart position (nil when
+-- there is no other document)
+doclassify.similarity_chart = function(id, vectors)
+    local vs = {}
+    local cnt = 0
+    local lut = {}
+    for k, v in pairs(vectors) do
+        if k ~= id then
+            cnt = cnt + 1
+            vs[cnt] = v
+            lut[cnt] = k
+        end
+    end
+    if not vs[1] then return {} end
+    return doclassify.similarities(vectors[id], vs), lut
+end
+
+--- Pick the documents most similar to one document.
+-- The n best matches are always kept (fewer when there are fewer other
+-- documents); when a threshold is given, further matches are kept as long as
+-- their similarity reaches it.
+-- @param id id of the reference document
+-- @param vectors table mapping document id -> vector
+-- @param n number of matches that are kept whatever their similarity
+-- @param th optional similarity threshold for matches beyond the first n
+-- @return table mapping document id -> similarity, or nil when there is no
+-- other document
+doclassify.top_similarity = function(id, vectors, n, th)
+    local chart, lut = doclassify.similarity_chart(id, vectors)
+    if not lut or #lut <= 0 then return nil end
+    -- Rank the chart positions once. Repeatedly taking argmax and zeroing it
+    -- could select a zeroed position again (overwriting a stored score) and
+    -- compared against th even when th was nil.
+    local order = {}
+    for i = 1, #chart do order[i] = i end
+    table.sort(order, function(a, b) return chart[a] > chart[b] end)
+    local top = {}
+    for rank = 1, #order do
+        local i = order[rank]
+        if rank > n and not (th and chart[i] >= th) then break end
+        top[lut[i]] = chart[i]
+    end
+    return top
+end
+
+--- Write the vectors to a file, one "id,v1,v2,..." line per document.
+-- @param vectors table mapping document id -> vector
+-- @param name path of the file to write
+-- @return true on success, false when the file cannot be opened
+doclassify.save_vectors = function(vectors, name)
+    local f = io.open(name, "w")
+    if f == nil then return false end
+    for id, v in pairs(vectors) do
+        f:write(id)
+        for i = 1, #v do f:write(","..v[i]) end
+        f:write("\n")
+    end
+    f:close()
+    return true
+end
+
+--- Write the top-similarity chart of every document to a file, one
+-- "id other_id similarity" line per retained pair.
+-- @param vectors table mapping document id -> vector
+-- @param name path of the file to write
+-- @param n number of matches always kept per document
+-- @param th optional similarity threshold for extra matches, 0.1 by default
+-- @return true on success, false when the file cannot be opened
+doclassify.save_topchart = function(vectors, name, n, th)
+    local f = io.open(name, "w")
+    if f == nil then return false end
+    if th == nil then th = 0.1 end
+    for k in pairs(vectors) do
+        -- top_similarity returns nil when k is the only document
+        local top = doclassify.top_similarity(k, vectors, n, th)
+        if top then
+            for a, b in pairs(top) do
+                f:write(k.." "..a.." "..b.."\n")
+            end
+        end
+    end
+    f:close()
+    return true
+end
+
+--- Cluster the documents with k-means, using cosine similarity.
+-- Iterates until the mean similarity between successive centroids is within
+-- 1e-9 of 1, or until maxstep iterations were run.
+-- @param nclass number of clusters
+-- @param documents table mapping document id -> bag with tf-idf filled in
+-- @param maxstep maximum number of iterations
+-- @param ids optional array of nclass document ids whose vectors seed the
+-- centroids; random centroids are used when nil
+-- @return results array mapping cluster -> list of document ids
+-- @return clusters table mapping document id -> cluster
+-- @return centroids array of the final centroids
+doclassify.kmean = function(nclass, documents, maxstep, ids)
+    local vectors, maxv, size = doclassify.get_vectors(documents)
+    local centroids = {}
+    local old_centroids = {}
+    local clusters = {}
+    for i = 1, nclass do
+        if ids == nil then
+            centroids[i] = doclassify.random(size, math.floor(maxv))
+        else
+            centroids[i] = vectors[ids[i]]
+        end
+        old_centroids[i] = doclassify.zeros(size)
+    end
+
+    -- loop until convergence or maxstep reached
+    local similarity = doclassify.mean_similarity(centroids, old_centroids)
+    local step = maxstep
+    while 1.0 - similarity > 1e-9 and step > 0 do
+        -- assign every document to its most similar centroid
+        clusters = {}
+        for id, v in pairs(vectors) do
+            local similarities = doclassify.similarities(v, centroids)
+            clusters[id] = doclassify.argmax(similarities)
+        end
+        old_centroids = centroids
+        -- move every centroid to the mean of the vectors assigned to it
+        local new_centroids = {}
+        for class in pairs(centroids) do
+            local cnt = 0
+            local cvectors = {}
+            for id, v in pairs(vectors) do
+                if clusters[id] == class then
+                    cnt = cnt + 1
+                    cvectors[cnt] = v
+                end
+            end
+            new_centroids[class] = doclassify.mean_vector(cvectors, size)
+        end
+        centroids = new_centroids
+        similarity = doclassify.mean_similarity(centroids, old_centroids)
+        echo("step #"..step..", similarity "..similarity)
+        step = step - 1
+    end
+    local results = {}
+    for i = 1, nclass do
+        local list = {}
+        local cnt = 0
+        for id, c in pairs(clusters) do
+            if c == i then
+                cnt = cnt + 1
+                list[cnt] = id
+            end
+        end
+        results[i] = list
+    end
+    return results, clusters, centroids
+end
+
+--- Vector of n zeros.
+-- @param n vector size
+-- @return array of n 0.0 values
+doclassify.zeros = function(n)
+    local vector = {}
+    for i = 1, n do vector[i] = 0.0 end
+    return vector
+end
+
+--- Vector of n random values.
+-- @param n vector size
+-- @param maxv upper bound of the random integer part
+-- @return array of n values, each a random integer in [0, maxv] plus a random
+-- fraction in [0, 1)
+doclassify.random = function(n, maxv)
+    local vector = {}
+    for i = 1, n do
+        vector[i] = math.random() + math.random(0, maxv)
+    end
+    return vector
+end
+
+--- Sum of the elements of a vector.
+-- @param v array of numbers
+-- @return sum, 0.0 for an empty array
+doclassify.sum = function(v)
+    local sum = 0.0
+    for i = 1, #v do sum = sum + v[i] end
+    return sum
+end
+
+--- Arithmetic mean of the elements of a vector.
+-- @param v non-empty array of numbers
+-- @return mean value
+doclassify.mean = function(v)
+    return doclassify.sum(v) / #v
+end
+
+--- Component-wise mean of a list of vectors.
+-- @param vectors array of vectors
+-- @param size vector size
+-- @return mean vector, all zeros when the list is nil or empty
+doclassify.mean_vector = function(vectors, size)
+    local means = doclassify.zeros(size)
+    if not vectors or #vectors == 0 then return means end
+    local times = 0
+    for _, v in pairs(vectors) do
+        for i = 1, #v do means[i] = means[i] + v[i] end
+        times = times + 1
+    end
+    for i = 1, size do means[i] = means[i] / times end
+    return means
+end
+
+--- Position and value of the smallest element of a vector.
+-- The last position wins on ties.
+-- @param v array of numbers
+-- @return index of the minimum (0 for an empty array)
+-- @return minimum value (0.0 for an empty array)
+doclassify.argmin = function(v)
+    -- start from +infinity: starting from 0.0 made any all-positive vector
+    -- report index 0
+    local minv = math.huge
+    local mini = 0
+    for i = 1, #v do
+        if v[i] <= minv then
+            mini = i
+            minv = v[i]
+        end
+    end
+    if mini == 0 then return 0, 0.0 end
+    return mini, minv
+end
+
+--- Position and value of the largest element of a vector.
+-- The last position wins on ties.
+-- @param v array of numbers
+-- @return index of the maximum (0 for an empty array)
+-- @return maximum value (0.0 for an empty array)
+doclassify.argmax = function(v)
+    -- start from -infinity so that all-negative vectors are handled too
+    local maxv = -math.huge
+    local maxi = 0
+    for i = 1, #v do
+        if v[i] >= maxv then
+            maxi = i
+            maxv = v[i]
+        end
+    end
+    if maxi == 0 then return 0, 0.0 end
+    return maxi, maxv
+end
+
+return doclassify
diff --git a/blog/ai/gettext.lua b/blog/ai/gettext.lua
new file mode 100644
index 0000000..b0dabbd
--- /dev/null
+++ b/blog/ai/gettext.lua
@@ -0,0 +1,37 @@
+local gettext = {}
+require("sqlite")
+
+--- Fetch the id and content of the blog records matching a condition.
+-- @param q condition, wrapped as {["="] = q} in the find expression
+-- @return the non-empty record list returned by db:find, or nil when no
+-- database handle is obtained or nothing is found
+gettext.get = function(q)
+    local db = require("os.libs.dbmodel").get("mrsang", "blogs", nil)
+    if not db then return nil end
+    local cond = {
+        exp = {["="] = q},
+        fields = {"id", "content"}
+    }
+    local data = db:find(cond)
+    db:close()
+    if not data or #data == 0 then return nil end
+    return data
+end
+
+--- Load a stop-word list holding one word per line.
+-- Surrounding whitespace (e.g. the "\r" of CRLF files) is stripped and blank
+-- lines are skipped.
+-- @param ospath operating-system path of the list
+-- @return set of words ({word = true})
+gettext.stopwords = function(ospath)
+    local words = {}
+    for line in io.lines(ospath) do
+        local word = line:match("^%s*(.-)%s*$")
+        if word ~= "" then
+            words[word] = true
+        end
+    end
+    return words
+end
+
+return gettext
\ No newline at end of file
diff --git a/blog/ai/stopwords.txt b/blog/ai/stopwords.txt
new file mode 100644
index 0000000..abe0cef
--- /dev/null
+++ b/blog/ai/stopwords.txt
@@ -0,0 +1,151 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+w
+r
+s
+t
+x
+y
+z
\ No newline at end of file
diff --git a/blog/ai/test.lua b/blog/ai/test.lua
new file mode 100644
index 0000000..21573a6
--- /dev/null
+++ b/blog/ai/test.lua
@@ -0,0 +1,47 @@
+local path = require("fs/vfs").ospath("home://aiws/blog-clustering")
+local gettext = loadfile(path.."/gettext.lua")()
+local cluster = loadfile(path.."/cluster.lua")()
+
+-- true: rebuild the similarity chart from the database and save it to file;
+-- false: load the chart saved by a previous run and print it
+local refresh = false
+
+local file = "/home/mrsang/test.csv"
+if refresh then
+    local data = gettext.get({publish=1})
+    local documents = {}
+    if data then
+        -- gettext.stopwords reads with io.lines, so give it the OS path
+        -- rather than the home:// URL
+        local sw = gettext.stopwords(path.."/stopwords.txt")
+        for _, post in pairs(data) do
+            documents[post.id] = cluster.bow(post.content, sw)
+        end
+        cluster.tfidf(documents)
+        local vectors = cluster.get_vectors(documents)
+        local s = cluster.save_topchart(vectors, file, 3)
+        if s then echo("file saved") else echo("error save file") end
+    else
+        echo("Data missing")
+    end
+else
+    local f = io.open(file, "r")
+    if not f then
+        echo("Cannot open "..file)
+    else
+        -- each line is "id other_id similarity"
+        local result = {}
+        for line in f:lines() do
+            local arr = {}
+            local cnt = 0
+            for i in line:gmatch("%S+") do
+                cnt = cnt + 1
+                arr[cnt] = i
+            end
+            if not result[arr[1]] then result[arr[1]] = {} end
+            result[arr[1]][arr[2]] = tonumber(arr[3])
+        end
+        f:close()
+        echo(JSON.encode(result))
+    end
+end
\ No newline at end of file
diff --git a/blog/controllers/PostController.lua b/blog/controllers/PostController.lua
index 32381c2..2ca40ee 100644
--- a/blog/controllers/PostController.lua
+++ b/blog/controllers/PostController.lua
@@ -217,7 +217,7 @@ function PostController:analyse(n)
     if not n then
         n = 5
     end
-    local path = "/home/mrsang/aiws/blog-clustering"
+    local path = WWW_ROOT .. "/ai"
     local gettext = loadfile(path .. "/gettext.lua")()
     local cluster = loadfile(path .. "/cluster.lua")()
     local data = gettext.get({publish = 1})
diff --git a/dist/antd_web_apps.tar.gz b/dist/antd_web_apps.tar.gz
index 41b05e8..0ba9588 100644
Binary files a/dist/antd_web_apps.tar.gz and b/dist/antd_web_apps.tar.gz differ