mirror of
https://github.com/antos-rde/antosdk-apps.git
synced 2024-12-25 03:38:21 +01:00
feat(Blogger): support blog posts TF-IDF analyse
This commit is contained in:
parent
545f630b0e
commit
f9f27321ab
@ -6,6 +6,7 @@ Backend for my blog at https://blog.iohub.dev
|
||||
## Change logs
|
||||
|
||||
### v0.2.x-a
|
||||
* Patch 11: Add TFIDF analyse functionality
|
||||
* Patch 10: Migrate code to typescript, use SQLiteDB lib for database access
|
||||
* Patch 9: Update to use the new MDE library
|
||||
* Patch 8: Support for antOS 2.0.x
|
||||
|
66
Blogger/api/ai/analyse.lua
Normal file
66
Blogger/api/ai/analyse.lua
Normal file
@ -0,0 +1,66 @@
|
||||
-- Arguments handed over by the caller of this script. The fields read in
-- this file are `dbpath` (resolved to an OS path below) and `top`.
local args = ...

-- Response object returned at the end of the script.
local ret = {
    error = false,
    result = nil
}
-- Directory part (trailing slash included) of this script's source path.
-- Falls back to the current directory when the source has no directory part.
local __dir__ = debug.getinfo(1).source:match("@?(.*/)") or "./"
LOG_DEBUG("CURRENT PATH:%s", __dir__)
-- loadfile returns nil plus a message on failure; assert surfaces that
-- message instead of the opaque "attempt to call a nil value".
local cluster = assert(loadfile(__dir__.."/cluster.lua"))()
local dbpath = require("vfs").ospath(args.dbpath)
LOG_DEBUG("DB PATH:%s", dbpath)

-- Namespace for the data-access helpers defined below.
local gettext = {}
|
||||
-- Fetch the `id` and `content` columns of every published row of the
-- "blogs" table.
-- file: path of the database, passed to DBModel as `db`
-- Returns the rows found, or nil when no handle could be created or the
-- query yielded nothing.
gettext.get = function(file)
    local db = DBModel:new{db=file}
    -- validate the handle BEFORE calling any method on it
    if not db then return nil end
    db:open()
    local data = db:find("blogs", {
        where = { publish = 1 },
        fields = {"id", "content"}
    })
    db:close()
    if not data or #data == 0 then return nil end
    return data
end
|
||||
|
||||
-- Load a stop-word list (one word per line) into a lookup set.
-- ospath: path of the word list; io.lines raises an error if it cannot be
--         opened
-- Returns a table whose keys are the words and whose values are `true`.
gettext.stopwords = function(ospath)
    local words = {}
    for line in io.lines(ospath) do
        -- strip surrounding whitespace (including the CR of CRLF files) so
        -- the entry can match a tokenised word; ignore blank lines
        local word = line:match("^%s*(.-)%s*$")
        if word ~= "" then
            words[word:lower()] = true
        end
    end
    return words
end
|
||||
|
||||
-- Main flow: build a TF-IDF model over the published posts, then rewrite the
-- "st_similarity" table with the most similar posts of each post.
local documents = {}
-- `top` is used as a numeric limit; reject anything that is not a number
-- before the similarity table is purged.
local limit = tonumber(args.top)
local data = limit and gettext.get(dbpath)
if not limit then
    ret.error = "Invalid parameter 'top': a number is expected"
elseif data then
    local sw = gettext.stopwords(__dir__.."/stopwords.txt")
    for _, post in pairs(data) do
        documents[post.id] = cluster.bow(post.content, sw)
    end

    cluster.tfidf(documents)
    local vectors = cluster.get_vectors(documents)
    local analytical = DBModel:new{db=dbpath}
    analytical:open()
    -- purge the table
    analytical:delete("st_similarity", nil)
    -- get similarity and put to the table
    for id in pairs(vectors) do
        -- top_similarity returns nil when there is no other post to compare
        -- with; skip such posts instead of failing on pairs(nil)
        local top = cluster.top_similarity(id, vectors, limit, 0.1)
        if top then
            for sid, score in pairs(top) do
                local record = {pid = id, sid = sid, score = score}
                analytical:insert("st_similarity", record)
            end
        end
    end
    analytical:close()
    ret.result = "Analyse complete"
else
    ret.error = "Unable to query database for post"
end

return ret
|
346
Blogger/api/ai/cluster.lua
Normal file
346
Blogger/api/ai/cluster.lua
Normal file
@ -0,0 +1,346 @@
|
||||
local doclassify = {}
|
||||
local st = require("stmr")
|
||||
-- Build the bag of words of one document.
-- data: text of the document; nil yields an empty bag, other non-string
--       values are converted with tostring
-- stopwords: set (word -> true) of lower-case words to ignore; may be nil
-- Returns a table mapping each stemmed word to {count, tf, tfidf}, where
-- tf = log(1 + count) and tfidf is left at 0.0.
doclassify.bow = function(data, stopwords)
    local bag = {}
    if data == nil then return bag end
    stopwords = stopwords or {}
    for w in tostring(data):gmatch('%w+') do
        local word = w:lower()
        if not stopwords[word] then
            -- stop words are filtered before stemming, counts are per stem
            word = st.stmr(word)
            local entry = bag[word]
            if entry then
                entry.count = entry.count + 1
            else
                bag[word] = {count=1, tf=0, tfidf=0.0}
            end
        end
    end
    -- now calculate the tf of the bag
    for _, entry in pairs(bag) do
        entry.tf = math.log(1 + entry.count)
    end
    return bag
end
|
||||
-- Count every key of a table (array and hash part alike).
-- t: the table to measure
-- Returns the number of key/value pairs.
doclassify.len = function(t)
    -- the parameter is deliberately not called `table`, which would shadow
    -- the standard library inside this function
    local cnt = 0
    for _ in pairs(t) do cnt = cnt + 1 end
    return cnt
end
|
||||
-- Fill in the `tfidf` field of every term of every bag.
-- tfidf = tf * log(ndoc / df), where df is the number of documents whose bag
-- contains the term. The inverse document frequency measures how much
-- information a term provides: it is 0 for a term present everywhere.
-- documents: table mapping a document id to its bag (see doclassify.bow)
-- The bags are updated in place; nothing is returned.
doclassify.tfidf = function(documents)
    local ndoc = doclassify.len(documents)
    -- count the document frequency of each term once, instead of rescanning
    -- all documents for every term occurrence
    local df = {}
    for _, bag in pairs(documents) do
        for term in pairs(bag) do
            df[term] = (df[term] or 0) + 1
        end
    end
    for _, bag in pairs(documents) do
        for term, b in pairs(bag) do
            b.tfidf = b.tf*math.log(ndoc/df[term])
        end
    end
end
|
||||
|
||||
-- Look a term up in every document.
-- term: the word to search; it is lower-cased before the lookup
-- documents: table mapping a document id to its bag
-- Returns a table mapping the id of each document containing the term to
-- the term's tfidf in that document.
-- NOTE(review): bag keys are produced by the stemmer in doclassify.bow while
-- the term is not stemmed here -- confirm whether callers pass stems.
doclassify.search = function(term, documents)
    local r = {}
    -- lower-case once and use the same key for the test and the access
    local key = term:lower()
    for id, doc in pairs(documents) do
        local entry = doc[key]
        if entry then
            r[id] = entry.tfidf
        end
    end
    return r
end
|
||||
|
||||
-- Turn the bags of all documents into dense vectors over a shared vocabulary.
-- documents: table mapping a document id to its bag
-- Returns four values:
--   vectors: document id -> array holding the tfidf of each vocabulary term
--            (0 when the document does not contain the term)
--   maxv:    largest tfidf met in any bag (0 when none is positive)
--   size:    number of distinct terms, i.e. the length of every vector
--   terms:   term -> position of that term in the vectors
doclassify.get_vectors = function(documents)
    local terms = {}
    local nterms = 0
    local maxv = 0

    -- pass 1: give every distinct term a position and track the max tfidf
    for _, doc in pairs(documents) do
        for term, entry in pairs(doc) do
            if entry.tfidf > maxv then
                maxv = entry.tfidf
            end
            if not terms[term] then
                nterms = nterms + 1
                terms[term] = nterms
            end
        end
    end

    -- pass 2: project each document on the vocabulary
    local vectors = {}
    for id, doc in pairs(documents) do
        local vec = {}
        for term, slot in pairs(terms) do
            local entry = doc[term]
            if entry then
                vec[slot] = entry.tfidf
            else
                vec[slot] = 0
            end
        end
        vectors[id] = vec
    end
    return vectors, maxv, nterms, terms
end
|
||||
|
||||
-- Cosine similarity of two numeric arrays.
-- va, vb: arrays of numbers; the first #va positions of both are read
-- Returns dot(va, vb) / (|va| * |vb|), or 0 when either magnitude is 0.
doclassify.similarity = function(va, vb)
    local dot, sq_a, sq_b = 0, 0, 0
    for k = 1, #va do
        local a, b = va[k], vb[k]
        dot = dot + a*b
        sq_a = sq_a + a*a
        sq_b = sq_b + b*b
    end
    local norm_a = math.sqrt(sq_a)
    local norm_b = math.sqrt(sq_b)
    -- a zero vector has no direction: report no similarity
    if norm_a == 0 or norm_b == 0 then
        return 0
    end
    return dot / (norm_b*norm_a)
end
|
||||
-- Cosine similarity between one vector and each vector of a collection.
-- v1: reference vector
-- collection: array of vectors, all expected to have the length of v1
-- Returns an array where position i is the similarity of v1 and
-- collection[i]; an empty collection yields an empty array.
-- Raises an assertion error when the first vector's length differs from v1.
doclassify.similarities = function(v1, collection)
    local similarities = {}
    -- nothing to compare with: avoid taking the length of collection[1]
    if #collection == 0 then return similarities end
    assert(#v1 == #(collection[1]), "Incorrect vectors size")
    for i=1,#collection do
        similarities[i] = doclassify.similarity(v1, collection[i])
    end
    return similarities
end
|
||||
|
||||
-- Average pairwise cosine similarity of two lists of vectors.
-- v1, v2: arrays of vectors with the same number of entries
-- Returns the mean of similarity(v1[i], v2[i]) over all positions.
-- Raises an assertion error when the two lists differ in length.
doclassify.mean_similarity = function(v1, v2)
    assert(#v1 == #v2, "Incorrect vectors size")
    local scores = {}
    local count = #v1
    for idx = 1, count do
        scores[idx] = doclassify.similarity(v1[idx], v2[idx])
    end
    return doclassify.mean(scores)
end
|
||||
-- Compare one document with every other document.
-- id: key of the reference document in `vectors`
-- vectors: table mapping a document id to its vector
-- Returns two values: the array of similarities and a lookup array giving,
-- for each position, the id of the document compared. When there is no
-- other document, a single empty table is returned (no lookup array).
doclassify.similarity_chart = function(id, vectors)
    local others = {}
    local owners = {}
    for key, vec in pairs(vectors) do
        if key ~= id then
            local slot = #others + 1
            others[slot] = vec
            owners[slot] = key
        end
    end
    if not others[1] then
        return {}
    end
    return doclassify.similarities(vectors[id], others), owners
end
|
||||
|
||||
-- Select the documents most similar to a given one.
-- id: key of the reference document in `vectors`
-- vectors: table mapping a document id to its vector
-- n: maximum number of entries to keep; nil keeps every candidate
-- th: optional minimum score; entries scoring below it are dropped
-- Returns a table mapping a document id to its similarity score, holding at
-- most n entries, or nil when there is no other document to compare with.
-- Each candidate is considered once, so a score is never overwritten when n
-- exceeds the number of candidates.
doclassify.top_similarity = function(id, vectors, n, th)
    local chart, lut = doclassify.similarity_chart(id, vectors)
    if not lut or #lut <= 0 then return nil end

    -- rank the chart positions by decreasing score; on equal scores the
    -- higher position comes first
    local order = {}
    for i = 1, #chart do order[i] = i end
    table.sort(order, function(a, b)
        if chart[a] ~= chart[b] then return chart[a] > chart[b] end
        return a > b
    end)

    local limit = #order
    if n and n < limit then limit = n end

    local top = {}
    for rank = 1, limit do
        local i = order[rank]
        local score = chart[i]
        -- scores are sorted: once below the threshold, all the rest are too
        if th and score < th then break end
        top[lut[i]] = score
    end
    return top
end
|
||||
-- Dump vectors to a text file, one line per document: the id followed by
-- the comma-prefixed components.
-- vectors: table mapping a document id to its vector
-- name: path of the file to write (opened in "w" mode)
-- Returns false when the file cannot be opened, true otherwise.
doclassify.save_vectors = function(vectors, name)
    local out = io.open(name,"w")
    if out == nil then
        return false
    end
    for id, vec in pairs(vectors) do
        out:write(id)
        local width = #vec
        for col = 1, width do
            out:write(","..vec[col])
        end
        out:write("\n")
    end
    out:close()
    return true
end
|
||||
-- Write the top-similarity chart of every document to a text file, one
-- "id other_id score" line per related pair.
-- vectors: table mapping a document id to its vector
-- name: path of the file to write (opened in "w" mode)
-- n: number of related documents requested per document
-- th: optional similarity threshold, 0.1 when omitted
-- Returns false when the file cannot be opened, true otherwise.
doclassify.save_topchart = function(vectors, name, n, th)
    if th == nil then th = 0.1 end
    local f = io.open(name,"w")
    if f == nil then return false end
    for k in pairs(vectors) do
        local top = doclassify.top_similarity(k, vectors, n, th)
        -- top is nil when k has no other document to be compared with
        if top then
            for a, b in pairs(top) do
                f:write(k.." "..a.." "..b.."\n")
            end
        end
    end
    f:close()
    return true
end
|
||||
-- Group documents into classes with a k-means loop driven by cosine
-- similarity.
-- nclass: number of classes
-- documents: table mapping a document id to its bag
-- maxstep: maximum number of iterations
-- ids: optional array of document ids used as the initial centroids; when
--      nil the centroids are drawn at random
-- Returns three values: an array giving the list of document ids of each
-- class, the table document id -> class, and the final centroids.
-- Each iteration reports its progress through echo.
doclassify.kmean = function(nclass, documents, maxstep, ids)
    local vectors, maxv, size = doclassify.get_vectors(documents)
    local centroids, previous = {}, {}
    local clusters = {}

    -- initial centroids, compared against zero vectors the first time
    for class = 1, nclass do
        if ids ~= nil then
            centroids[class] = vectors[ids[class]]
        else
            centroids[class] = doclassify.random(size, math.floor(maxv))
        end
        previous[class] = doclassify.zeros(size)
    end

    -- loop until the centroids stop moving or maxstep is reached
    local similarity = doclassify.mean_similarity(centroids, previous)
    local remaining = maxstep
    while 1.0-similarity > 1e-9 and remaining > 0 do
        -- assignment: each document joins its most similar centroid
        clusters = {}
        local members = {}
        for id, vec in pairs(vectors) do
            local best = doclassify.argmax(doclassify.similarities(vec, centroids))
            clusters[id] = best
            local bucket = members[best]
            if not bucket then
                bucket = {}
                members[best] = bucket
            end
            bucket[#bucket + 1] = vec
        end

        -- update: each centroid becomes the mean of its members
        previous = centroids
        local updated = {}
        for class in pairs(centroids) do
            updated[class] = doclassify.mean_vector(members[class] or {}, size)
        end
        centroids = updated

        similarity = doclassify.mean_similarity(centroids, previous)
        echo("step #"..remaining..", similarity "..similarity)
        remaining = remaining - 1
    end

    -- list the documents of each class
    local results = {}
    for class = 1, nclass do
        results[class] = {}
    end
    for id, class in pairs(clusters) do
        local list = results[class]
        if list then
            list[#list + 1] = id
        end
    end
    return results, clusters, centroids
end
|
||||
|
||||
-- Create an array of n zeros.
-- n: length of the array
-- Returns a new array whose n elements are all 0.0.
doclassify.zeros = function(n)
    local out = {}
    for slot = 1, n do
        out[slot] = 0.0
    end
    return out
end
|
||||
|
||||
-- Create an array of n random numbers.
-- n: length of the array
-- maxv: upper bound handed to math.random(0, maxv)
-- Each element is the sum of math.random() and math.random(0, maxv).
doclassify.random = function(n,maxv)
    local out = {}
    for slot = 1, n do
        local fraction = math.random()
        local whole = math.random(0, maxv)
        out[slot] = fraction + whole
    end
    return out
end
|
||||
|
||||
-- Add up the elements of a numeric array.
-- v: array of numbers
-- Returns the total, 0.0 for an empty array.
doclassify.sum = function(v)
    local total = 0.0
    local count = #v
    for idx = 1, count do
        total = total + v[idx]
    end
    return total
end
|
||||
|
||||
-- Arithmetic mean of a numeric array.
-- v: array of numbers
-- Returns the sum of the elements divided by #v.
doclassify.mean = function(v)
    local total = doclassify.sum(v)
    return total/#v
end
|
||||
|
||||
-- Component-wise mean of a list of vectors.
-- vectors: array of vectors; nil or an empty array is accepted
-- size: length of the resulting vector
-- Returns a new vector of `size` elements; all zeros when there is nothing
-- to average.
doclassify.mean_vector = function(vectors, size)
    local acc = doclassify.zeros(size)
    local nothing = (not vectors) or #vectors == 0
    if nothing then
        return acc
    end
    local seen = 0
    for _, vec in pairs(vectors) do
        for slot = 1, #vec do
            acc[slot] = acc[slot] + vec[slot]
        end
        seen = seen + 1
    end
    for slot = 1, size do
        acc[slot] = acc[slot]/seen
    end
    return acc
end
|
||||
|
||||
-- Find the smallest element of a numeric array.
-- v: array of numbers
-- Returns the index and the value of the minimum. When several elements
-- share the minimum, the last one wins. An empty array yields 0, 0.0.
doclassify.argmin = function(v)
    if #v == 0 then return 0, 0.0 end
    -- start from the first element rather than from 0.0, otherwise the
    -- minimum of an all-positive array is never found
    local mini, minv = 1, v[1]
    for i = 2, #v do
        if v[i] <= minv then
            mini = i
            minv = v[i]
        end
    end
    return mini, minv
end
|
||||
|
||||
-- Find the largest element of a numeric array.
-- v: array of numbers
-- Returns the index and the value of the maximum. When several elements
-- share the maximum, the last one wins. An empty array yields 0, 0.0.
doclassify.argmax = function(v)
    if #v == 0 then return 0, 0.0 end
    -- start from the first element rather than from 0.0, otherwise the
    -- maximum of an all-negative array is never found
    local maxi, maxv = 1, v[1]
    for i = 2, #v do
        if v[i] >= maxv then
            maxi = i
            maxv = v[i]
        end
    end
    return maxi, maxv
end
|
||||
|
||||
return doclassify
|
151
Blogger/api/ai/stopwords.txt
Normal file
151
Blogger/api/ai/stopwords.txt
Normal file
@ -0,0 +1,151 @@
|
||||
i
|
||||
me
|
||||
my
|
||||
myself
|
||||
we
|
||||
our
|
||||
ours
|
||||
ourselves
|
||||
you
|
||||
your
|
||||
yours
|
||||
yourself
|
||||
yourselves
|
||||
he
|
||||
him
|
||||
his
|
||||
himself
|
||||
she
|
||||
her
|
||||
hers
|
||||
herself
|
||||
it
|
||||
its
|
||||
itself
|
||||
they
|
||||
them
|
||||
their
|
||||
theirs
|
||||
themselves
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
am
|
||||
is
|
||||
are
|
||||
was
|
||||
were
|
||||
be
|
||||
been
|
||||
being
|
||||
have
|
||||
has
|
||||
had
|
||||
having
|
||||
do
|
||||
does
|
||||
did
|
||||
doing
|
||||
a
|
||||
an
|
||||
the
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
s
|
||||
t
|
||||
can
|
||||
will
|
||||
just
|
||||
don
|
||||
should
|
||||
now
|
||||
a
|
||||
b
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
j
|
||||
k
|
||||
l
|
||||
m
|
||||
n
|
||||
o
|
||||
p
|
||||
q
|
||||
w
|
||||
r
|
||||
s
|
||||
t
|
||||
x
|
||||
y
|
||||
z
|
50
Blogger/api/ai/test.lua
Normal file
50
Blogger/api/ai/test.lua
Normal file
@ -0,0 +1,50 @@
|
||||
-- Scratch script for the clustering module. With `refresh` set it rebuilds
-- the top-similarity chart and saves it to `file`; otherwise it reads that
-- file back and echoes it as JSON.
local path = require("fs/vfs").ospath("home://aiws/blog-clustering")
-- assert surfaces the loadfile error message when a module is missing
local gettext = assert(loadfile(path.."/gettext.lua"))()
local cluster = assert(loadfile(path.."/cluster.lua"))()

local refresh = false

local file = "/home/mrsang/test.csv"
if refresh then
    local data = gettext.get({publish=1})
    local documents = {}
    if data then
        local sw = gettext.stopwords("home://aiws/blog-clustering/stopwords.txt")
        for _, post in pairs(data) do
            documents[post.id] = cluster.bow(post.content, sw)
        end
        cluster.tfidf(documents)
        local vectors = cluster.get_vectors(documents)
        local s = cluster.save_topchart(vectors,file, 3)
        if s then echo("file saved") else echo("error save file") end
    else
        echo("Data missing")
    end
else
    local f = io.open(file,"r")
    if f then
        -- each line holds "id other_id score"
        local result = {}
        for line in f:lines() do
            local arr = {}
            for field in line:gmatch("%S+") do
                arr[#arr + 1] = field
            end
            -- skip blank or truncated lines: a nil key cannot index a table
            if arr[1] and arr[2] then
                if not result[arr[1]] then result[arr[1]] = {} end
                result[arr[1]][arr[2]] = tonumber(arr[3])
            end
        end
        f:close()
        echo(JSON.encode(result))
    else
        echo("Unable to open "..file)
    end
end
|
@ -55,7 +55,7 @@
|
||||
"data": {
|
||||
"src": [
|
||||
"scheme.html",
|
||||
"api/sendmail.lua",
|
||||
"api",
|
||||
"package.json",
|
||||
"README.md",
|
||||
"main.css"
|
||||
|
@ -6,6 +6,7 @@ Backend for my blog at https://blog.iohub.dev
|
||||
## Change logs
|
||||
|
||||
### v0.2.x-a
|
||||
* Patch 11: Add TFIDF analyse functionality
|
||||
* Patch 10: Migrate code to typescript, use SQLiteDB lib for database access
|
||||
* Patch 9: Update to use the new MDE library
|
||||
* Patch 8: Support for antOS 2.0.x
|
||||
|
66
Blogger/build/debug/api/ai/analyse.lua
Normal file
66
Blogger/build/debug/api/ai/analyse.lua
Normal file
@ -0,0 +1,66 @@
|
||||
-- Arguments handed over by the caller of this script. The fields read in
-- this file are `dbpath` (resolved to an OS path below) and `top`.
local args = ...

-- Response object returned at the end of the script.
local ret = {
    error = false,
    result = nil
}
-- Directory part (trailing slash included) of this script's source path.
-- Falls back to the current directory when the source has no directory part.
local __dir__ = debug.getinfo(1).source:match("@?(.*/)") or "./"
LOG_DEBUG("CURRENT PATH:%s", __dir__)
-- loadfile returns nil plus a message on failure; assert surfaces that
-- message instead of the opaque "attempt to call a nil value".
local cluster = assert(loadfile(__dir__.."/cluster.lua"))()
local dbpath = require("vfs").ospath(args.dbpath)
LOG_DEBUG("DB PATH:%s", dbpath)

-- Namespace for the data-access helpers defined below.
local gettext = {}
|
||||
-- Fetch the `id` and `content` columns of every published row of the
-- "blogs" table.
-- file: path of the database, passed to DBModel as `db`
-- Returns the rows found, or nil when no handle could be created or the
-- query yielded nothing.
gettext.get = function(file)
    local db = DBModel:new{db=file}
    -- validate the handle BEFORE calling any method on it
    if not db then return nil end
    db:open()
    local data = db:find("blogs", {
        where = { publish = 1 },
        fields = {"id", "content"}
    })
    db:close()
    if not data or #data == 0 then return nil end
    return data
end
|
||||
|
||||
-- Load a stop-word list (one word per line) into a lookup set.
-- ospath: path of the word list; io.lines raises an error if it cannot be
--         opened
-- Returns a table whose keys are the words and whose values are `true`.
gettext.stopwords = function(ospath)
    local words = {}
    for line in io.lines(ospath) do
        -- strip surrounding whitespace (including the CR of CRLF files) so
        -- the entry can match a tokenised word; ignore blank lines
        local word = line:match("^%s*(.-)%s*$")
        if word ~= "" then
            words[word:lower()] = true
        end
    end
    return words
end
|
||||
|
||||
-- Main flow: build a TF-IDF model over the published posts, then rewrite the
-- "st_similarity" table with the most similar posts of each post.
local documents = {}
-- `top` is used as a numeric limit; reject anything that is not a number
-- before the similarity table is purged.
local limit = tonumber(args.top)
local data = limit and gettext.get(dbpath)
if not limit then
    ret.error = "Invalid parameter 'top': a number is expected"
elseif data then
    local sw = gettext.stopwords(__dir__.."/stopwords.txt")
    for _, post in pairs(data) do
        documents[post.id] = cluster.bow(post.content, sw)
    end

    cluster.tfidf(documents)
    local vectors = cluster.get_vectors(documents)
    local analytical = DBModel:new{db=dbpath}
    analytical:open()
    -- purge the table
    analytical:delete("st_similarity", nil)
    -- get similarity and put to the table
    for id in pairs(vectors) do
        -- top_similarity returns nil when there is no other post to compare
        -- with; skip such posts instead of failing on pairs(nil)
        local top = cluster.top_similarity(id, vectors, limit, 0.1)
        if top then
            for sid, score in pairs(top) do
                local record = {pid = id, sid = sid, score = score}
                analytical:insert("st_similarity", record)
            end
        end
    end
    analytical:close()
    ret.result = "Analyse complete"
else
    ret.error = "Unable to query database for post"
end

return ret
|
346
Blogger/build/debug/api/ai/cluster.lua
Normal file
346
Blogger/build/debug/api/ai/cluster.lua
Normal file
@ -0,0 +1,346 @@
|
||||
local doclassify = {}
|
||||
local st = require("stmr")
|
||||
-- Build the bag of words of one document.
-- data: text of the document; nil yields an empty bag, other non-string
--       values are converted with tostring
-- stopwords: set (word -> true) of lower-case words to ignore; may be nil
-- Returns a table mapping each stemmed word to {count, tf, tfidf}, where
-- tf = log(1 + count) and tfidf is left at 0.0.
doclassify.bow = function(data, stopwords)
    local bag = {}
    if data == nil then return bag end
    stopwords = stopwords or {}
    for w in tostring(data):gmatch('%w+') do
        local word = w:lower()
        if not stopwords[word] then
            -- stop words are filtered before stemming, counts are per stem
            word = st.stmr(word)
            local entry = bag[word]
            if entry then
                entry.count = entry.count + 1
            else
                bag[word] = {count=1, tf=0, tfidf=0.0}
            end
        end
    end
    -- now calculate the tf of the bag
    for _, entry in pairs(bag) do
        entry.tf = math.log(1 + entry.count)
    end
    return bag
end
|
||||
-- Count every key of a table (array and hash part alike).
-- t: the table to measure
-- Returns the number of key/value pairs.
doclassify.len = function(t)
    -- the parameter is deliberately not called `table`, which would shadow
    -- the standard library inside this function
    local cnt = 0
    for _ in pairs(t) do cnt = cnt + 1 end
    return cnt
end
|
||||
-- Fill in the `tfidf` field of every term of every bag.
-- tfidf = tf * log(ndoc / df), where df is the number of documents whose bag
-- contains the term. The inverse document frequency measures how much
-- information a term provides: it is 0 for a term present everywhere.
-- documents: table mapping a document id to its bag (see doclassify.bow)
-- The bags are updated in place; nothing is returned.
doclassify.tfidf = function(documents)
    local ndoc = doclassify.len(documents)
    -- count the document frequency of each term once, instead of rescanning
    -- all documents for every term occurrence
    local df = {}
    for _, bag in pairs(documents) do
        for term in pairs(bag) do
            df[term] = (df[term] or 0) + 1
        end
    end
    for _, bag in pairs(documents) do
        for term, b in pairs(bag) do
            b.tfidf = b.tf*math.log(ndoc/df[term])
        end
    end
end
|
||||
|
||||
-- Look a term up in every document.
-- term: the word to search; it is lower-cased before the lookup
-- documents: table mapping a document id to its bag
-- Returns a table mapping the id of each document containing the term to
-- the term's tfidf in that document.
-- NOTE(review): bag keys are produced by the stemmer in doclassify.bow while
-- the term is not stemmed here -- confirm whether callers pass stems.
doclassify.search = function(term, documents)
    local r = {}
    -- lower-case once and use the same key for the test and the access
    local key = term:lower()
    for id, doc in pairs(documents) do
        local entry = doc[key]
        if entry then
            r[id] = entry.tfidf
        end
    end
    return r
end
|
||||
|
||||
-- Turn the bags of all documents into dense vectors over a shared vocabulary.
-- documents: table mapping a document id to its bag
-- Returns four values:
--   vectors: document id -> array holding the tfidf of each vocabulary term
--            (0 when the document does not contain the term)
--   maxv:    largest tfidf met in any bag (0 when none is positive)
--   size:    number of distinct terms, i.e. the length of every vector
--   terms:   term -> position of that term in the vectors
doclassify.get_vectors = function(documents)
    local terms = {}
    local nterms = 0
    local maxv = 0

    -- pass 1: give every distinct term a position and track the max tfidf
    for _, doc in pairs(documents) do
        for term, entry in pairs(doc) do
            if entry.tfidf > maxv then
                maxv = entry.tfidf
            end
            if not terms[term] then
                nterms = nterms + 1
                terms[term] = nterms
            end
        end
    end

    -- pass 2: project each document on the vocabulary
    local vectors = {}
    for id, doc in pairs(documents) do
        local vec = {}
        for term, slot in pairs(terms) do
            local entry = doc[term]
            if entry then
                vec[slot] = entry.tfidf
            else
                vec[slot] = 0
            end
        end
        vectors[id] = vec
    end
    return vectors, maxv, nterms, terms
end
|
||||
|
||||
-- Cosine similarity of two numeric arrays.
-- va, vb: arrays of numbers; the first #va positions of both are read
-- Returns dot(va, vb) / (|va| * |vb|), or 0 when either magnitude is 0.
doclassify.similarity = function(va, vb)
    local dot, sq_a, sq_b = 0, 0, 0
    for k = 1, #va do
        local a, b = va[k], vb[k]
        dot = dot + a*b
        sq_a = sq_a + a*a
        sq_b = sq_b + b*b
    end
    local norm_a = math.sqrt(sq_a)
    local norm_b = math.sqrt(sq_b)
    -- a zero vector has no direction: report no similarity
    if norm_a == 0 or norm_b == 0 then
        return 0
    end
    return dot / (norm_b*norm_a)
end
|
||||
-- Cosine similarity between one vector and each vector of a collection.
-- v1: reference vector
-- collection: array of vectors, all expected to have the length of v1
-- Returns an array where position i is the similarity of v1 and
-- collection[i]; an empty collection yields an empty array.
-- Raises an assertion error when the first vector's length differs from v1.
doclassify.similarities = function(v1, collection)
    local similarities = {}
    -- nothing to compare with: avoid taking the length of collection[1]
    if #collection == 0 then return similarities end
    assert(#v1 == #(collection[1]), "Incorrect vectors size")
    for i=1,#collection do
        similarities[i] = doclassify.similarity(v1, collection[i])
    end
    return similarities
end
|
||||
|
||||
-- Average pairwise cosine similarity of two lists of vectors.
-- v1, v2: arrays of vectors with the same number of entries
-- Returns the mean of similarity(v1[i], v2[i]) over all positions.
-- Raises an assertion error when the two lists differ in length.
doclassify.mean_similarity = function(v1, v2)
    assert(#v1 == #v2, "Incorrect vectors size")
    local scores = {}
    local count = #v1
    for idx = 1, count do
        scores[idx] = doclassify.similarity(v1[idx], v2[idx])
    end
    return doclassify.mean(scores)
end
|
||||
-- Compare one document with every other document.
-- id: key of the reference document in `vectors`
-- vectors: table mapping a document id to its vector
-- Returns two values: the array of similarities and a lookup array giving,
-- for each position, the id of the document compared. When there is no
-- other document, a single empty table is returned (no lookup array).
doclassify.similarity_chart = function(id, vectors)
    local others = {}
    local owners = {}
    for key, vec in pairs(vectors) do
        if key ~= id then
            local slot = #others + 1
            others[slot] = vec
            owners[slot] = key
        end
    end
    if not others[1] then
        return {}
    end
    return doclassify.similarities(vectors[id], others), owners
end
|
||||
|
||||
-- Select the documents most similar to a given one.
-- id: key of the reference document in `vectors`
-- vectors: table mapping a document id to its vector
-- n: maximum number of entries to keep; nil keeps every candidate
-- th: optional minimum score; entries scoring below it are dropped
-- Returns a table mapping a document id to its similarity score, holding at
-- most n entries, or nil when there is no other document to compare with.
-- Each candidate is considered once, so a score is never overwritten when n
-- exceeds the number of candidates.
doclassify.top_similarity = function(id, vectors, n, th)
    local chart, lut = doclassify.similarity_chart(id, vectors)
    if not lut or #lut <= 0 then return nil end

    -- rank the chart positions by decreasing score; on equal scores the
    -- higher position comes first
    local order = {}
    for i = 1, #chart do order[i] = i end
    table.sort(order, function(a, b)
        if chart[a] ~= chart[b] then return chart[a] > chart[b] end
        return a > b
    end)

    local limit = #order
    if n and n < limit then limit = n end

    local top = {}
    for rank = 1, limit do
        local i = order[rank]
        local score = chart[i]
        -- scores are sorted: once below the threshold, all the rest are too
        if th and score < th then break end
        top[lut[i]] = score
    end
    return top
end
|
||||
-- Dump vectors to a text file, one line per document: the id followed by
-- the comma-prefixed components.
-- vectors: table mapping a document id to its vector
-- name: path of the file to write (opened in "w" mode)
-- Returns false when the file cannot be opened, true otherwise.
doclassify.save_vectors = function(vectors, name)
    local out = io.open(name,"w")
    if out == nil then
        return false
    end
    for id, vec in pairs(vectors) do
        out:write(id)
        local width = #vec
        for col = 1, width do
            out:write(","..vec[col])
        end
        out:write("\n")
    end
    out:close()
    return true
end
|
||||
-- Write the top-similarity chart of every document to a text file, one
-- "id other_id score" line per related pair.
-- vectors: table mapping a document id to its vector
-- name: path of the file to write (opened in "w" mode)
-- n: number of related documents requested per document
-- th: optional similarity threshold, 0.1 when omitted
-- Returns false when the file cannot be opened, true otherwise.
doclassify.save_topchart = function(vectors, name, n, th)
    if th == nil then th = 0.1 end
    local f = io.open(name,"w")
    if f == nil then return false end
    for k in pairs(vectors) do
        local top = doclassify.top_similarity(k, vectors, n, th)
        -- top is nil when k has no other document to be compared with
        if top then
            for a, b in pairs(top) do
                f:write(k.." "..a.." "..b.."\n")
            end
        end
    end
    f:close()
    return true
end
|
||||
-- Group documents into classes with a k-means loop driven by cosine
-- similarity.
-- nclass: number of classes
-- documents: table mapping a document id to its bag
-- maxstep: maximum number of iterations
-- ids: optional array of document ids used as the initial centroids; when
--      nil the centroids are drawn at random
-- Returns three values: an array giving the list of document ids of each
-- class, the table document id -> class, and the final centroids.
-- Each iteration reports its progress through echo.
doclassify.kmean = function(nclass, documents, maxstep, ids)
    local vectors, maxv, size = doclassify.get_vectors(documents)
    local centroids, previous = {}, {}
    local clusters = {}

    -- initial centroids, compared against zero vectors the first time
    for class = 1, nclass do
        if ids ~= nil then
            centroids[class] = vectors[ids[class]]
        else
            centroids[class] = doclassify.random(size, math.floor(maxv))
        end
        previous[class] = doclassify.zeros(size)
    end

    -- loop until the centroids stop moving or maxstep is reached
    local similarity = doclassify.mean_similarity(centroids, previous)
    local remaining = maxstep
    while 1.0-similarity > 1e-9 and remaining > 0 do
        -- assignment: each document joins its most similar centroid
        clusters = {}
        local members = {}
        for id, vec in pairs(vectors) do
            local best = doclassify.argmax(doclassify.similarities(vec, centroids))
            clusters[id] = best
            local bucket = members[best]
            if not bucket then
                bucket = {}
                members[best] = bucket
            end
            bucket[#bucket + 1] = vec
        end

        -- update: each centroid becomes the mean of its members
        previous = centroids
        local updated = {}
        for class in pairs(centroids) do
            updated[class] = doclassify.mean_vector(members[class] or {}, size)
        end
        centroids = updated

        similarity = doclassify.mean_similarity(centroids, previous)
        echo("step #"..remaining..", similarity "..similarity)
        remaining = remaining - 1
    end

    -- list the documents of each class
    local results = {}
    for class = 1, nclass do
        results[class] = {}
    end
    for id, class in pairs(clusters) do
        local list = results[class]
        if list then
            list[#list + 1] = id
        end
    end
    return results, clusters, centroids
end
|
||||
|
||||
-- Create an array of n zeros.
-- n: length of the array
-- Returns a new array whose n elements are all 0.0.
doclassify.zeros = function(n)
    local out = {}
    for slot = 1, n do
        out[slot] = 0.0
    end
    return out
end
|
||||
|
||||
-- Create an array of n random numbers.
-- n: length of the array
-- maxv: upper bound handed to math.random(0, maxv)
-- Each element is the sum of math.random() and math.random(0, maxv).
doclassify.random = function(n,maxv)
    local out = {}
    for slot = 1, n do
        local fraction = math.random()
        local whole = math.random(0, maxv)
        out[slot] = fraction + whole
    end
    return out
end
|
||||
|
||||
-- Add up the elements of a numeric array.
-- v: array of numbers
-- Returns the total, 0.0 for an empty array.
doclassify.sum = function(v)
    local total = 0.0
    local count = #v
    for idx = 1, count do
        total = total + v[idx]
    end
    return total
end
|
||||
|
||||
-- Arithmetic mean of a numeric array.
-- v: array of numbers
-- Returns the sum of the elements divided by #v.
doclassify.mean = function(v)
    local total = doclassify.sum(v)
    return total/#v
end
|
||||
|
||||
-- Component-wise mean of a list of vectors.
-- vectors: array of vectors; nil or an empty array is accepted
-- size: length of the resulting vector
-- Returns a new vector of `size` elements; all zeros when there is nothing
-- to average.
doclassify.mean_vector = function(vectors, size)
    local acc = doclassify.zeros(size)
    local nothing = (not vectors) or #vectors == 0
    if nothing then
        return acc
    end
    local seen = 0
    for _, vec in pairs(vectors) do
        for slot = 1, #vec do
            acc[slot] = acc[slot] + vec[slot]
        end
        seen = seen + 1
    end
    for slot = 1, size do
        acc[slot] = acc[slot]/seen
    end
    return acc
end
|
||||
|
||||
-- Find the smallest element of a numeric array.
-- v: array of numbers
-- Returns the index and the value of the minimum. When several elements
-- share the minimum, the last one wins. An empty array yields 0, 0.0.
doclassify.argmin = function(v)
    if #v == 0 then return 0, 0.0 end
    -- start from the first element rather than from 0.0, otherwise the
    -- minimum of an all-positive array is never found
    local mini, minv = 1, v[1]
    for i = 2, #v do
        if v[i] <= minv then
            mini = i
            minv = v[i]
        end
    end
    return mini, minv
end
|
||||
|
||||
-- Find the largest element of a numeric array.
-- v: array of numbers
-- Returns the index and the value of the maximum. When several elements
-- share the maximum, the last one wins. An empty array yields 0, 0.0.
doclassify.argmax = function(v)
    if #v == 0 then return 0, 0.0 end
    -- start from the first element rather than from 0.0, otherwise the
    -- maximum of an all-negative array is never found
    local maxi, maxv = 1, v[1]
    for i = 2, #v do
        if v[i] >= maxv then
            maxi = i
            maxv = v[i]
        end
    end
    return maxi, maxv
end
|
||||
|
||||
return doclassify
|
151
Blogger/build/debug/api/ai/stopwords.txt
Normal file
151
Blogger/build/debug/api/ai/stopwords.txt
Normal file
@ -0,0 +1,151 @@
|
||||
i
|
||||
me
|
||||
my
|
||||
myself
|
||||
we
|
||||
our
|
||||
ours
|
||||
ourselves
|
||||
you
|
||||
your
|
||||
yours
|
||||
yourself
|
||||
yourselves
|
||||
he
|
||||
him
|
||||
his
|
||||
himself
|
||||
she
|
||||
her
|
||||
hers
|
||||
herself
|
||||
it
|
||||
its
|
||||
itself
|
||||
they
|
||||
them
|
||||
their
|
||||
theirs
|
||||
themselves
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
am
|
||||
is
|
||||
are
|
||||
was
|
||||
were
|
||||
be
|
||||
been
|
||||
being
|
||||
have
|
||||
has
|
||||
had
|
||||
having
|
||||
do
|
||||
does
|
||||
did
|
||||
doing
|
||||
a
|
||||
an
|
||||
the
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
s
|
||||
t
|
||||
can
|
||||
will
|
||||
just
|
||||
don
|
||||
should
|
||||
now
|
||||
a
|
||||
b
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
j
|
||||
k
|
||||
l
|
||||
m
|
||||
n
|
||||
o
|
||||
p
|
||||
q
|
||||
w
|
||||
r
|
||||
s
|
||||
t
|
||||
x
|
||||
y
|
||||
z
|
50
Blogger/build/debug/api/ai/test.lua
Normal file
50
Blogger/build/debug/api/ai/test.lua
Normal file
@ -0,0 +1,50 @@
|
||||
-- Scratch script for the clustering module. With `refresh` set it rebuilds
-- the top-similarity chart and saves it to `file`; otherwise it reads that
-- file back and echoes it as JSON.
local path = require("fs/vfs").ospath("home://aiws/blog-clustering")
-- assert surfaces the loadfile error message when a module is missing
local gettext = assert(loadfile(path.."/gettext.lua"))()
local cluster = assert(loadfile(path.."/cluster.lua"))()

local refresh = false

local file = "/home/mrsang/test.csv"
if refresh then
    local data = gettext.get({publish=1})
    local documents = {}
    if data then
        local sw = gettext.stopwords("home://aiws/blog-clustering/stopwords.txt")
        for _, post in pairs(data) do
            documents[post.id] = cluster.bow(post.content, sw)
        end
        cluster.tfidf(documents)
        local vectors = cluster.get_vectors(documents)
        local s = cluster.save_topchart(vectors,file, 3)
        if s then echo("file saved") else echo("error save file") end
    else
        echo("Data missing")
    end
else
    local f = io.open(file,"r")
    if f then
        -- each line holds "id other_id score"
        local result = {}
        for line in f:lines() do
            local arr = {}
            for field in line:gmatch("%S+") do
                arr[#arr + 1] = field
            end
            -- skip blank or truncated lines: a nil key cannot index a table
            if arr[1] and arr[2] then
                if not result[arr[1]] then result[arr[1]] = {} end
                result[arr[1]][arr[2]] = tonumber(arr[3])
            end
        end
        f:close()
        echo(JSON.encode(result))
    else
        echo("Unable to open "..file)
    end
end
|
File diff suppressed because one or more lines are too long
@ -6,7 +6,7 @@
|
||||
"author": "Xuan Sang LE",
|
||||
"email": "xsang.le@gmail.com"
|
||||
},
|
||||
"version": "0.2.10-a",
|
||||
"version": "0.2.11-a",
|
||||
"category": "Internet",
|
||||
"iconclass": "fa fa-book",
|
||||
"dependencies": [
|
||||
|
Binary file not shown.
@ -212,7 +212,7 @@ namespace OS {
|
||||
if (emails.length === 0) { return this.notify(__("No email selected")); }
|
||||
// send the email
|
||||
const data = {
|
||||
path: `${this.meta().path}/sendmail.lua`,
|
||||
path: `${this.meta().path}/api/sendmail.lua`,
|
||||
parameters: {
|
||||
to: emails,
|
||||
title: (this.find("mail-title") as HTMLInputElement).value,
|
||||
@ -222,7 +222,7 @@ namespace OS {
|
||||
}
|
||||
};
|
||||
return this._api.apigateway(data, false)
|
||||
.then((d: { error: any; result: { join: (arg0: string) => any; }; }) => {
|
||||
.then((d) => {
|
||||
if (d.error) {
|
||||
const str = d.result.join(',');
|
||||
return this.notify(__("Unable to send mail to: {0}", str)); }
|
||||
|
@ -499,6 +499,36 @@ namespace OS {
|
||||
this.error(__("Error sending mails: {0}", e.toString()), e);
|
||||
}
|
||||
}
|
||||
},
|
||||
"|",
|
||||
{
|
||||
name: __("TFIDF analyse"),
|
||||
className: "fa fa-area-chart",
|
||||
action: async (e: any) => {
|
||||
try {
|
||||
const q = await this.openDialog("PromptDialog",{
|
||||
title: __("TFIDF Analyse"),
|
||||
text: __("Max number of related posts to keep per post?"),
|
||||
value: "5"
|
||||
});
|
||||
const data = {
|
||||
path: `${this.meta().path}/api/ai/analyse.lua`,
|
||||
parameters: {
|
||||
dbpath: this.dbhandle.info.file.path,
|
||||
top: parseInt(q)
|
||||
}
|
||||
};
|
||||
const d = await this._api.apigateway(data, false);
|
||||
if (d.error) {
|
||||
throw new Error(d.error);
|
||||
}
|
||||
this.toast(d.result);
|
||||
}
|
||||
catch(e)
|
||||
{
|
||||
this.error(__("Error analysing posts: {0}", e.toString()), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
});
|
||||
|
@ -6,7 +6,7 @@
|
||||
"author": "Xuan Sang LE",
|
||||
"email": "xsang.le@gmail.com"
|
||||
},
|
||||
"version": "0.2.10-a",
|
||||
"version": "0.2.11-a",
|
||||
"category": "Internet",
|
||||
"iconclass": "fa fa-book",
|
||||
"dependencies": [
|
||||
|
Loading…
Reference in New Issue
Block a user