initial commit, ok inoguchi@

This commit is contained in:
uaa 2022-01-02 20:45:24 +00:00
parent ede447a05a
commit 653add3360
6 changed files with 184 additions and 0 deletions

View File

@ -0,0 +1,25 @@
# $OpenBSD: Makefile,v 1.1 2022/01/02 20:45:24 uaa Exp $
# Port Makefile for the libkkc-data distribution.
COMMENT = language model data for libkkc
VERSION = 0.2.7
DISTNAME = libkkc-data-${VERSION}
CATEGORIES = inputmethods japanese
# GPLv3
PERMIT_PACKAGE = Yes
# The distfile is fetched from the libkkc v0.3.5 release download area on
# GitHub; note that this release tag is not the same as VERSION above.
MASTER_SITES = https://github.com/ueno/libkkc/releases/download/v0.3.5/
# The distfile is an xz-compressed tarball (${DISTNAME}.tar.xz).
EXTRACT_SUFX = .tar.xz
# Python comes in through the lang/python module; the patches shipped with
# this port convert tools/genfilter.py and tools/sortlm.py to Python 3 syntax.
MODULES = lang/python
# Python bindings for libmarisa are a build-time-only dependency
# (presumably used by the tools/*.py scripts -- TODO confirm).
BUILD_DEPENDS = textproc/libmarisa,-python
# libkkc itself is required at run time; the packaged model files are
# installed below lib/libkkc/models/ (see PLIST).
RUN_DEPENDS = inputmethods/libkkc
# The distribution is configured with a GNU-style configure script.
CONFIGURE_STYLE = gnu
# No regression tests are run for this port.
NO_TEST = Yes
.include <bsd.port.mk>

View File

@ -0,0 +1,2 @@
SHA256 (libkkc-data-0.2.7.tar.xz) = nmeHVaAwBD2mjjekBJqilsKWhp/x+55scAJrJUFZW5k=
SIZE (libkkc-data-0.2.7.tar.xz) = 22262552

View File

@ -0,0 +1,54 @@
--- tools/genfilter.py.orig Mon Jul 29 09:53:28 2013
+++ tools/genfilter.py Thu Sep 23 10:02:41 2021
@@ -1,7 +1,7 @@
#!/usr/bin/python
-# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
-# Copyright (C) 2011-2013 Red Hat, Inc.
+# Copyright (C) 2011-2014 Daiki Ueno <ueno@gnu.org>
+# Copyright (C) 2011-2014 Red Hat, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -84,24 +84,24 @@ class FilterGenerator(object):
def generate(self):
size = os.fstat(self.infile.fileno()).st_size
- n = size / self.record_size
+ n = size // self.record_size
m = int(math.ceil(-n*math.log10(ERROR_RATE) /
math.pow(math.log10(2), 2)))
- m = (m/8 + 1)*8
+ m = (m//8 + 1)*8
inmem = mmap.mmap(self.infile.fileno(),
size,
access=mmap.ACCESS_READ)
- outmem = bytearray(m/8)
- for i in xrange(0, n):
+ outmem = bytearray(m//8)
+ for i in range(0, n):
offset = i*self.record_size
b0, b1 = struct.unpack("=LL", inmem[offset:offset+8])
- for k in xrange(0, 4):
+ for k in range(0, 4):
h = murmur_hash3_32(b0, b1, k)
h = int(h * (m / float(0xFFFFFFFF)))
- outmem[h/8] |= (1 << (h%8))
+ outmem[h//8] |= (1 << (h%8))
inmem.close()
- # Convert bytearray to str, for Python 2.6 compatibility.
- self.outfile.write(str(outmem))
+ # Convert bytearray to bytes, for Python 3 compatibility.
+ self.outfile.write(bytes(outmem))
if __name__ == '__main__':
import sys
@@ -110,7 +110,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='filter')
parser.add_argument('infile', type=argparse.FileType('r'),
help='input file')
- parser.add_argument('outfile', type=argparse.FileType('w'),
+ parser.add_argument('outfile', type=argparse.FileType('wb'),
help='output file')
parser.add_argument('record_size', type=int,
help='record size')

View File

@ -0,0 +1,90 @@
--- tools/sortlm.py.orig Mon Jul 29 09:53:28 2013
+++ tools/sortlm.py Thu Sep 23 10:02:37 2021
@@ -1,7 +1,7 @@
#!/usr/bin/python
-# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
-# Copyright (C) 2011-2013 Red Hat, Inc.
+# Copyright (C) 2011-2014 Daiki Ueno <ueno@gnu.org>
+# Copyright (C) 2011-2014 Red Hat, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -40,10 +40,10 @@ class SortedGenerator(object):
self.__min_cost = 0.0
def read(self):
- print "reading N-grams"
+ print("reading N-grams")
self.__read_tries()
self.__read_ngrams()
- print "min cost = %lf" % self.__min_cost
+ print("min cost = %lf" % self.__min_cost)
def __read_tries(self):
while True:
@@ -58,7 +58,7 @@ class SortedGenerator(object):
line = self.__infile.readline()
if line == "":
break
- line = line.strip()
+ line = line.strip('\n')
if line == "":
break
match = self.__ngram_line_regex.match(line)
@@ -89,7 +89,7 @@ class SortedGenerator(object):
line = self.__infile.readline()
if line == "":
break
- line = line.strip()
+ line = line.strip('\n')
if line == "":
break
match = self.__ngram_line_regex.match(line)
@@ -125,14 +125,11 @@ class SortedGenerator(object):
def quantize(cost, min_cost):
return max(0, min(65535, int(cost * 65535 / min_cost)))
- def cmp_header(a, b):
- return cmp(a[0], b[0])
-
- print "writing 1-gram file"
+ print("writing 1-gram file")
unigram_offsets = {}
unigram_file = open("%s.1gram" % self.__output_prefix, "wb")
offset = 0
- for ids, value in sorted(self.__ngram_entries[0].iteritems()):
+ for ids, value in sorted(self.__ngram_entries[0].items()):
unigram_offsets[ids[0]] = offset
s = struct.pack("=HHH",
quantize(value[0], self.__min_cost),
@@ -143,13 +140,13 @@ class SortedGenerator(object):
offset += 1
unigram_file.close()
- print "writing 2-gram file"
+ print("writing 2-gram file")
bigram_offsets = {}
bigram_file = open("%s.2gram" % self.__output_prefix, "wb")
keys = self.__ngram_entries[1].keys()
items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids) for ids in keys]
offset = 0
- for header, ids in sorted(items, cmp=cmp_header):
+ for header, ids in sorted(items, key=lambda x: x[0]):
value = self.__ngram_entries[1][ids]
bigram_offsets[ids] = offset
s = struct.pack("=HH",
@@ -160,11 +157,11 @@ class SortedGenerator(object):
bigram_file.close()
if len(self.__ngram_entries[2]) > 0:
- print "writing 3-gram file"
+ print("writing 3-gram file")
trigram_file = open("%s.3gram" % self.__output_prefix, "wb")
keys = self.__ngram_entries[2].keys()
items = [(struct.pack("=LL", ids[2], bigram_offsets[(ids[0], ids[1])]), ids) for ids in keys]
- for header, ids in sorted(items, cmp=cmp_header):
+ for header, ids in sorted(items, key=lambda x: x[0]):
value = self.__ngram_entries[2][ids]
s = struct.pack("=H",
quantize(value[0], self.__min_cost))

View File

@ -0,0 +1 @@
language model data for libkkc

View File

@ -0,0 +1,12 @@
@comment $OpenBSD: PLIST,v 1.1 2022/01/02 20:45:24 uaa Exp $
lib/libkkc/
lib/libkkc/models/
lib/libkkc/models/sorted3/
lib/libkkc/models/sorted3/data.1gram
lib/libkkc/models/sorted3/data.1gram.index
lib/libkkc/models/sorted3/data.2gram
lib/libkkc/models/sorted3/data.2gram.filter
lib/libkkc/models/sorted3/data.3gram
lib/libkkc/models/sorted3/data.3gram.filter
lib/libkkc/models/sorted3/data.input
lib/libkkc/models/sorted3/metadata.json