initial commit, ok inoguchi@

2022-01-02 20:45:24 +00:00 · 2022-01-02 20:45:24 +00:00 · 653add3360
commit 653add3360
parent ede447a05a
6 changed files with 184 additions and 0 deletions
--- a/inputmethods/libkkc-data/Makefile
+++ b/inputmethods/libkkc-data/Makefile
@@ -0,0 +1,25 @@
+# $OpenBSD: Makefile,v 1.1 2022/01/02 20:45:24 uaa Exp $
+
+COMMENT =	language model data for libkkc
+
+VERSION =	0.2.7
+DISTNAME =	libkkc-data-${VERSION}
+
+CATEGORIES =	inputmethods japanese
+
+# GPLv3
+PERMIT_PACKAGE =	Yes
+
+MASTER_SITES =		https://github.com/ueno/libkkc/releases/download/v0.3.5/
+EXTRACT_SUFX =		.tar.xz
+
+MODULES =		lang/python
+
+BUILD_DEPENDS =		textproc/libmarisa,-python
+RUN_DEPENDS =		inputmethods/libkkc
+
+CONFIGURE_STYLE =	gnu
+
+NO_TEST =		Yes
+
+.include <bsd.port.mk>
--- a/inputmethods/libkkc-data/distinfo
+++ b/inputmethods/libkkc-data/distinfo
@@ -0,0 +1,2 @@
+SHA256 (libkkc-data-0.2.7.tar.xz) = nmeHVaAwBD2mjjekBJqilsKWhp/x+55scAJrJUFZW5k=
+SIZE (libkkc-data-0.2.7.tar.xz) = 22262552
--- a/inputmethods/libkkc-data/patches/patch-genfilter_py
+++ b/inputmethods/libkkc-data/patches/patch-genfilter_py
@@ -0,0 +1,54 @@
+--- tools/genfilter.py.orig	Mon Jul 29 09:53:28 2013
+++ tools/genfilter.py	Thu Sep 23 10:02:41 2021
+@@ -1,7 +1,7 @@
+ #!/usr/bin/python
+ 
+-# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
+-# Copyright (C) 2011-2013 Red Hat, Inc.
++# Copyright (C) 2011-2014 Daiki Ueno <ueno@gnu.org>
++# Copyright (C) 2011-2014 Red Hat, Inc.
+ 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+@@ -84,24 +84,24 @@ class FilterGenerator(object):
+ 
+     def generate(self):
+         size = os.fstat(self.infile.fileno()).st_size
+-        n = size / self.record_size
++        n = size // self.record_size
+         m = int(math.ceil(-n*math.log10(ERROR_RATE) /
+                           math.pow(math.log10(2), 2)))
+-        m = (m/8 + 1)*8
++        m = (m//8 + 1)*8
+         inmem = mmap.mmap(self.infile.fileno(),
+                           size,
+                           access=mmap.ACCESS_READ)
+-        outmem = bytearray(m/8)
+-        for i in xrange(0, n):
++        outmem = bytearray(m//8)
++        for i in range(0, n):
+             offset = i*self.record_size
+             b0, b1 = struct.unpack("=LL", inmem[offset:offset+8])
+-            for k in xrange(0, 4):
++            for k in range(0, 4):
+                 h = murmur_hash3_32(b0, b1, k)
+                 h = int(h * (m / float(0xFFFFFFFF)))
+-                outmem[h/8] |= (1 << (h%8))
++                outmem[h//8] |= (1 << (h%8))
+         inmem.close()
+-        # Convert bytearray to str, for Python 2.6 compatibility.
+-        self.outfile.write(str(outmem))
++        # Convert bytearray to bytes, for Python 3 compatibility.
++        self.outfile.write(bytes(outmem))
+ 
+ if __name__ == '__main__':
+     import sys
+@@ -110,7 +110,7 @@ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(description='filter')
+     parser.add_argument('infile', type=argparse.FileType('r'),
+                         help='input file')
+-    parser.add_argument('outfile', type=argparse.FileType('w'),
++    parser.add_argument('outfile', type=argparse.FileType('wb'),
+                         help='output file')
+     parser.add_argument('record_size', type=int,
+                         help='record size')
--- a/inputmethods/libkkc-data/patches/patch-sortlm_py
+++ b/inputmethods/libkkc-data/patches/patch-sortlm_py
@@ -0,0 +1,90 @@
+--- tools/sortlm.py.orig	Mon Jul 29 09:53:28 2013
+++ tools/sortlm.py	Thu Sep 23 10:02:37 2021
+@@ -1,7 +1,7 @@
+ #!/usr/bin/python
+ 
+-# Copyright (C) 2011-2013 Daiki Ueno <ueno@gnu.org>
+-# Copyright (C) 2011-2013 Red Hat, Inc.
++# Copyright (C) 2011-2014 Daiki Ueno <ueno@gnu.org>
++# Copyright (C) 2011-2014 Red Hat, Inc.
+ 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+@@ -40,10 +40,10 @@ class SortedGenerator(object):
+         self.__min_cost = 0.0
+ 
+     def read(self):
+-        print "reading N-grams"
++        print("reading N-grams")
+         self.__read_tries()
+         self.__read_ngrams()
+-        print "min cost = %lf" % self.__min_cost
++        print("min cost = %lf" % self.__min_cost)
+ 
+     def __read_tries(self):
+         while True:
+@@ -58,7 +58,7 @@ class SortedGenerator(object):
+             line = self.__infile.readline()
+             if line == "":
+                 break
+-            line = line.strip()
++            line = line.strip('\n')
+             if line == "":
+                 break
+             match = self.__ngram_line_regex.match(line)
+@@ -89,7 +89,7 @@ class SortedGenerator(object):
+                 line = self.__infile.readline()
+                 if line == "":
+                     break
+-                line = line.strip()
++                line = line.strip('\n')
+                 if line == "":
+                     break
+                 match = self.__ngram_line_regex.match(line)
+@@ -125,14 +125,11 @@ class SortedGenerator(object):
+         def quantize(cost, min_cost):
+             return max(0, min(65535, int(cost * 65535 / min_cost)))
+ 
+-        def cmp_header(a, b):
+-            return cmp(a[0], b[0])
+-
+-        print "writing 1-gram file"
++        print("writing 1-gram file")
+         unigram_offsets = {}
+         unigram_file = open("%s.1gram" % self.__output_prefix, "wb")
+         offset = 0
+-        for ids, value in sorted(self.__ngram_entries[0].iteritems()):
++        for ids, value in sorted(self.__ngram_entries[0].items()):
+             unigram_offsets[ids[0]] = offset
+             s = struct.pack("=HHH",
+                             quantize(value[0], self.__min_cost),
+@@ -143,13 +140,13 @@ class SortedGenerator(object):
+             offset += 1
+         unigram_file.close()
+ 
+-        print "writing 2-gram file"
++        print("writing 2-gram file")
+         bigram_offsets = {}
+         bigram_file = open("%s.2gram" % self.__output_prefix, "wb")
+         keys = self.__ngram_entries[1].keys()
+         items = [(struct.pack("=LL", ids[1], unigram_offsets[ids[0]]), ids) for ids in keys]
+         offset = 0
+-        for header, ids in sorted(items, cmp=cmp_header):
++        for header, ids in sorted(items, key=lambda x: x[0]):
+             value = self.__ngram_entries[1][ids]
+             bigram_offsets[ids] = offset
+             s = struct.pack("=HH",
+@@ -160,11 +157,11 @@ class SortedGenerator(object):
+         bigram_file.close()
+ 
+         if len(self.__ngram_entries[2]) > 0:
+-            print "writing 3-gram file"
++            print("writing 3-gram file")
+             trigram_file = open("%s.3gram" % self.__output_prefix, "wb")
+             keys = self.__ngram_entries[2].keys()
+             items = [(struct.pack("=LL", ids[2], bigram_offsets[(ids[0], ids[1])]), ids) for ids in keys]
+-            for header, ids in sorted(items, cmp=cmp_header):
++            for header, ids in sorted(items, key=lambda x: x[0]):
+                 value = self.__ngram_entries[2][ids]
+                 s = struct.pack("=H",
+                                 quantize(value[0], self.__min_cost))
--- a/inputmethods/libkkc-data/pkg/DESCR
+++ b/inputmethods/libkkc-data/pkg/DESCR
@@ -0,0 +1 @@
+language model data for libkkc
--- a/inputmethods/libkkc-data/pkg/PLIST
+++ b/inputmethods/libkkc-data/pkg/PLIST
@@ -0,0 +1,12 @@
+@comment $OpenBSD: PLIST,v 1.1 2022/01/02 20:45:24 uaa Exp $
+lib/libkkc/
+lib/libkkc/models/
+lib/libkkc/models/sorted3/
+lib/libkkc/models/sorted3/data.1gram
+lib/libkkc/models/sorted3/data.1gram.index
+lib/libkkc/models/sorted3/data.2gram
+lib/libkkc/models/sorted3/data.2gram.filter
+lib/libkkc/models/sorted3/data.3gram
+lib/libkkc/models/sorted3/data.3gram.filter
+lib/libkkc/models/sorted3/data.input
+lib/libkkc/models/sorted3/metadata.json