interpret/kjv-code/src/kjv.re_c.c

370 lines
9.7 KiB
C

/** @license 2022 Neil Edelman, distributed under the terms of the
[MIT License](https://opensource.org/licenses/MIT).
Intended for use with the text files in
<https://github.com/scrollmapper/bible_databases/tree/master/txt/KJV>.
@std C11, POSIX */
#include <stdlib.h>
#include <stdio.h>
#include <string.h> /* strchr */
#include <limits.h> /* UINT_MAX */
#include <assert.h>
#include <errno.h>
#include <dirent.h> /* opendir readdir closedir */
#include <unistd.h> /* chdir (POSIX) (because I'm lazy) */
/* Dynamic contiguous string that is used to load files. The two macros
configure the included header; it is presumably what supplies
`struct char_array` and the `char_array_*` functions used further down (the
header itself is not part of this file). */
#define ARRAY_NAME char
#define ARRAY_TYPE char
#include "../src/array.h"
/** Append a text file, `fn`, to `text`, and add a '\0'. On failure the size of
 `text` is put back to what it was on entry, so a half-read file is never left
 in the array.
 @param[text] Array that receives the file contents.
 @param[fn] Name of the file, opened in text mode for reading.
 @return The start of the appended file or null on error. A partial read is a
 failure. The callers in this file treat the pointer as unstable and convert
 it to an offset before appending anything else.
 @throws[fopen, fread, malloc]
 @throws[EILSEQ] The text file has embedded nulls.
 @throws[ERANGE] If the standard library does not follow POSIX. */
static char *append_file(struct char_array *text, const char *const fn) {
	FILE *fp = 0;
	const size_t granularity = 1024;
	size_t nread, start;
	char *cursor;
	int success = 1;
	assert(text && fn);
	start = text->size;
	if(!(fp = fopen(fn, "r"))) goto catch;
	/* Read entire file in chunks; a short chunk means the end was reached. */
	do if(!(cursor = char_array_buffer(text, granularity))
		|| (nread = fread(cursor, 1, granularity, fp), ferror(fp))
		|| !char_array_append(text, nread)) goto catch;
	while(nread == granularity);
	/* File to `C` string. */
	if(!(cursor = char_array_new(text))) goto catch;
	*cursor = '\0';
	/* Binary files with embedded '\0' are not allowed; check just this read:
	 the first null after `start` has to be the terminator just written. */
	if(strchr(text->data + start, '\0') != cursor)
		{ errno = EILSEQ; goto catch; }
	goto finally;
catch:
	/* A failure that did not set `errno`; will never be true on POSIX. */
	if(!errno) errno = ERANGE;
	/* Discard whatever part of the file made it into the array. */
	text->size = start;
	success = 0;
finally:
	if(fp) {
		/* Closing must not replace the error that is being reported. */
		const int saved = errno;
		fclose(fp);
		errno = saved;
	}
	return success ? text->data + start : 0;
}
/** Helper to parse unsigned decimal; [`s`,`e`) => `n`. An empty range parses
 as zero.
 @param[s, e] Half-open range of characters to convert.
 @param[n] Receives the value on success; left untouched on failure.
 @return Success.
 @throws[EILSEQ] A character in the range is not a decimal digit.
 @throws[ERANGE] The value does not fit in `unsigned`. */
static int parse_natural(const char *s, const char *const e, unsigned *const n) {
	unsigned accum = 0;
	for( ; s < e; s++) {
		unsigned digit;
		if(*s < '0' || *s > '9') return errno = EILSEQ, 0;
		digit = (unsigned)(*s - '0');
		/* Test before multiplying. A wrapped `accum * 10 + digit` is not
		 always smaller than `accum`, so comparing afterwards lets some
		 overflows through (and rejects a lone "0"). */
		if(accum > (UINT_MAX - digit) / 10) return errno = ERANGE, 0;
		accum = accum * 10 + digit;
	}
	*n = accum;
	return 1;
}
/* Enumerate books. A single list is expanded twice, once into the
 `kjv_book` enumeration and once into the parallel table of names, so the two
 always agree. The last entry, `KJV_BOOK_SIZE`, is the count of books; it also
 gets a slot in the name table. */
#define BOOK_LIST(X) \
	X(Genesis) X(Exodus) X(Leviticus) X(Numbers) X(Deuteronomy) \
	X(Joshua) X(Judges) X(Ruth) \
	X(ISamuel) X(IISamuel) X(IKings) X(IIKings) \
	X(IChronicles) X(IIChronicles) \
	X(Ezra) X(Nehemiah) X(Esther) X(Job) X(Psalms) X(Proverbs) \
	X(Ecclesiastes) X(Song_of_Solomon) \
	X(Isaiah) X(Jeremiah) X(Lamentations) X(Ezekiel) X(Daniel) \
	X(Hosea) X(Joel) X(Amos) X(Obadiah) X(Jonah) X(Micah) X(Nahum) \
	X(Habakkuk) X(Zephaniah) X(Haggai) X(Zechariah) X(Malachi) \
	\
	X(Matthew) X(Mark) X(Luke) X(John) X(Acts) X(Romans) \
	X(ICorinthians) X(IICorinthians) \
	X(Galatians) X(Ephesians) X(Philippians) X(Colossians) \
	X(IThessalonians) X(IIThessalonians) X(ITimothy) X(IITimothy) \
	X(Titus) X(Philemon) X(Hebrews) X(James) \
	X(IPeter) X(IIPeter) X(IJohn) X(IIJohn) X(IIIJohn) \
	X(Jude) X(Revelation) \
	X(KJV_BOOK_SIZE)
/* Each entry becomes an enumerator. */
#define BOOK_AS_SYMBOL(book) book,
enum kjv_book { BOOK_LIST(BOOK_AS_SYMBOL) };
#undef BOOK_AS_SYMBOL
/* Each entry becomes its own spelling as a string. */
#define BOOK_AS_NAME(book) #book,
static const char *kjv_book_string[] = { BOOK_LIST(BOOK_AS_NAME) };
#undef BOOK_AS_NAME
#undef BOOK_LIST
/* Parse filename of books. This works with
<https://github.com/scrollmapper/bible_databases/tree/master/txt/KJV>.
The block that follows is lexer-generator input rather than C: it sets options
for the generated scanners and names three patterns, `natural`, `whitespace`,
and `word`, that the rules in `kjv_filename` and `lex_next_verse` refer to. */
/*!re2c /**/
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = char;
natural = [1-9][0-9]*;
whitespace = [ \t\v\f];
word = [^ \t\v\f\n\x00]+;
*/
/** Extracts the book number that starts a file name. `fn` has to be of the
form "<number>[*].txt": a `natural` (so no leading zero), then any run of
characters other than '.', then ".txt" directly before the terminating null.
@param[fn] Null-terminated file name.
@param[book_no] Receives the number when the name matches and it parses.
@return The result of `parse_natural` on the digits if `fn` matches, otherwise
false. */
static int kjv_filename(const char *fn, unsigned *const book_no) {
/* `s0` and `s1` are bound by the `@s0` / `@s1` tags in the rule below; the
other pointers are not referenced by the hand-written code and are presumably
working storage for the generated scanner. */
const char *YYCURSOR = fn, *YYMARKER, *yyt1, *yyt2, *s0, *s1;
assert(fn && book_no);
/*!re2c /**/
*
{ return 0; }
@s0 natural @s1 [^.\x00]* ".txt" "\x00"
{ return parse_natural(s0, s1, book_no); }
*/
}
/* Parse book contents. */
/** Scanner position within one book, together with the facts about the verse
 that was read last. */
struct lex {
	size_t line; /* Line counter; `lex` starts it at one. */
	const char *cursor; /* Where scanning resumes. */
	int error; /* Non-zero after `lex_next_verse` has failed. */
	unsigned chapter;
	unsigned verse;
	unsigned words;
};
/** @return A scanner placed at the start of `cursor`, which must not be null:
 on line one, without error, and with chapter, verse, and word count all
 zero. */
static struct lex lex(const char *cursor) {
	assert(cursor);
	/* Members that are not named are zero-initialised. */
	return (struct lex){ .line = 1, .cursor = cursor };
}
/*!conditions:re2c*/
/** Scans forward from `lex->cursor` to the end of the next verse. A verse is
"[<chapter>:<verse>]" followed by whitespace-separated words and a newline;
lines that contain no square brackets at all are skipped. Every call begins in
the `line` condition.
@param[lex] Scanner state. `cursor` is advanced and `line` is incremented for
each newline consumed; when a verse is read, `chapter` and `verse` hold its
numbers and `words` the count of its words.
@return True if a verse was read. False at the terminating null, with `error`
left at zero, or on unexpected input, with `error` set.
@throws[EILSEQ] Input that no rule accepts, or a chapter or verse number that
`parse_natural` rejects. As the rules are written, text that ends inside a
verse without a final newline also lands here. */
static int lex_next_verse(struct lex *const lex) {
const char *YYMARKER, *yyt1 = 0, *yyt2 = 0, *s0, *s1, *t0, *t1;
enum YYCONDTYPE condition = yycline;
/*!re2c /**/
re2c:define:YYCURSOR = lex->cursor;
re2c:define:YYGETCONDITION = "condition";
re2c:define:YYSETCONDITION = "condition = @@;";
re2c:define:YYGETCONDITION:naked = 1;
re2c:define:YYSETCONDITION:naked = 1; */
assert(lex && lex->cursor);
/* A stale error from an earlier call must not be mistaken for a new one. */
lex->error = 0;
/* Rules that only consume input jump back here to keep scanning. */
scan:
/*!re2c /**/
<*> * { return errno = EILSEQ, lex->error = 1, 0; }
<line> [^[\]\n\x00]* "\n" { lex->line++; goto scan; }
<line> "\x00" { return 0; }
<line> "[" @s0 natural @s1 ":" @t0 natural @t1 "]" => verse {
if(!parse_natural(s0, s1, &lex->chapter)
|| !parse_natural(t0, t1, &lex->verse))
return errno = EILSEQ, lex->error = 1, 0;
lex->words = 0;
/*printf("%u:%u", lex->chapter, lex->verse);*/
goto scan;
}
<verse> whitespace+ { goto scan; }
<verse> @s0 word @s1 { lex->words++; goto scan; }
<verse> "\n" { /*printf(" -> %u\n", lex->words);*/ lex->line++; return 1; }
*/
}
/* Reversible hash map to store data on bible. */
#include <stdint.h>
/** Integer hash of `x`: three xor-shift steps with two multiplications in
 between. Taken from <https://nullprogram.com/blog/2018/07/31/> and
 <https://github.com/skeeto/hash-prospector>.
 @return The mixed value. */
static uint32_t lowbias32(uint32_t x) {
	const uint32_t first = 0x7feb352dU, second = 0x846ca68bU;
	x = (x ^ (x >> 16)) * first;
	x = (x ^ (x >> 15)) * second;
	return x ^ (x >> 16);
}
/* Inverts `x`: the counterpart of `lowbias32`, using the same xor-shift and
 multiply structure with different constants. */
static uint32_t lowbias32_r(uint32_t x) {
	const uint32_t first = 0x43021123U, second = 0x1d69e2a5U;
	x = (x ^ (x >> 16)) * first;
	x = (x ^ (x >> 15) ^ (x >> 30)) * second;
	return x ^ (x >> 16);
}
/** Two hash-tables use the same structure: a book, chapter, and verse packed
 into bit-fields, overlaid with `u32` so that the citation can be handed to
 the integer hash as one number. */
union kjvcite {
	/* Overkill, but no initializing unused bits, 12 + 13 + 7 = 32. */
	struct {
		unsigned verse : 12;
		unsigned chapter : 13;
		unsigned book : 7;
	};
	uint32_t u32;
};
/** @return The hash of citation `x`: `lowbias32` applied to its `u32` view. */
static uint32_t kjv_hash(const union kjvcite x) {
	return lowbias32(x.u32);
}
static union kjvcite kjv_unhash(const uint32_t x) {
union kjvcite k;
k.u32 = lowbias32_r(x);
return k;
}
/** Writes a short label for `x` into `a`: at most four characters of the book
 name, then "chapter:verse". Each number is reduced modulo 1000 so that the
 result, with its terminator, always fits in the twelve bytes of `a`.
 Chapter and verse are printed as stored; `main` stores them exactly as they
 appear in the text, so no offset is applied.
 @param[x] Citation to describe; a `book` beyond the name table prints as "?".
 @param[a] Receives the null-terminated label. */
static void kjv_to_string(const union kjvcite x, char (*const a)[12]) {
	const size_t names = sizeof kjv_book_string / sizeof *kjv_book_string;
	/* `book` is seven bits wide, which is more than the table holds. */
	const char *const book = x.book < names ? kjv_book_string[x.book] : "?";
	snprintf(*a, sizeof *a, "%.4s%u:%u", book,
		(unsigned)x.chapter % 1000, (unsigned)x.verse % 1000);
}
/** Derived information on verse word count. The three functions forward to
 the shared `kjv_*` implementations under the `verse` prefix. */
static uint32_t verse_hash(const union kjvcite x) {
	return kjv_hash(x);
}
static union kjvcite verse_unhash(const uint32_t x) {
	return kjv_unhash(x);
}
static void verse_to_string(const union kjvcite x, char (*const a)[12]) {
	kjv_to_string(x, a);
}
/* Configuration for the included header; it is presumably what provides
 `struct verse_table` and the `verse_table_*` functions that `main` uses. */
#define TABLE_NAME verse
#define TABLE_KEY union kjvcite
#define TABLE_UINT uint32_t
#define TABLE_VALUE unsigned
#define TABLE_INVERSE
#define TABLE_DEFAULT 0
#define TABLE_TO_STRING
#include "../src/table.h"
/* A set of verses. The three functions forward to the shared `kjv_*`
 implementations under the `kjvset` prefix. */
static uint32_t kjvset_hash(const union kjvcite x) {
	return kjv_hash(x);
}
static union kjvcite kjvset_unhash(const uint32_t x) {
	return kjv_unhash(x);
}
static void kjvset_to_string(const union kjvcite x, char (*const a)[12]) {
	kjv_to_string(x, a);
}
/* Configuration for the included header; no value type is given here. */
#define TABLE_NAME kjvset
#define TABLE_KEY union kjvcite
#define TABLE_UINT uint32_t
#define TABLE_INVERSE
#define TABLE_TO_STRING
#include "../src/table.h"
int main(void) {
const char *const dir_kjv = "KJV";
struct {
struct char_array backing;
struct verse_table verses;
size_t words;
} kjv = { 0 };
DIR *dir = 0;
struct dirent *de = 0;
struct { size_t offset; int is; } build[KJV_BOOK_SIZE] = { 0 };
enum kjv_book b = 0;
int success = EXIT_SUCCESS, attempted_closedir = 0;
errno = 0;
/* For all files in directory KJV with <#>*.txt, read into backing. */
if(chdir(dir_kjv) == -1 || !(dir = opendir("."))) goto catch;
while((de = readdir(dir))) {
unsigned ordinal;
char *unstable_book;
if(!kjv_filename(de->d_name, &ordinal)) continue; /* Extract no. */
/*fprintf(stderr, "<%s> ordinal: %u\n", de->d_name, ordinal);*/
if(ordinal < 1 || ordinal > KJV_BOOK_SIZE)
{ errno = ERANGE; goto catch; } /* Not in range. */
if(build[b = ordinal - 1].is) /* Convert to zero-based. */
{ errno = EDOM; goto catch; } /* Is duplicate. */
if(!(unstable_book = append_file(&kjv.backing, de->d_name))) goto catch;
build[b].is = 1;
build[b].offset = (size_t)(unstable_book - kjv.backing.data);
}
if(attempted_closedir = 1, closedir(dir) == -1) goto catch; dir = 0;
/* Now backing is stable; count all the words for each verse. */
for(b = 0; b < KJV_BOOK_SIZE; b++) {
struct lex x;
if(!build[b].is) { fprintf(stderr, "Missing book [%u]%s.\n",
b + 1, kjv_book_string[b]); errno = EDOM; goto catch; }
x = lex(kjv.backing.data + build[b].offset);
while(lex_next_verse(&x)) {
const union kjvcite cite
= { .book = b, .chapter = x.chapter, .verse = x.verse };
unsigned *words;
switch(verse_table_assign(&kjv.verses, cite, &words)) {
case TABLE_PRESENT: fprintf(stderr, "[%u]%s %u:%u duplicated.\n",
b + 1, kjv_book_string[b], x.chapter, x.verse); errno = EDOM;
case TABLE_ERROR: goto catch;
case TABLE_ABSENT: break;
}
*words = x.words, kjv.words += x.words;
}
if(x.error) { fprintf(stderr, "[%u]%s on line %zu\n",
b + 1, kjv_book_string[b], x.line); goto catch; }
}
printf("words: %s\n", verse_table_to_string(&kjv.verses));
printf("kjv: %zu total words\n", kjv.words);
{
union kjvcite c;
struct verse_table_iterator it = verse_table_begin(&kjv.verses);
unsigned *w;
while(verse_table_next(&it, &c, &w))
printf("%s %u:%u -> %u\n",
kjv_book_string[c.book], c.chapter, c.verse, *w);
c = (union kjvcite){ .book = Genesis, .chapter = 1, .verse = 1 };
printf("1:1:1 -> %u\n", verse_table_get(&kjv.verses, c));
}
goto finally;
catch:
success = EXIT_FAILURE;
if(de) fprintf(stderr, "While reading %s.\n", de->d_name);
perror(de ? de->d_name : dir_kjv);
if(dir && !attempted_closedir && closedir(dir) == -1) perror(dir_kjv);
finally:
verse_table_(&kjv.verses);
char_array_(&kjv.backing);
return success;
}