interpret/kjv/src/kjv.re_c.c

/** @license 2022 Neil Edelman, distributed under the terms of the
 [MIT License](https://opensource.org/licenses/MIT).
 Intended to be used with the text files from
 <https://github.com/scrollmapper/bible_databases/tree/master/txt/KJV>.
 @std C11, POSIX */

#include <stdlib.h>
#include <stdio.h>
#include <string.h> /* strchr */
#include <limits.h> /* UINT_MAX */
#include <assert.h>
#include <errno.h>
#include <dirent.h> /* opendir readdir closedir */
#include <unistd.h> /* chdir (POSIX) (because I'm lazy) */


/* Dynamic contiguous string that is used to load files. The two parameters
 name the container `char` and make its element type `char`; this is
 presumably what provides `struct char_array` and the `char_array_*` functions
 used in this file — confirm against array.h. */

#define ARRAY_NAME char
#define ARRAY_TYPE char
#include "../src/array.h"

/** Reads the whole of the text file named `fn` onto the end of `c`, then adds
 a terminating '\0'.
 @return Non-zero on success. Zero on any failure, including a read error
 part-way through the file; `errno` is then always non-zero.
 @throws[fopen, fread, malloc]
 @throws[EILSEQ] The text file has embedded nulls. This is also the fall-back
 code when a failing call did not set `errno` itself. */
static int append_file(struct char_array *c, const char *const fn) {
	const size_t chunk = 1024;
	FILE *in = 0;
	char *at;
	size_t got;
	int ok = 0;
	assert(c && fn);
	in = fopen(fn, "r");
	if(!in) goto catch;
	/* Pull the file in, one `chunk` at a time; a short read ends it. */
	for( ; ; ) {
		at = char_array_buffer(c, chunk);
		if(!at) goto catch;
		got = fread(at, 1, chunk, in);
		if(ferror(in)) goto catch;
		if(!char_array_append(c, got)) goto catch;
		if(got != chunk) break;
	}
	/* Terminate the data so it can be used as a `C` string. */
	at = char_array_new(c);
	if(!at) goto catch;
	*at = '\0';
	/* The first '\0' found from the start of the data must be the one just
	 written; anything earlier means the file was binary. */
	if(strchr(c->data, '\0') != at) { errno = EILSEQ; goto catch; }
	ok = 1;
	goto finally;
catch:
	/* Guarantee a non-zero `errno` on failure; on POSIX it is already set. */
	if(!errno) errno = EILSEQ;
finally:
	if(in) fclose(in);
	return ok;
}


/** Helper to parse an unsigned decimal; [`s`, `e`) => `n`. An empty range
 yields zero.
 @param[s, e] Half-open range of characters, all of which must be decimal
 digits.
 @param[n] Receives the value on success; left untouched on failure.
 @return Success.
 @throws[EILSEQ] A character in the range is not a decimal digit.
 @throws[ERANGE] The value does not fit in an `unsigned`. */
static int parse_natural(const char *s, const char *const e, unsigned *const n) {
	unsigned accum = 0;
	for( ; s < e; s++) {
		/* Anything below '0' wraps to a large value, so one test suffices. */
		const unsigned digit = (unsigned)(*s - '0');
		if(digit > 9) return errno = EILSEQ, 0;
		/* Check before computing `accum * 10 + digit`: comparing the result
		 with `accum` afterwards is not enough, because a wrapped-around
		 product can still be the larger of the two. */
		if(accum > (UINT_MAX - digit) / 10) return errno = ERANGE, 0;
		accum = accum * 10 + digit;
	}
	*n = accum;
	return 1;
}


/* The books in canonical order, followed by the `KJV_BOOK_SIZE` sentinel,
 which therefore equals the number of books. The list is expanded twice: once
 as the constants of `enum kjv_book`, and once, stringised, as the parallel
 table of names `kjv_book_string` (which thus also has an entry for the
 sentinel.) */

#define BOOKS \
	X(Genesis) X(Exodus) X(Leviticus) X(Numbers) X(Deuteronomy) \
	X(Joshua) X(Judges) X(Ruth) \
	X(ISamuel) X(IISamuel) X(IKings) X(IIKings) \
	X(IChronicles) X(IIChronicles) \
	X(Ezra) X(Nehemiah) X(Esther) X(Job) X(Psalms) X(Proverbs) \
	X(Ecclesiastes) X(Song_of_Solomon) \
	X(Isaiah) X(Jeremiah) X(Lamentations) X(Ezekiel) X(Daniel) \
	X(Hosea) X(Joel) X(Amos) X(Obadiah) X(Jonah) X(Micah) X(Nahum) \
	X(Habakkuk) X(Zephaniah) X(Haggai) X(Zechariah) X(Malachi) \
	\
	X(Matthew) X(Mark) X(Luke) X(John) X(Acts) X(Romans) \
	X(ICorinthians) X(IICorinthians) X(Galatians) X(Ephesians) \
	X(Philippians) X(Colossians) X(IThessalonians) X(IIThessalonians) \
	X(ITimothy) X(IITimothy) X(Titus) X(Philemon) X(Hebrews) X(James) \
	X(IPeter) X(IIPeter) X(IJohn) X(IIJohn) X(IIIJohn) X(Jude) \
	X(Revelation) \
	X(KJV_BOOK_SIZE)

/* Each expansion supplies its own separating comma. */
#define X(book) book,
enum kjv_book { BOOKS };
#undef X
#define X(book) #book,
static const char *kjv_book_string[] = { BOOKS };
#undef X
#undef BOOKS


/* Parse filename of books. This works with
 <https://github.com/scrollmapper/bible_databases/tree/master/txt/KJV> */

/*!re2c /**/
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = char;
natural = [1-9][0-9]*;
whitespace = [ \t\v\f];
word = [^ \t\v\f\n\x00]+;
*/

/** Matches the file name of a book and extracts its leading number. The whole
 of `fn` must be a natural number with no leading zero, then any run of
 characters other than '.', then ".txt"; that is, "<number>[*].txt".
 @param[fn] Null-terminated file name, scanned from its first character.
 @param[book_no] On success, receives the number as parsed by
 `parse_natural`.
 @return True if `fn` has this form and the number was parsed. False if the
 name does not match, or if `parse_natural` failed, in which case `errno` is
 whatever it set. */
static int kjv_filename(const char *fn, unsigned *const book_no) {
	/* The scanner generated from the rules below works through `YYCURSOR`;
	 `s0` and `s1` are the tags delimiting the digits of the number.
	 NOTE(review): `YYMARKER`, `yyt1` and `yyt2` are presumably scratch
	 variables required by the generated code — confirm with re2c output. */
	const char *YYCURSOR = fn, *YYMARKER, *yyt1, *yyt2, *s0, *s1;
	assert(fn && book_no);
	/*!re2c /**/
	*
		{ return 0; }
	@s0 natural @s1 [^.\x00]* ".txt" "\x00"
		{ return parse_natural(s0, s1, book_no); }
	*/
}


/* Parse book contents. */

/** State of the scanner as it works through the text of one book. */
struct lex {
	/* Line number; `lex` starts it at one. */
	size_t line;
	/* Where scanning resumes. */
	const char *cursor;
	/* Non-zero if the most recent scan stopped on unexpected input. */
	int error;
	/* Chapter and verse numbers of the most recent verse header. */
	unsigned chapter;
	unsigned verse;
	/* Words counted so far in that verse. */
	unsigned words;
};

static struct lex lex(const char *cursor) {
	struct lex lex;
	assert(cursor);
	lex.line = 1;
	lex.cursor = cursor;
	lex.error = 0;
	lex.chapter = lex.verse = lex.words = 0;
	return lex;
}

/*!conditions:re2c*/

/** Scans `lex` forward to the end of the next verse. Lines that contain no
 square brackets are skipped, counting them. A "[<chapter>:<verse>]" header
 starts a verse and resets the word count; after it, runs of blanks are
 skipped and every other run of characters counts as one word, until the
 new-line that ends the verse.
 @param[lex] Scanner with a non-null cursor; its `error` flag is cleared on
 entry.
 @return True if a verse was read, with `chapter`, `verse` and `words` set.
 False at the end of the text or on unexpected input; `lex->error` is
 non-zero only in the second case. Text that ends inside a verse, without a
 new-line, is unexpected input.
 @throws[EILSEQ] Unexpected input, or a chapter or verse number that
 `parse_natural` rejected. */
static int lex_next_verse(struct lex *const lex) {
	/* `s0`–`s1` and `t0`–`t1` are the tags around the chapter and the verse
	 numbers. NOTE(review): `YYMARKER`, `yyt1` and `yyt2` are presumably
	 scratch variables for the generated scanner — confirm with re2c output. */
	const char *YYMARKER, *yyt1 = 0, *yyt2 = 0, *s0, *s1, *t0, *t1;
	/* Every call begins between verses, in the `line` condition.
	 NOTE(review): `enum YYCONDTYPE` and `yycline` are presumably emitted by
	 the re2c conditions directive earlier in the file — confirm. */
	enum YYCONDTYPE condition = yycline;
	/*!re2c /**/
	re2c:define:YYCURSOR = lex->cursor;
	re2c:define:YYGETCONDITION = "condition";
	re2c:define:YYSETCONDITION = "condition = @@;";
	re2c:define:YYGETCONDITION:naked = 1;
	re2c:define:YYSETCONDITION:naked = 1; */
	assert(lex && lex->cursor);
	lex->error = 0;
	/* Each rule below either returns or jumps back here for the next token. */
scan:
	/*!re2c /**/
	<*> * { return errno = EILSEQ, lex->error = 1, 0; }
	<line> [^[\]\n\x00]* "\n" { lex->line++; goto scan; }
	<line> "\x00" { return 0; }
	<line> "[" @s0 natural @s1 ":" @t0 natural @t1 "]" => verse {
		if(!parse_natural(s0, s1, &lex->chapter)
			|| !parse_natural(t0, t1, &lex->verse))
			return errno = EILSEQ, lex->error = 1, 0;
		lex->words = 0;
		/*printf("%u:%u", lex->chapter, lex->verse);*/
		goto scan;
	}
	<verse> whitespace+ { goto scan; }
	<verse> @s0 word @s1 { lex->words++; goto scan; }
	<verse> "\n" { /*printf(" -> %u\n", lex->words);*/ lex->line++; return 1; }
	*/
}


/* Reversible hash map to store data on bible. */

#include <stdint.h>

/** Mixes the bits of `x` in three xor-shift steps with two multiplications
 between them. From <https://nullprogram.com/blog/2018/07/31/> and
 <https://github.com/skeeto/hash-prospector>.
 @return The mixed value. */
static uint32_t lowbias32(uint32_t x) {
	x = (x ^ (x >> 16)) * 0x7feb352dU;
	x = (x ^ (x >> 15)) * 0x846ca68bU;
	return x ^ (x >> 16);
}
/* Inverts `x`: undoes the steps of `lowbias32` in reverse order, so that the
 value originally hashed is recovered. */
static uint32_t lowbias32_r(uint32_t x) {
	x = (x ^ (x >> 16)) * 0x43021123U;
	x = (x ^ (x >> 15) ^ (x >> 30)) * 0x1d69e2a5U;
	return x ^ (x >> 16);
}

/** A citation, overlaid with a `uint32_t` so that the whole of it can be
 hashed, and recovered from a hash, as one integer. */
union kjvcite {
	uint32_t u32;
	struct {
		unsigned verse : 8;
		unsigned chapter : 8;
		unsigned book : 7;
	};
};

/** @return The hash of the citation `x`: its 32-bit image passed through
 `lowbias32`. */
static uint32_t kjv_hash(const union kjvcite x) {
	const uint32_t image = x.u32;
	return lowbias32(image);
}
static union kjvcite kjv_unhash(const uint32_t x) {
	union kjvcite k;
	k.u32 = lowbias32_r(x);
	return k;
}
static void kjv_to_string(const union kjvcite x, char (*const a)[12])
	{ sprintf(*a, "%.4s%u:%u", kjv_book_string[x.book],
	(x.chapter + 1) % 1000, (x.verse + 1) % 1000); }

/* Hash, inverse hash, and key printer under the `words_` names; each one
 simply forwards to its `kjv_` counterpart. */
static uint32_t words_hash(const union kjvcite x) {
	return kjv_hash(x);
}
static union kjvcite words_unhash(const uint32_t x) {
	return kjv_unhash(x);
}
static void words_to_string(const union kjvcite x, char (*const a)[12]) {
	kjv_to_string(x, a);
}

/* Parameters for table.h: a table named `words`, keyed by citation, holding
 an `unsigned` for each. `main` uses it as `struct words_table`, through the
 `words_table_*` functions, to store the number of words in each verse.
 NOTE(review): `TABLE_UINT` is presumably the type of the hash;
 `TABLE_DEFAULT` presumably what `words_table_get` yields for a missing key;
 `TABLE_INVERSE` presumably has the table recover keys with `words_unhash`;
 `TABLE_TO_STRING` presumably enables `words_table_to_string` by way of
 `words_to_string` — confirm against table.h. */
#define TABLE_NAME words
#define TABLE_KEY union kjvcite
#define TABLE_UINT uint32_t
#define TABLE_VALUE unsigned
#define TABLE_DEFAULT 0
#define TABLE_INVERSE
#define TABLE_TO_STRING
#include "../src/table.h"

/* Hash, inverse hash, and key printer under the `kjvset_` names; each one
 simply forwards to its `kjv_` counterpart. */
static uint32_t kjvset_hash(const union kjvcite x) {
	return kjv_hash(x);
}
static union kjvcite kjvset_unhash(const uint32_t x) {
	return kjv_unhash(x);
}
static void kjvset_to_string(const union kjvcite x, char (*const a)[12]) {
	kjv_to_string(x, a);
}

/* Parameters for table.h: a table named `kjvset`, keyed by citation. Unlike
 `words`, no `TABLE_VALUE` is given.
 NOTE(review): this presumably makes it a set of citations; nothing in the
 visible part of the file uses it — confirm whether it is still needed. */
#define TABLE_NAME kjvset
#define TABLE_KEY union kjvcite
#define TABLE_UINT uint32_t
#define TABLE_INVERSE
#define TABLE_TO_STRING
#include "../src/table.h"

/** Loads every book from the text files in the directory "KJV", counts the
 words of each verse into a table keyed by citation, and prints the table and
 the totals.
 @return `EXIT_SUCCESS`; or `EXIT_FAILURE` after reporting the problem on
 `stderr`, if the directory can't be read, a book is out of range, duplicated
 or missing, or a book's text can't be parsed. */
int main(void) {
	const char *const dir_name = "KJV";
	struct char_array kjv[KJV_BOOK_SIZE] = { 0 };
	struct words_table words = { 0 };
	int success = EXIT_SUCCESS;
	DIR *dir = 0;
	struct dirent *de = 0;
	unsigned i;
	size_t cum_words = 0;
	errno = 0;

	/* Read in the kjv from all files. This is overkill, we don't need to keep
	 all the data, just count. Maybe we'll do something else later? */
	if(chdir(dir_name) == -1 || !(dir = opendir("."))) goto catch;
	for( ; ; ) { /* For all files in directory. */
		unsigned ordinal;
		enum kjv_book b;
		/* `readdir` returns null at the end of the directory and on error
		 alike; only `errno` tells them apart, so it is cleared each time. */
		errno = 0;
		if(!(de = readdir(dir))) { if(errno) goto catch; break; }
		if(!kjv_filename(de->d_name, &ordinal)) /* Extract ordinal. */
			continue; /* Not the file of a book; ignored. */
		if(ordinal < 1 || ordinal > KJV_BOOK_SIZE)
			{ errno = ERANGE; goto catch; } /* Not in range. */
		if(kjv[b = ordinal - 1].data) /* Convert to zero-based. */
			{ errno = EDOM; goto catch; } /* Duplicate. */
		if(!append_file(kjv + b, de->d_name)) goto catch;
	}
	closedir(dir), de = 0, dir = 0;

	/* Parse number of words. */
	for(i = 0; i < KJV_BOOK_SIZE; i++) {
		struct lex x;
		/* This must come before `lex`, which asserts that its argument is
		 not null and so would abort before the problem could be reported. */
		if(!kjv[i].data) { fprintf(stderr, "Missing book [%u]%s.\n",
			i + 1, kjv_book_string[i]); errno = EDOM; goto catch; }
		x = lex(kjv[i].data);
		printf("[%u]%s: cumulative %zu.\n",
			i + 1, kjv_book_string[i], cum_words);
		while(lex_next_verse(&x)) {
			union kjvcite c;
			unsigned *w;
			/* The key is hashed as a whole `u32`; clear it first so that the
			 bits outside of the bit-fields are the same for every key. */
			c.u32 = 0;
			c.verse = x.verse, c.chapter = x.chapter, c.book = i;
			switch(words_table_assign(&words, c, &w)) {
			case TABLE_PRESENT: fprintf(stderr, "[%u]%s %u:%u duplicated.\n",
				i + 1, kjv_book_string[i], x.chapter, x.verse); errno = EDOM;
				/* fallthrough */
			case TABLE_ERROR: goto catch;
			case TABLE_ABSENT: *w = x.words; break;
			}
			cum_words += x.words;
		}
		/* The scanner stops at the end of the text and on bad input alike. */
		if(x.error) { fprintf(stderr, "[%u]%s on line %zu\n",
			i + 1, kjv_book_string[i], x.line); goto catch; }
	}
	printf("words: %s\n", words_table_to_string(&words));

	printf("kjv: %zu words\n", cum_words);
	{
		union kjvcite c;
		struct words_table_iterator it = words_table_begin(&words);
		unsigned *w;
		while(words_table_next(&it, &c, &w))
			printf("%s %u:%u -> %u\n",
			kjv_book_string[c.book], c.chapter, c.verse, *w);
		/* Look up Genesis 1:1; as above, clear the whole key first. */
		c.u32 = 0;
		c.book = Genesis, c.chapter = 1, c.verse = 1;
		printf("1:1:1 -> %u\n", words_table_get(&words, c));
	}
	goto finally;
catch:
	success = EXIT_FAILURE;
	/* `de` is only non-null while a directory entry is being processed. */
	perror(de ? de->d_name : dir_name);
	if(dir && closedir(dir)) perror(dir_name);
finally:
	/* NOTE(review): `words` is never released. If table.h has a destructor
	 (presumably `words_table_`, by analogy with `char_array_`,) it belongs
	 here — confirm against table.h. */
	for(i = 0; i < KJV_BOOK_SIZE; i++) char_array_(kjv + i);
	return success;
}