2022-12-12 02:48:06 -05:00
|
|
|
/** @license 2022 Neil Edelman, distributed under the terms of the
|
|
|
|
[MIT License](https://opensource.org/licenses/MIT).
|
|
|
|
Intended for use with the plain-text files from
|
2022-12-14 00:05:29 -05:00
|
|
|
<https://github.com/scrollmapper/bible_databases/tree/master/txt/KJV/>.
|
|
|
|
@std C11, POSIX */
|
2022-12-12 02:48:06 -05:00
|
|
|
|
2022-12-13 03:31:56 -05:00
|
|
|
#include <stdlib.h>
#include <stdio.h>
#include <string.h> /* strchr */
#include <limits.h> /* UINT_MAX */
#include <assert.h>
#include <errno.h>
#include <dirent.h> /* opendir readdir closedir */
#include <unistd.h> /* chdir (POSIX) (because I'm lazy) */
|
|
|
|
|
|
|
|
|
2022-12-13 17:37:50 -05:00
|
|
|
/* Dynamic contiguous string that is used to load files. */
|
2022-12-13 03:31:56 -05:00
|
|
|
|
|
|
|
#define ARRAY_NAME char
|
|
|
|
#define ARRAY_TYPE char
|
|
|
|
#include "../src/array.h"
|
|
|
|
|
|
|
|
/** Append a text file, `fn`, to `c`, and add a '\0'.
|
|
|
|
@return Success. A partial read is failure. @throws[fopen, fread, malloc]
|
|
|
|
@throws[EISEQ] The text file has embedded nulls.
|
|
|
|
@throws[ERANGE] If the standard library does not follow POSIX. */
|
|
|
|
static int append_file(struct char_array *c, const char *const fn) {
|
|
|
|
FILE *fp = 0;
|
|
|
|
const size_t granularity = 1024;
|
|
|
|
size_t nread;
|
|
|
|
char *cursor;
|
|
|
|
int success = 0;
|
|
|
|
assert(c && fn);
|
|
|
|
if(!(fp = fopen(fn, "r"))) goto catch;
|
|
|
|
/* Read entire file in chunks. */
|
|
|
|
do if(!(cursor = char_array_buffer(c, granularity))
|
|
|
|
|| (nread = fread(cursor, 1, granularity, fp), ferror(fp))
|
|
|
|
|| !char_array_append(c, nread)) goto catch;
|
|
|
|
while(nread == granularity);
|
|
|
|
/* File to `C` string. */
|
|
|
|
if(!(cursor = char_array_new(c))) goto catch;
|
|
|
|
*cursor = '\0';
|
|
|
|
/* Binary files with embedded '\0' are not allowed. */
|
|
|
|
if(strchr(c->data, '\0') != cursor) { errno = EILSEQ; goto catch; }
|
|
|
|
{ success = 1; goto finally; }
|
|
|
|
catch:
|
|
|
|
if(!errno) errno = EILSEQ; /* Will never be true on POSIX. */
|
|
|
|
finally:
|
|
|
|
if(fp) fclose(fp);
|
|
|
|
return success;
|
|
|
|
}
|
2022-12-12 02:48:06 -05:00
|
|
|
|
2022-12-13 17:37:50 -05:00
|
|
|
|
|
|
|
/** Helper to parse unsigned decimal; [`s`,`e`) => `n`.
 @param[s, e] Half-open range of decimal digits. An empty range yields zero.
 @param[n] Receives the value on success; untouched on failure.
 @return Success.
 @throws[EILSEQ] A character in the range is not a decimal digit.
 @throws[ERANGE] The value does not fit in `unsigned`. */
static int parse_natural(const char *s, const char *const e, unsigned *const n) {
	unsigned accum = 0;
	for( ; s < e; s++) {
		const unsigned digit = (unsigned)(*s - '0');
		if(digit > 9) return errno = EILSEQ, 0;
		/* Test before multiplying: unsigned arithmetic wraps silently, and a
		 wrapped product is not necessarily smaller than the old value. */
		if(accum > (UINT_MAX - digit) / 10) return errno = ERANGE, 0;
		accum = accum * 10 + digit;
	}
	*n = accum;
	return 1;
}
|
|
|
|
|
2022-12-13 03:31:56 -05:00
|
|
|
|
2022-12-13 17:37:50 -05:00
|
|
|
/* Enumerate books. */
|
2022-12-13 03:31:56 -05:00
|
|
|
|
|
|
|
/* X-macro list of the books in file order: 39 of the Old Testament, then 27
 of the New Testament. `KJV_BOOK_SIZE` comes last so that, as an enumerator,
 it equals the number of books. Each expansion of `X` supplies its own
 trailing comma. */
#define BOOKS \
	X(Genesis) X(Exodus) X(Leviticus) X(Numbers) X(Deuteronomy) \
	X(Joshua) X(Judges) X(Ruth) \
	X(ISamuel) X(IISamuel) X(IKings) X(IIKings) \
	X(IChronicles) X(IIChronicles) \
	X(Ezra) X(Nehemiah) X(Esther) \
	X(Job) X(Psalms) X(Proverbs) X(Ecclesiastes) X(Song_of_Solomon) \
	X(Isaiah) X(Jeremiah) X(Lamentations) X(Ezekiel) X(Daniel) \
	X(Hosea) X(Joel) X(Amos) X(Obadiah) X(Jonah) X(Micah) \
	X(Nahum) X(Habakkuk) X(Zephaniah) X(Haggai) X(Zechariah) X(Malachi) \
	\
	X(Matthew) X(Mark) X(Luke) X(John) X(Acts) \
	X(Romans) X(ICorinthians) X(IICorinthians) \
	X(Galatians) X(Ephesians) X(Philippians) X(Colossians) \
	X(IThessalonians) X(IIThessalonians) \
	X(ITimothy) X(IITimothy) X(Titus) X(Philemon) \
	X(Hebrews) X(James) X(IPeter) X(IIPeter) \
	X(IJohn) X(IIJohn) X(IIIJohn) X(Jude) X(Revelation) \
	X(KJV_BOOK_SIZE)

/* Zero-based book identifiers. */
#define X(book) book,
enum kjv_book { BOOKS };
#undef X

/* Spelling of each identifier, indexed by `enum kjv_book`. The entry at
 `KJV_BOOK_SIZE` is the sentinel's own name. */
#define X(book) #book,
static const char *kjv_book_string[] = { BOOKS };
#undef X

#undef BOOKS
|
|
|
|
|
2022-12-13 17:37:50 -05:00
|
|
|
|
|
|
|
/* Parse filename of books. This works with
|
|
|
|
<https://github.com/scrollmapper/bible_databases/tree/master/txt/KJV> */
|
|
|
|
|
2022-12-12 02:48:06 -05:00
|
|
|
/* Settings and named patterns shared by the re2c lexers in this file.
 `yyfill:enable = 0` turns off buffer refilling, so each lexer reads one
 complete in-memory string; the rules below match "\x00" explicitly to find
 its end. `YYCTYPE` makes the input unit `char`. `natural` is a decimal number
 with no leading zero; `whitespace` is a space, tab, vertical tab or form feed
 (not a newline); `word` is a run of characters that are none of those, nor a
 newline, nor the null terminator. */
/*!re2c /**/
|
|
|
|
re2c:yyfill:enable = 0;
|
|
|
|
re2c:define:YYCTYPE = char;
|
|
|
|
natural = [1-9][0-9]*;
|
2022-12-13 00:25:28 -05:00
|
|
|
whitespace = [ \t\v\f];
|
|
|
|
word = [^ \t\v\f\n\x00]+;
|
2022-12-12 02:48:06 -05:00
|
|
|
*/
|
|
|
|
|
2022-12-13 00:25:28 -05:00
|
|
|
/** `fn` contains "<number>[*].txt", sticks that in `book_no`, otherwise
|
|
|
|
returns false. */
|
2022-12-13 03:31:56 -05:00
|
|
|
static int kjv_filename(const char *fn, unsigned *const book_no) {
|
2022-12-12 02:48:06 -05:00
|
|
|
const char *YYCURSOR = fn, *YYMARKER, *yyt1, *yyt2, *s0, *s1;
|
|
|
|
assert(fn && book_no);
|
|
|
|
/*!re2c /**/
|
|
|
|
*
|
|
|
|
{ return 0; }
|
2022-12-13 00:25:28 -05:00
|
|
|
@s0 natural @s1 [^.\x00]* ".txt" "\x00"
|
2022-12-12 02:48:06 -05:00
|
|
|
{ return parse_natural(s0, s1, book_no); }
|
|
|
|
*/
|
|
|
|
}
|
2022-12-13 00:25:28 -05:00
|
|
|
|
2022-12-13 17:37:50 -05:00
|
|
|
|
|
|
|
/* Parse book contents. */
|
|
|
|
|
2022-12-13 03:31:56 -05:00
|
|
|
/** State of the verse lexer over one book held in memory. */
struct lex {
	size_t line; /* Current line; the constructor starts it at 1. */
	const char *cursor; /* Read position in the null-terminated text. */
	int error; /* Non-zero when the last scan stopped on a syntax error. */
	unsigned chapter, verse, words; /* Last citation read and its word count. */
};

/** Starts a lexer at the beginning of `cursor`.
 @param[cursor] Null-terminated text of a book. Null is accepted and is
 stored as-is, so a caller can test `cursor` of the result to detect a book
 that was never loaded; such a lexer must not be passed to `lex_next_verse`.
 @return A lexer on line 1 with no error and zeroed chapter, verse and word
 count. */
static struct lex lex(const char *cursor) {
	struct lex state;
	state.line = 1;
	state.cursor = cursor;
	state.error = 0;
	state.chapter = 0;
	state.verse = 0;
	state.words = 0;
	return state;
}
|
2022-12-13 00:25:28 -05:00
|
|
|
|
|
|
|
/*!conditions:re2c*/
|
|
|
|
|
2022-12-13 03:31:56 -05:00
|
|
|
static int lex_next_verse(struct lex *const lex) {
|
|
|
|
const char *YYMARKER, *yyt1 = 0, *yyt2 = 0, *s0, *s1, *t0, *t1;
|
|
|
|
enum YYCONDTYPE condition = yycline;
|
2022-12-13 00:25:28 -05:00
|
|
|
/*!re2c /**/
|
2022-12-13 03:31:56 -05:00
|
|
|
re2c:define:YYCURSOR = lex->cursor;
|
|
|
|
re2c:define:YYGETCONDITION = "condition";
|
|
|
|
re2c:define:YYSETCONDITION = "condition = @@;";
|
|
|
|
re2c:define:YYGETCONDITION:naked = 1;
|
|
|
|
re2c:define:YYSETCONDITION:naked = 1; */
|
|
|
|
assert(lex && lex->cursor);
|
|
|
|
lex->error = 0;
|
2022-12-13 00:25:28 -05:00
|
|
|
scan:
|
|
|
|
/*!re2c /**/
|
2022-12-13 17:37:50 -05:00
|
|
|
<*> * { return errno = EILSEQ, lex->error = 1, 0; }
|
|
|
|
<line> [^[\]\n\x00]* "\n" { lex->line++; goto scan; }
|
|
|
|
<line> "\x00" { return 0; }
|
2022-12-13 03:31:56 -05:00
|
|
|
<line> "[" @s0 natural @s1 ":" @t0 natural @t1 "]" => verse {
|
|
|
|
if(!parse_natural(s0, s1, &lex->chapter)
|
|
|
|
|| !parse_natural(t0, t1, &lex->verse))
|
|
|
|
return errno = EILSEQ, lex->error = 1, 0;
|
|
|
|
lex->words = 0;
|
2022-12-13 17:37:50 -05:00
|
|
|
/*printf("%u:%u", lex->chapter, lex->verse);*/
|
2022-12-13 03:31:56 -05:00
|
|
|
goto scan;
|
2022-12-13 00:25:28 -05:00
|
|
|
}
|
2022-12-13 03:31:56 -05:00
|
|
|
<verse> whitespace+ { goto scan; }
|
|
|
|
<verse> @s0 word @s1 { lex->words++; goto scan; }
|
2022-12-13 17:37:50 -05:00
|
|
|
<verse> "\n" { /*printf(" -> %u\n", lex->words);*/ lex->line++; return 1; }
|
2022-12-13 00:25:28 -05:00
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
2022-12-13 17:37:50 -05:00
|
|
|
|
|
|
|
/* Reversible hash map to store data on bible. */
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
2022-12-14 00:05:29 -05:00
|
|
|
/** <https://nullprogram.com/blog/2018/07/31/>
|
|
|
|
<https://github.com/skeeto/hash-prospector> on `x`. */
|
|
|
|
static uint32_t lowbias32(uint32_t x) {
	/* Two rounds of xor-shift then multiply, and a final xor-shift. */
	x = (x ^ (x >> 16)) * 0x7feb352dU;
	x = (x ^ (x >> 15)) * 0x846ca68bU;
	return x ^ (x >> 16);
}

/* Inverts `x`: undoes the steps of `lowbias32` in reverse order. */
static uint32_t lowbias32_r(uint32_t x) {
	x = (x ^ (x >> 16)) * 0x43021123U;
	/* A right-shift xor by 15 needs two terms to undo in 32 bits. */
	x = (x ^ (x >> 15) ^ (x >> 30)) * 0x1d69e2a5U;
	return x ^ (x >> 16);
}
|
|
|
|
|
|
|
|
/* A citation (book, chapter, verse) that can also be read as one integer,
 which is the form the hash functions take. */
union kjvcite {
	uint32_t u32; /* The whole citation as an integer. */
	struct {
		unsigned verse : 8;
		unsigned chapter : 8;
		unsigned book : 7;
	};
};
|
|
|
|
|
|
|
|
/* Hashes citation `x` by mixing its integer form with `lowbias32`. */
static uint32_t kjv_hash(const union kjvcite x)
{
	const uint32_t packed = x.u32;
	return lowbias32(packed);
}
|
|
|
|
static union kjvcite kjv_unhash(const uint32_t x) {
|
|
|
|
union kjvcite k;
|
|
|
|
k.u32 = lowbias32_r(x);
|
|
|
|
return k;
|
|
|
|
}
|
|
|
|
static void kjv_to_string(const union kjvcite x, char (*const a)[12])
|
|
|
|
{ sprintf(*a, "%.4s%u:%u", kjv_book_string[x.book],
|
|
|
|
(x.chapter + 1) % 1000, (x.verse + 1) % 1000); }
|
|
|
|
|
|
|
|
/* Hash of key `x` for the `words` table; forwards to `kjv_hash`. */
static uint32_t words_hash(const union kjvcite x)
{
	return kjv_hash(x);
}
|
|
|
|
/* Key of the `words` table whose hash is `x`; forwards to `kjv_unhash`. */
static union kjvcite words_unhash(const uint32_t x)
{
	return kjv_unhash(x);
}
|
|
|
|
/* Text of key `x` of the `words` table in `a`; forwards to `kjv_to_string`. */
static void words_to_string(const union kjvcite x, char (*const a)[12]) {
	kjv_to_string(x, a);
}
|
|
|
|
|
|
|
|
#define TABLE_NAME words
|
|
|
|
#define TABLE_KEY union kjvcite
|
|
|
|
#define TABLE_UINT uint32_t
|
|
|
|
#define TABLE_VALUE unsigned
|
|
|
|
#define TABLE_INVERSE
|
|
|
|
#define TABLE_TO_STRING
|
|
|
|
#include "../src/table.h"
|
|
|
|
|
|
|
|
/* Hash of key `x` for the `kjvset` table; forwards to `kjv_hash`. */
static uint32_t kjvset_hash(const union kjvcite x)
{
	return kjv_hash(x);
}
|
|
|
|
/* Key of the `kjvset` table whose hash is `x`; forwards to `kjv_unhash`. */
static union kjvcite kjvset_unhash(const uint32_t x)
{
	return kjv_unhash(x);
}
|
|
|
|
/* Text of key `x` of the `kjvset` table in `a`; forwards to `kjv_to_string`. */
static void kjvset_to_string(const union kjvcite x, char (*const a)[12]) {
	kjv_to_string(x, a);
}
|
|
|
|
|
|
|
|
#define TABLE_NAME kjvset
|
|
|
|
#define TABLE_KEY union kjvcite
|
|
|
|
#define TABLE_UINT uint32_t
|
|
|
|
#define TABLE_INVERSE
|
|
|
|
#define TABLE_TO_STRING
|
|
|
|
#include "../src/table.h"
|
|
|
|
|
2022-12-13 03:31:56 -05:00
|
|
|
int main(void) {
|
|
|
|
const char *const dir_name = "KJV";
|
|
|
|
struct char_array kjv[KJV_BOOK_SIZE] = { 0 };
|
2022-12-14 00:05:29 -05:00
|
|
|
struct words_table words = { 0 };
|
2022-12-13 03:31:56 -05:00
|
|
|
int success = EXIT_SUCCESS;
|
|
|
|
DIR *dir = 0;
|
|
|
|
struct dirent *de = 0;
|
|
|
|
unsigned i;
|
2022-12-14 00:05:29 -05:00
|
|
|
size_t cum_words = 0;
|
2022-12-13 03:31:56 -05:00
|
|
|
errno = 0;
|
|
|
|
|
|
|
|
/* Read in the kjv from all files. This is overkill, we don't need to keep
|
|
|
|
all the data, just count. Maybe we'll do something else later? */
|
|
|
|
if(chdir(dir_name) == -1 || !(dir = opendir("."))) goto catch;
|
|
|
|
while((de = readdir(dir))) { /* For all files in directory. */
|
|
|
|
unsigned ordinal;
|
|
|
|
enum kjv_book b;
|
|
|
|
if(!kjv_filename(de->d_name, &ordinal)) /* Extract ordinal. */
|
2022-12-13 17:37:50 -05:00
|
|
|
{ /*fprintf(stderr, "Ignored <%s>.\n", de->d_name);*/ continue; }
|
|
|
|
/*fprintf(stderr, "<%s> ordinal: %u\n", de->d_name, ordinal);*/
|
2022-12-13 03:31:56 -05:00
|
|
|
if(ordinal < 1 || ordinal > KJV_BOOK_SIZE)
|
|
|
|
{ errno = ERANGE; goto catch; } /* Not in range. */
|
|
|
|
if(kjv[b = ordinal - 1].data) /* Convert to zero-based. */
|
|
|
|
{ errno = EDOM; goto catch; } /* Duplicate. */
|
|
|
|
if(!append_file(kjv + b, de->d_name)) goto catch;
|
|
|
|
}
|
|
|
|
closedir(dir), de = 0, dir = 0;
|
|
|
|
|
2022-12-14 00:05:29 -05:00
|
|
|
/* Parse number of words. */
|
2022-12-13 03:31:56 -05:00
|
|
|
for(i = 0; i < KJV_BOOK_SIZE; i++) {
|
|
|
|
struct lex x = lex(kjv[i].data);
|
2022-12-13 17:37:50 -05:00
|
|
|
if(!x.cursor) { fprintf(stderr, "Missing book [%u]%s.\n",
|
|
|
|
i + 1, kjv_book_string[i]); errno = EDOM; goto catch; }
|
2022-12-14 00:05:29 -05:00
|
|
|
printf("[%u]%s: cumulative %zu.\n",
|
|
|
|
i + 1, kjv_book_string[i], cum_words);
|
|
|
|
while(lex_next_verse(&x)) {
|
|
|
|
const union kjvcite c
|
|
|
|
= { .verse = x.verse, .chapter = x.chapter, .book = i };
|
|
|
|
unsigned *w;
|
|
|
|
switch(words_table_assign(&words, c, &w)) {
|
|
|
|
case TABLE_PRESENT: fprintf(stderr, "[%u]%s %u:%u duplicated.\n",
|
|
|
|
i + 1, kjv_book_string[i], x.chapter, x.verse); errno = EDOM;
|
|
|
|
case TABLE_ERROR: goto catch;
|
|
|
|
case TABLE_ABSENT: *w = x.words; break;
|
|
|
|
}
|
|
|
|
cum_words += x.words;
|
|
|
|
}
|
2022-12-13 17:37:50 -05:00
|
|
|
if(x.error) { fprintf(stderr, "[%u]%s on line %zu\n",
|
|
|
|
i + 1, kjv_book_string[i], x.line); goto catch; }
|
2022-12-13 03:31:56 -05:00
|
|
|
}
|
2022-12-14 00:05:29 -05:00
|
|
|
printf("words: %s\n", words_table_to_string(&words));
|
2022-12-13 03:31:56 -05:00
|
|
|
|
2022-12-14 00:05:29 -05:00
|
|
|
printf("kjv: %zu words\n", cum_words);
|
2022-12-13 03:31:56 -05:00
|
|
|
goto finally;
|
|
|
|
catch:
|
|
|
|
success = EXIT_FAILURE;
|
|
|
|
perror(de ? de->d_name : dir_name);
|
|
|
|
if(dir && closedir(dir)) perror(dir_name);
|
|
|
|
finally:
|
|
|
|
for(i = 0; i < KJV_BOOK_SIZE; i++) char_array_(kjv + i);
|
|
|
|
return success;
|
|
|
|
}
|