1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-12-04 14:46:47 -05:00

[python] If the page encoding is not UTF-8, convert it twice around pre_format_html_hook.

The page is converted once to UTF-8 before pre_format_html_hook is called, and a second time
back to the original encoding after the Python script has run. I know this is inefficient, but computers are quite fast nowadays.
This commit is contained in:
Witold Filipczyk 2020-08-09 17:45:22 +02:00
parent bec41b6e2f
commit 8b8f57ed75

View File

@ -7,6 +7,7 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <iconv.h>
#include <stdarg.h>
#include <string.h>
@ -14,6 +15,7 @@
#include "cache/cache.h"
#include "main/event.h"
#include "protocol/header.h"
#include "protocol/uri.h"
#include "scripting/python/core.h"
#include "session/session.h"
@ -79,6 +81,60 @@ script_hook_url(va_list ap, void *data)
return EVENT_HOOK_STATUS_NEXT;
}
/* Work out which codepage a cached document's header block declares.
 *
 * @head is the header text of the cache entry; it may be NULL.
 *
 * Lookup order:
 *   1. the "charset" parameter of each "Content-Type" header, in order,
 *      until one of them resolves to a codepage;
 *   2. a "Content-Charset" header;
 *   3. a "Charset" header;
 *   4. whatever get_cp_index("System") yields.
 *
 * Returns the value produced by get_cp_index() for the first source that
 * does not give -1 (-1 appears to be its "unknown charset" result), or the
 * "System" lookup result when nothing else matched or @head is NULL. */
static int
get_codepage(unsigned char *head)
{
	int codepage = -1;

	if (head) {
		unsigned char *cursor = head;
		unsigned char *value;

		/* scan_http_equiv() appends the meta http-equiv directives
		 * to the protocol header before we get here.  The real HTTP
		 * Content-Type header takes precedence over them, so the
		 * headers are walked in order and the meta one only wins
		 * when no earlier header names a usable charset.
		 * See bug 983. */
		do {
			unsigned char *charset;
			unsigned char *content_type;

			content_type = parse_header(cursor, "Content-Type", &cursor);
			if (!content_type) {
				break;
			}

			/* NOTE(review): this relies on parse_header_param()
			 * always storing into charset (presumably NULL when
			 * the parameter is missing) - confirm. */
			parse_header_param(content_type, "charset", &charset, 0);
			if (charset) {
				codepage = get_cp_index(charset);
				mem_free(charset);
			}
			mem_free(content_type);
		} while (codepage == -1);

		/* First fallback: a dedicated Content-Charset header. */
		if (codepage == -1) {
			value = parse_header(head, "Content-Charset", NULL);
			if (value) {
				codepage = get_cp_index(value);
				mem_free(value);
			}
		}

		/* Second fallback: a plain Charset header. */
		if (codepage == -1) {
			value = parse_header(head, "Charset", NULL);
			if (value) {
				codepage = get_cp_index(value);
				mem_free(value);
			}
		}
	}

	/* Nothing usable was declared: use the "System" codepage. */
	return (codepage != -1) ? codepage : get_cp_index("System");
}
/* Call a Python hook for a pre-format-html event. */
static enum evhook_status
@ -88,9 +144,10 @@ script_hook_pre_format_html(va_list ap, void *data)
struct cache_entry *cached = va_arg(ap, struct cache_entry *);
struct fragment *fragment = get_cache_fragment(cached);
unsigned char *url = struri(cached->uri);
int codepage = get_codepage(cached->head);
char *method = "pre_format_html_hook";
struct session *saved_python_ses = python_ses;
PyObject *result;
PyObject *result = NULL;
int success = 0;
evhook_use_params(ses && cached);
@ -101,8 +158,39 @@ script_hook_pre_format_html(va_list ap, void *data)
python_ses = ses;
result = PyObject_CallMethod(python_hooks, method, "ss#", url,
fragment->data, fragment->length);
if (!is_cp_utf8(codepage)) {
size_t iconv_res;
size_t ileft;
size_t oleft;
char *inbuf, *outbuf;
char *utf8_data = mem_alloc(fragment->length * 8);
iconv_t cd;
if (!utf8_data) {
goto error;
}
cd = iconv_open("utf-8", get_cp_mime_name(codepage));
if (cd == (iconv_t)-1) {
mem_free(utf8_data);
goto error;
}
inbuf = fragment->data;
outbuf = utf8_data;
ileft = fragment->length;
oleft = fragment->length * 8;
iconv_res = iconv(cd, &inbuf, &ileft, &outbuf, &oleft);
if (iconv_res == -1) {
mem_free(utf8_data);
goto error;
}
iconv_close(cd);
result = PyObject_CallMethod(python_hooks, method, "ss#", url, utf8_data, fragment->length * 8 - oleft);
mem_free(utf8_data);
} else {
result = PyObject_CallMethod(python_hooks, method, "ss#", url, fragment->data, fragment->length);
}
if (!result) goto error;
if (result != Py_None) {
@ -114,12 +202,43 @@ script_hook_pre_format_html(va_list ap, void *data)
goto error;
}
/* This assumes the Py_ssize_t len is not too large to
* fit in the off_t parameter of normalize_cache_entry().
* add_fragment() itself seems to assume the same thing,
* and there is no standard OFF_MAX macro against which
* ELinks could check the value. */
(void) add_fragment(cached, 0, str, len);
if (!is_cp_utf8(codepage)) {
size_t iconv_res;
size_t ileft;
size_t oleft;
char *inbuf, *outbuf;
char *dec_data = mem_alloc(len * 4);
iconv_t cd;
if (!dec_data) {
goto error;
}
cd = iconv_open(get_cp_mime_name(codepage), "utf-8");
if (cd == (iconv_t)-1) {
mem_free(dec_data);
goto error;
}
inbuf = str;
outbuf = dec_data;
ileft = len;
oleft = len * 4;
iconv_res = iconv(cd, &inbuf, &ileft, &outbuf, &oleft);
if (iconv_res == -1) {
mem_free(dec_data);
goto error;
}
iconv_close(cd);
(void) add_fragment(cached, 0, dec_data, len * 4 - oleft);
mem_free(dec_data);
} else {
/* This assumes the Py_ssize_t len is not too large to
* fit in the off_t parameter of normalize_cache_entry().
* add_fragment() itself seems to assume the same thing,
* and there is no standard OFF_MAX macro against which
* ELinks could check the value. */
(void) add_fragment(cached, 0, str, len);
}
normalize_cache_entry(cached, len);
}