mirror of
https://github.com/rkd77/elinks.git
synced 2025-02-02 15:09:23 -05:00
bug 153, 1066: Convert XBEL bookmarks to/from UTF-8.
When the file is being read, Expat provides the strings to ELinks in UTF-8, so ELinks can put them in struct bookmark without conversions. Make sure gettext returns any placeholder strings in UTF-8, too. Replace '\r' with ' ' in bookmark titles and URLs. When the file is being written, put encoding="UTF-8" in the XML declaration, and then write out the strings from struct bookmark without character set conversions. Do replace some characters with entity references though, by calling add_html_to_string().
This commit is contained in:
parent
8c0ae2a215
commit
73f925ce21
@ -11,6 +11,7 @@
|
|||||||
#endif /* HAVE_CONFIG_H */
|
#endif /* HAVE_CONFIG_H */
|
||||||
|
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
#include <errno.h>
|
||||||
#include <expat.h>
|
#include <expat.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -54,9 +55,14 @@ static unsigned char *get_attribute_value(struct tree_node *node,
|
|||||||
unsigned char *name);
|
unsigned char *name);
|
||||||
|
|
||||||
|
|
||||||
|
struct read_bookmarks_xbel {
|
||||||
|
int utf8_cp;
|
||||||
|
};
|
||||||
|
|
||||||
static void read_bookmarks_xbel(FILE *f);
|
static void read_bookmarks_xbel(FILE *f);
|
||||||
static unsigned char * filename_bookmarks_xbel(int writing);
|
static unsigned char * filename_bookmarks_xbel(int writing);
|
||||||
static int xbeltree_to_bookmarks_list(struct tree_node *root,
|
static int xbeltree_to_bookmarks_list(const struct read_bookmarks_xbel *preload,
|
||||||
|
struct tree_node *root,
|
||||||
struct bookmark *current_parent);
|
struct bookmark *current_parent);
|
||||||
static void write_bookmarks_list(struct secure_save_info *ssi,
|
static void write_bookmarks_list(struct secure_save_info *ssi,
|
||||||
LIST_OF(struct bookmark) *bookmarks_list,
|
LIST_OF(struct bookmark) *bookmarks_list,
|
||||||
@ -90,6 +96,7 @@ read_bookmarks_xbel(FILE *f)
|
|||||||
XML_Parser p;
|
XML_Parser p;
|
||||||
int done = 0;
|
int done = 0;
|
||||||
int err = 0;
|
int err = 0;
|
||||||
|
struct read_bookmarks_xbel preload;
|
||||||
|
|
||||||
readok = 0;
|
readok = 0;
|
||||||
|
|
||||||
@ -126,7 +133,12 @@ read_bookmarks_xbel(FILE *f)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!err) readok = xbeltree_to_bookmarks_list(root_node->children, NULL); /* Top node is xbel */
|
if (!err) {
|
||||||
|
preload.utf8_cp = get_cp_index("UTF-8");
|
||||||
|
readok = xbeltree_to_bookmarks_list(&preload,
|
||||||
|
root_node->children, /* Top node is xbel */
|
||||||
|
NULL);
|
||||||
|
}
|
||||||
|
|
||||||
XML_ParserFree(p);
|
XML_ParserFree(p);
|
||||||
free_xbeltree(root_node);
|
free_xbeltree(root_node);
|
||||||
@ -141,7 +153,7 @@ write_bookmarks_xbel(struct secure_save_info *ssi,
|
|||||||
/* We check for readok in filename_bookmarks_xbel(). */
|
/* We check for readok in filename_bookmarks_xbel(). */
|
||||||
|
|
||||||
secure_fputs(ssi,
|
secure_fputs(ssi,
|
||||||
"<?xml version=\"1.0\"?>\n"
|
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||||
"<!DOCTYPE xbel PUBLIC \"+//IDN python.org//DTD XML "
|
"<!DOCTYPE xbel PUBLIC \"+//IDN python.org//DTD XML "
|
||||||
"Bookmark Exchange Language 1.0//EN//XML\"\n"
|
"Bookmark Exchange Language 1.0//EN//XML\"\n"
|
||||||
" "
|
" "
|
||||||
@ -169,42 +181,20 @@ indentation(struct secure_save_info *ssi, int num)
|
|||||||
secure_fputs(ssi, " ");
|
secure_fputs(ssi, " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* FIXME This is totally broken, we should use the Unicode value in
|
|
||||||
* numeric entities.
|
|
||||||
* Additionally it is slow, not elegant, incomplete and
|
|
||||||
* if you pay enough attention you can smell the unmistakable
|
|
||||||
* odor of doom coming from it. --fabio */
|
|
||||||
static void
|
static void
|
||||||
print_xml_entities(struct secure_save_info *ssi, const unsigned char *str)
|
print_xml_entities(struct secure_save_info *ssi, const unsigned char *str)
|
||||||
{
|
{
|
||||||
#define accept_char(x) (isident((x)) || (x) == ' ' || (x) == '.' \
|
struct string entitized = NULL_STRING;
|
||||||
|| (x) == ':' || (x) == ';' \
|
|
||||||
|| (x) == '/' || (x) == '(' \
|
|
||||||
|| (x) == ')' || (x) == '}' \
|
|
||||||
|| (x) == '{' || (x) == '%' \
|
|
||||||
|| (x) == '+')
|
|
||||||
|
|
||||||
static int cp = -1;
|
if (init_string(&entitized)
|
||||||
|
&& add_html_to_string(&entitized, str, strlen(str))) {
|
||||||
if (cp == -1) cp = get_cp_index("us-ascii");
|
secure_fputs(ssi, entitized.source);
|
||||||
|
} else {
|
||||||
for (; *str; str++) {
|
secsave_errno = SS_ERR_OUT_OF_MEM;
|
||||||
if (accept_char(*str))
|
ssi->err = ENOMEM;
|
||||||
secure_fputc(ssi, *str);
|
|
||||||
else {
|
|
||||||
if (isascii(*str)) {
|
|
||||||
secure_fprintf(ssi, "&#%i;", (int) *str);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
const unsigned char *s = u2cp_no_nbsp(*str, cp);
|
|
||||||
|
|
||||||
if (s) print_xml_entities(ssi, s);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef accept_char
|
done_string(&entitized);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -226,7 +216,6 @@ write_bookmarks_list(struct secure_save_info *ssi,
|
|||||||
|
|
||||||
indentation(ssi, n + 2);
|
indentation(ssi, n + 2);
|
||||||
secure_fputs(ssi, "<title>");
|
secure_fputs(ssi, "<title>");
|
||||||
/** @todo Bug 153: bm->title should be UTF-8 */
|
|
||||||
print_xml_entities(ssi, bm->title);
|
print_xml_entities(ssi, bm->title);
|
||||||
secure_fputs(ssi, "</title>\n");
|
secure_fputs(ssi, "</title>\n");
|
||||||
|
|
||||||
@ -239,13 +228,11 @@ write_bookmarks_list(struct secure_save_info *ssi,
|
|||||||
} else if (bm->box_item->type == BI_LEAF) {
|
} else if (bm->box_item->type == BI_LEAF) {
|
||||||
|
|
||||||
secure_fputs(ssi, "<bookmark href=\"");
|
secure_fputs(ssi, "<bookmark href=\"");
|
||||||
/** @todo Bug 1066: bm->url should be UTF-8 */
|
|
||||||
print_xml_entities(ssi, bm->url);
|
print_xml_entities(ssi, bm->url);
|
||||||
secure_fputs(ssi, "\">\n");
|
secure_fputs(ssi, "\">\n");
|
||||||
|
|
||||||
indentation(ssi, n + 2);
|
indentation(ssi, n + 2);
|
||||||
secure_fputs(ssi, "<title>");
|
secure_fputs(ssi, "<title>");
|
||||||
/** @todo Bug 153: bm->title should be UTF-8 */
|
|
||||||
print_xml_entities(ssi, bm->title);
|
print_xml_entities(ssi, bm->title);
|
||||||
secure_fputs(ssi, "</title>\n");
|
secure_fputs(ssi, "</title>\n");
|
||||||
|
|
||||||
@ -315,25 +302,34 @@ on_element_close(void *data, const char *name)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static unsigned char *
|
static unsigned char *
|
||||||
delete_whites(unsigned char *s)
|
delete_whites(const unsigned char *s)
|
||||||
{
|
{
|
||||||
unsigned char *r;
|
unsigned char *r;
|
||||||
int count = 0, c = 0, i;
|
int last_was_space = 0, c = 0, i;
|
||||||
int len = strlen(s);
|
int len = strlen(s);
|
||||||
|
|
||||||
r = mem_alloc(len + 1);
|
r = mem_alloc(len + 1);
|
||||||
if (!r) return NULL;
|
if (!r) return NULL;
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; i++) {
|
||||||
if (isspace(s[i])) {
|
/* Recognize only the whitespace characters listed
|
||||||
if (count == 1) continue;
|
* in section 2.3 of XML 1.1. U+0085 and U+2028 need
|
||||||
else count = 1;
|
* not be recognized here because section 2.11 says
|
||||||
}
|
* the XML processor must translate them to U+000A.
|
||||||
else count = 0;
|
* Do not use isspace() because the string is in UTF-8
|
||||||
|
* and individual bytes might not be characters at
|
||||||
if (s[i] == '\n' || s[i] == '\t')
|
* all. */
|
||||||
|
switch (s[i]) {
|
||||||
|
case '\x20': case '\x09': case '\x0D': case '\x0A':
|
||||||
|
if (last_was_space) continue;
|
||||||
|
last_was_space = 1;
|
||||||
r[c++] = ' ';
|
r[c++] = ' ';
|
||||||
else r[c++] = s[i];
|
break;
|
||||||
|
default:
|
||||||
|
last_was_space = 0;
|
||||||
|
r[c++] = s[i];
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
r[c] = '\0';
|
r[c] = '\0';
|
||||||
@ -370,7 +366,8 @@ on_text(void *data, const XML_Char *text, int len)
|
|||||||
/* xbel_tree_to_bookmarks_list: returns 0 on fail,
|
/* xbel_tree_to_bookmarks_list: returns 0 on fail,
|
||||||
* 1 on success */
|
* 1 on success */
|
||||||
static int
|
static int
|
||||||
xbeltree_to_bookmarks_list(struct tree_node *node,
|
xbeltree_to_bookmarks_list(const struct read_bookmarks_xbel *preload,
|
||||||
|
struct tree_node *node,
|
||||||
struct bookmark *current_parent)
|
struct bookmark *current_parent)
|
||||||
{
|
{
|
||||||
struct bookmark *tmp;
|
struct bookmark *tmp;
|
||||||
@ -384,8 +381,7 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
|
|||||||
title = get_child(node, "title");
|
title = get_child(node, "title");
|
||||||
href = get_attribute_value(node, "href");
|
href = get_attribute_value(node, "href");
|
||||||
|
|
||||||
/** @todo Bugs 153, 1066: add_bookmark()
|
intl_set_charset_by_index(preload->utf8_cp);
|
||||||
* expects UTF-8. */
|
|
||||||
tmp = add_bookmark(current_parent, 0,
|
tmp = add_bookmark(current_parent, 0,
|
||||||
/* The <title> element is optional */
|
/* The <title> element is optional */
|
||||||
title && title->text ? title->text
|
title && title->text ? title->text
|
||||||
@ -408,7 +404,7 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
|
|||||||
|
|
||||||
title = get_child(node, "title");
|
title = get_child(node, "title");
|
||||||
|
|
||||||
/** @todo Bug 153: add_bookmark() expects UTF-8. */
|
intl_set_charset_by_index(preload->utf8_cp);
|
||||||
tmp = add_bookmark(current_parent, 0,
|
tmp = add_bookmark(current_parent, 0,
|
||||||
title && title->text ? title->text
|
title && title->text ? title->text
|
||||||
: (unsigned char *) gettext("No title"),
|
: (unsigned char *) gettext("No title"),
|
||||||
@ -434,14 +430,18 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
|
|||||||
|
|
||||||
if (node->children) {
|
if (node->children) {
|
||||||
int ret;
|
int ret;
|
||||||
|
struct bookmark *parent_for_nested;
|
||||||
|
|
||||||
/* If this node is a <folder> element, current parent
|
/* If this node is a <folder> element, current parent
|
||||||
* changes */
|
* changes */
|
||||||
ret = (!strcmp(node->name, "folder") ?
|
if (!strcmp(node->name, "folder"))
|
||||||
xbeltree_to_bookmarks_list(node->children,
|
parent_for_nested = lastbm;
|
||||||
lastbm) :
|
else
|
||||||
xbeltree_to_bookmarks_list(node->children,
|
parent_for_nested = current_parent;
|
||||||
current_parent));
|
|
||||||
|
ret = xbeltree_to_bookmarks_list(preload,
|
||||||
|
node->children,
|
||||||
|
parent_for_nested);
|
||||||
/* Out of memory */
|
/* Out of memory */
|
||||||
if (!ret) return 0;
|
if (!ret) return 0;
|
||||||
}
|
}
|
||||||
|
@ -54,14 +54,13 @@ static struct option_info bookmark_options_info[] = {
|
|||||||
"file_format", 0, 0, 1, 0,
|
"file_format", 0, 0, 1, 0,
|
||||||
N_("File format for bookmarks (affects both reading and saving):\n"
|
N_("File format for bookmarks (affects both reading and saving):\n"
|
||||||
"0 is the default native ELinks format\n"
|
"0 is the default native ELinks format\n"
|
||||||
"1 is XBEL universal XML bookmarks format (ELinks bug 153: NO NATIONAL CHARS SUPPORT!)")),
|
"1 is XBEL universal XML bookmarks format")),
|
||||||
#else
|
#else
|
||||||
INIT_OPT_INT("bookmarks", N_("File format"),
|
INIT_OPT_INT("bookmarks", N_("File format"),
|
||||||
"file_format", 0, 0, 1, 0,
|
"file_format", 0, 0, 1, 0,
|
||||||
N_("File format for bookmarks (affects both reading and saving):\n"
|
N_("File format for bookmarks (affects both reading and saving):\n"
|
||||||
"0 is the default native ELinks format\n"
|
"0 is the default native ELinks format\n"
|
||||||
"1 is XBEL universal XML bookmarks format (ELinks bug 153: NO NATIONAL CHARS SUPPORT!)"
|
"1 is XBEL universal XML bookmarks format (DISABLED)")),
|
||||||
" (DISABLED)")),
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
INIT_OPT_BOOL("bookmarks", N_("Save folder state"),
|
INIT_OPT_BOOL("bookmarks", N_("Save folder state"),
|
||||||
|
@ -45,6 +45,10 @@ n_(unsigned char *msg1, unsigned char *msg2, unsigned long int n, struct termina
|
|||||||
return gettext_noop(msg1);
|
return gettext_noop(msg1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
intl_set_charset_by_index(int new_charset)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@ -59,10 +63,8 @@ extern int current_charset;
|
|||||||
/* #define DEBUG_IT */
|
/* #define DEBUG_IT */
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
intl_set_charset(struct terminal *term)
|
intl_set_charset_by_index(int new_charset)
|
||||||
{
|
{
|
||||||
int new_charset = get_terminal_codepage(term);
|
|
||||||
|
|
||||||
/* Prevent useless switching. */
|
/* Prevent useless switching. */
|
||||||
if (current_charset != new_charset) {
|
if (current_charset != new_charset) {
|
||||||
bind_textdomain_codeset( /* PACKAGE */ "elinks",
|
bind_textdomain_codeset( /* PACKAGE */ "elinks",
|
||||||
@ -71,6 +73,14 @@ intl_set_charset(struct terminal *term)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
intl_set_charset(struct terminal *term)
|
||||||
|
{
|
||||||
|
int new_charset = get_terminal_codepage(term);
|
||||||
|
|
||||||
|
intl_set_charset_by_index(new_charset);
|
||||||
|
}
|
||||||
|
|
||||||
/* TODO: Ideally, we should internally work only in Unicode - then the need for
|
/* TODO: Ideally, we should internally work only in Unicode - then the need for
|
||||||
* charsets multiplexing would cease. That'll take some work yet, though.
|
* charsets multiplexing would cease. That'll take some work yet, though.
|
||||||
* --pasky */
|
* --pasky */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user