1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-12-04 14:46:47 -05:00

bug 153, 1066: Convert XBEL bookmarks to/from UTF-8.

When the file is being read, Expat provides the strings to ELinks in
UTF-8, so ELinks can put them in struct bookmark without conversions.
Make sure gettext returns any placeholder strings in UTF-8, too.
Replace '\r' with ' ' in bookmark titles and URLs.

When the file is being written, put encoding="UTF-8" in the XML
declaration, and then write out the strings from struct bookmark
without character set conversions.  Do replace some characters
with entity references though, by calling add_html_to_string().
This commit is contained in:
Kalle Olavi Niemitalo 2009-01-04 13:59:04 +02:00 committed by Kalle Olavi Niemitalo
parent 8c0ae2a215
commit 73f925ce21
3 changed files with 70 additions and 61 deletions

View File

@ -11,6 +11,7 @@
#endif /* HAVE_CONFIG_H */
#include <ctype.h>
#include <errno.h>
#include <expat.h>
#include <stdio.h>
#include <stdlib.h>
@ -54,9 +55,14 @@ static unsigned char *get_attribute_value(struct tree_node *node,
unsigned char *name);
struct read_bookmarks_xbel {
int utf8_cp;
};
static void read_bookmarks_xbel(FILE *f);
static unsigned char * filename_bookmarks_xbel(int writing);
static int xbeltree_to_bookmarks_list(struct tree_node *root,
static int xbeltree_to_bookmarks_list(const struct read_bookmarks_xbel *preload,
struct tree_node *root,
struct bookmark *current_parent);
static void write_bookmarks_list(struct secure_save_info *ssi,
LIST_OF(struct bookmark) *bookmarks_list,
@ -90,6 +96,7 @@ read_bookmarks_xbel(FILE *f)
XML_Parser p;
int done = 0;
int err = 0;
struct read_bookmarks_xbel preload;
readok = 0;
@ -126,7 +133,12 @@ read_bookmarks_xbel(FILE *f)
}
}
if (!err) readok = xbeltree_to_bookmarks_list(root_node->children, NULL); /* Top node is xbel */
if (!err) {
preload.utf8_cp = get_cp_index("UTF-8");
readok = xbeltree_to_bookmarks_list(&preload,
root_node->children, /* Top node is xbel */
NULL);
}
XML_ParserFree(p);
free_xbeltree(root_node);
@ -141,7 +153,7 @@ write_bookmarks_xbel(struct secure_save_info *ssi,
/* We check for readok in filename_bookmarks_xbel(). */
secure_fputs(ssi,
"<?xml version=\"1.0\"?>\n"
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<!DOCTYPE xbel PUBLIC \"+//IDN python.org//DTD XML "
"Bookmark Exchange Language 1.0//EN//XML\"\n"
" "
@ -169,42 +181,20 @@ indentation(struct secure_save_info *ssi, int num)
secure_fputs(ssi, " ");
}
/* FIXME This is totally broken, we should use the Unicode value in
* numeric entities.
* Additionally it is slow, not elegant, incomplete and
* if you pay enough attention you can smell the unmistakable
* odor of doom coming from it. --fabio */
static void
print_xml_entities(struct secure_save_info *ssi, const unsigned char *str)
{
#define accept_char(x) (isident((x)) || (x) == ' ' || (x) == '.' \
|| (x) == ':' || (x) == ';' \
|| (x) == '/' || (x) == '(' \
|| (x) == ')' || (x) == '}' \
|| (x) == '{' || (x) == '%' \
|| (x) == '+')
struct string entitized = NULL_STRING;
static int cp = -1;
if (cp == -1) cp = get_cp_index("us-ascii");
for (; *str; str++) {
if (accept_char(*str))
secure_fputc(ssi, *str);
else {
if (isascii(*str)) {
secure_fprintf(ssi, "&#%i;", (int) *str);
}
else {
const unsigned char *s = u2cp_no_nbsp(*str, cp);
if (s) print_xml_entities(ssi, s);
}
}
if (init_string(&entitized)
&& add_html_to_string(&entitized, str, strlen(str))) {
secure_fputs(ssi, entitized.source);
} else {
secsave_errno = SS_ERR_OUT_OF_MEM;
ssi->err = ENOMEM;
}
#undef accept_char
done_string(&entitized);
}
static void
@ -226,7 +216,6 @@ write_bookmarks_list(struct secure_save_info *ssi,
indentation(ssi, n + 2);
secure_fputs(ssi, "<title>");
/** @todo Bug 153: bm->title should be UTF-8 */
print_xml_entities(ssi, bm->title);
secure_fputs(ssi, "</title>\n");
@ -239,13 +228,11 @@ write_bookmarks_list(struct secure_save_info *ssi,
} else if (bm->box_item->type == BI_LEAF) {
secure_fputs(ssi, "<bookmark href=\"");
/** @todo Bug 1066: bm->url should be UTF-8 */
print_xml_entities(ssi, bm->url);
secure_fputs(ssi, "\">\n");
indentation(ssi, n + 2);
secure_fputs(ssi, "<title>");
/** @todo Bug 153: bm->title should be UTF-8 */
print_xml_entities(ssi, bm->title);
secure_fputs(ssi, "</title>\n");
@ -315,25 +302,34 @@ on_element_close(void *data, const char *name)
}
static unsigned char *
delete_whites(unsigned char *s)
delete_whites(const unsigned char *s)
{
unsigned char *r;
int count = 0, c = 0, i;
int last_was_space = 0, c = 0, i;
int len = strlen(s);
r = mem_alloc(len + 1);
if (!r) return NULL;
for (i = 0; i < len; i++) {
if (isspace(s[i])) {
if (count == 1) continue;
else count = 1;
}
else count = 0;
if (s[i] == '\n' || s[i] == '\t')
/* Recognize only the whitespace characters listed
* in section 2.3 of XML 1.1. U+0085 and U+2028 need
* not be recognized here because section 2.11 says
* the XML processor must translate them to U+000A.
* Do not use isspace() because the string is in UTF-8
* and individual bytes might not be characters at
* all. */
switch (s[i]) {
case '\x20': case '\x09': case '\x0D': case '\x0A':
if (last_was_space) continue;
last_was_space = 1;
r[c++] = ' ';
else r[c++] = s[i];
break;
default:
last_was_space = 0;
r[c++] = s[i];
break;
}
}
r[c] = '\0';
@ -370,7 +366,8 @@ on_text(void *data, const XML_Char *text, int len)
/* xbel_tree_to_bookmarks_list: returns 0 on fail,
* 1 on success */
static int
xbeltree_to_bookmarks_list(struct tree_node *node,
xbeltree_to_bookmarks_list(const struct read_bookmarks_xbel *preload,
struct tree_node *node,
struct bookmark *current_parent)
{
struct bookmark *tmp;
@ -384,8 +381,7 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
title = get_child(node, "title");
href = get_attribute_value(node, "href");
/** @todo Bugs 153, 1066: add_bookmark()
* expects UTF-8. */
intl_set_charset_by_index(preload->utf8_cp);
tmp = add_bookmark(current_parent, 0,
/* The <title> element is optional */
title && title->text ? title->text
@ -408,7 +404,7 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
title = get_child(node, "title");
/** @todo Bug 153: add_bookmark() expects UTF-8. */
intl_set_charset_by_index(preload->utf8_cp);
tmp = add_bookmark(current_parent, 0,
title && title->text ? title->text
: (unsigned char *) gettext("No title"),
@ -434,14 +430,18 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
if (node->children) {
int ret;
struct bookmark *parent_for_nested;
/* If this node is a <folder> element, current parent
* changes */
ret = (!strcmp(node->name, "folder") ?
xbeltree_to_bookmarks_list(node->children,
lastbm) :
xbeltree_to_bookmarks_list(node->children,
current_parent));
if (!strcmp(node->name, "folder"))
parent_for_nested = lastbm;
else
parent_for_nested = current_parent;
ret = xbeltree_to_bookmarks_list(preload,
node->children,
parent_for_nested);
/* Out of memory */
if (!ret) return 0;
}

View File

@ -54,14 +54,13 @@ static struct option_info bookmark_options_info[] = {
"file_format", 0, 0, 1, 0,
N_("File format for bookmarks (affects both reading and saving):\n"
"0 is the default native ELinks format\n"
"1 is XBEL universal XML bookmarks format (ELinks bug 153: NO NATIONAL CHARS SUPPORT!)")),
"1 is XBEL universal XML bookmarks format")),
#else
INIT_OPT_INT("bookmarks", N_("File format"),
"file_format", 0, 0, 1, 0,
N_("File format for bookmarks (affects both reading and saving):\n"
"0 is the default native ELinks format\n"
"1 is XBEL universal XML bookmarks format (ELinks bug 153: NO NATIONAL CHARS SUPPORT!)"
" (DISABLED)")),
"1 is XBEL universal XML bookmarks format (DISABLED)")),
#endif
INIT_OPT_BOOL("bookmarks", N_("Save folder state"),

View File

@ -45,6 +45,10 @@ n_(unsigned char *msg1, unsigned char *msg2, unsigned long int n, struct termina
return gettext_noop(msg1);
}
static inline void
intl_set_charset_by_index(int new_charset)
{
}
#else
@ -59,10 +63,8 @@ extern int current_charset;
/* #define DEBUG_IT */
static inline void
intl_set_charset(struct terminal *term)
intl_set_charset_by_index(int new_charset)
{
int new_charset = get_terminal_codepage(term);
/* Prevent useless switching. */
if (current_charset != new_charset) {
bind_textdomain_codeset( /* PACKAGE */ "elinks",
@ -71,6 +73,14 @@ intl_set_charset(struct terminal *term)
}
}
static inline void
intl_set_charset(struct terminal *term)
{
int new_charset = get_terminal_codepage(term);
intl_set_charset_by_index(new_charset);
}
/* TODO: Ideally, we should internally work only in Unicode - then the need for
* charsets multiplexing would cease. That'll take some work yet, though.
* --pasky */