mirror of
https://github.com/rkd77/elinks.git
synced 2024-12-04 14:46:47 -05:00
bug 153, 1066: Convert XBEL bookmarks to/from UTF-8.
When the file is being read, Expat provides the strings to ELinks in UTF-8, so ELinks can put them in struct bookmark without conversions. Make sure gettext returns any placeholder strings in UTF-8, too. Replace '\r' with ' ' in bookmark titles and URLs. When the file is being written, put encoding="UTF-8" in the XML declaration, and then write out the strings from struct bookmark without character set conversions. Some characters are still replaced with entity references, though, by calling add_html_to_string().
This commit is contained in:
parent
8c0ae2a215
commit
73f925ce21
@ -11,6 +11,7 @@
|
||||
#endif /* HAVE_CONFIG_H */
|
||||
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <expat.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@ -54,9 +55,14 @@ static unsigned char *get_attribute_value(struct tree_node *node,
|
||||
unsigned char *name);
|
||||
|
||||
|
||||
/* Read-time context for the XBEL loader: read_bookmarks_xbel() fills one
 * of these in and passes a pointer to it down through
 * xbeltree_to_bookmarks_list(). */
struct read_bookmarks_xbel {
|
||||
/* Codepage index obtained from get_cp_index() for UTF-8. It is handed to
 * intl_set_charset_by_index() right before each add_bookmark() call, so
 * that placeholder strings fetched through gettext() come back in UTF-8
 * like the strings Expat delivers. */
int utf8_cp;
|
||||
};
|
||||
|
||||
static void read_bookmarks_xbel(FILE *f);
|
||||
static unsigned char * filename_bookmarks_xbel(int writing);
|
||||
static int xbeltree_to_bookmarks_list(struct tree_node *root,
|
||||
static int xbeltree_to_bookmarks_list(const struct read_bookmarks_xbel *preload,
|
||||
struct tree_node *root,
|
||||
struct bookmark *current_parent);
|
||||
static void write_bookmarks_list(struct secure_save_info *ssi,
|
||||
LIST_OF(struct bookmark) *bookmarks_list,
|
||||
@ -90,6 +96,7 @@ read_bookmarks_xbel(FILE *f)
|
||||
XML_Parser p;
|
||||
int done = 0;
|
||||
int err = 0;
|
||||
struct read_bookmarks_xbel preload;
|
||||
|
||||
readok = 0;
|
||||
|
||||
@ -126,7 +133,12 @@ read_bookmarks_xbel(FILE *f)
|
||||
}
|
||||
}
|
||||
|
||||
if (!err) readok = xbeltree_to_bookmarks_list(root_node->children, NULL); /* Top node is xbel */
|
||||
if (!err) {
|
||||
preload.utf8_cp = get_cp_index("UTF-8");
|
||||
readok = xbeltree_to_bookmarks_list(&preload,
|
||||
root_node->children, /* Top node is xbel */
|
||||
NULL);
|
||||
}
|
||||
|
||||
XML_ParserFree(p);
|
||||
free_xbeltree(root_node);
|
||||
@ -141,7 +153,7 @@ write_bookmarks_xbel(struct secure_save_info *ssi,
|
||||
/* We check for readok in filename_bookmarks_xbel(). */
|
||||
|
||||
secure_fputs(ssi,
|
||||
"<?xml version=\"1.0\"?>\n"
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||
"<!DOCTYPE xbel PUBLIC \"+//IDN python.org//DTD XML "
|
||||
"Bookmark Exchange Language 1.0//EN//XML\"\n"
|
||||
" "
|
||||
@ -169,42 +181,20 @@ indentation(struct secure_save_info *ssi, int num)
|
||||
secure_fputs(ssi, " ");
|
||||
}
|
||||
|
||||
/* FIXME This is totally broken, we should use the Unicode value in
|
||||
* numeric entities.
|
||||
* Additionally it is slow, not elegant, incomplete and
|
||||
* if you pay enough attention you can smell the unmistakable
|
||||
* odor of doom coming from it. --fabio */
|
||||
static void
|
||||
print_xml_entities(struct secure_save_info *ssi, const unsigned char *str)
|
||||
{
|
||||
#define accept_char(x) (isident((x)) || (x) == ' ' || (x) == '.' \
|
||||
|| (x) == ':' || (x) == ';' \
|
||||
|| (x) == '/' || (x) == '(' \
|
||||
|| (x) == ')' || (x) == '}' \
|
||||
|| (x) == '{' || (x) == '%' \
|
||||
|| (x) == '+')
|
||||
struct string entitized = NULL_STRING;
|
||||
|
||||
static int cp = -1;
|
||||
|
||||
if (cp == -1) cp = get_cp_index("us-ascii");
|
||||
|
||||
for (; *str; str++) {
|
||||
if (accept_char(*str))
|
||||
secure_fputc(ssi, *str);
|
||||
else {
|
||||
if (isascii(*str)) {
|
||||
secure_fprintf(ssi, "&#%i;", (int) *str);
|
||||
}
|
||||
else {
|
||||
const unsigned char *s = u2cp_no_nbsp(*str, cp);
|
||||
|
||||
if (s) print_xml_entities(ssi, s);
|
||||
}
|
||||
}
|
||||
if (init_string(&entitized)
|
||||
&& add_html_to_string(&entitized, str, strlen(str))) {
|
||||
secure_fputs(ssi, entitized.source);
|
||||
} else {
|
||||
secsave_errno = SS_ERR_OUT_OF_MEM;
|
||||
ssi->err = ENOMEM;
|
||||
}
|
||||
|
||||
#undef accept_char
|
||||
|
||||
done_string(&entitized);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -226,7 +216,6 @@ write_bookmarks_list(struct secure_save_info *ssi,
|
||||
|
||||
indentation(ssi, n + 2);
|
||||
secure_fputs(ssi, "<title>");
|
||||
/** @todo Bug 153: bm->title should be UTF-8 */
|
||||
print_xml_entities(ssi, bm->title);
|
||||
secure_fputs(ssi, "</title>\n");
|
||||
|
||||
@ -239,13 +228,11 @@ write_bookmarks_list(struct secure_save_info *ssi,
|
||||
} else if (bm->box_item->type == BI_LEAF) {
|
||||
|
||||
secure_fputs(ssi, "<bookmark href=\"");
|
||||
/** @todo Bug 1066: bm->url should be UTF-8 */
|
||||
print_xml_entities(ssi, bm->url);
|
||||
secure_fputs(ssi, "\">\n");
|
||||
|
||||
indentation(ssi, n + 2);
|
||||
secure_fputs(ssi, "<title>");
|
||||
/** @todo Bug 153: bm->title should be UTF-8 */
|
||||
print_xml_entities(ssi, bm->title);
|
||||
secure_fputs(ssi, "</title>\n");
|
||||
|
||||
@ -315,25 +302,34 @@ on_element_close(void *data, const char *name)
|
||||
}
|
||||
|
||||
static unsigned char *
|
||||
delete_whites(unsigned char *s)
|
||||
delete_whites(const unsigned char *s)
|
||||
{
|
||||
unsigned char *r;
|
||||
int count = 0, c = 0, i;
|
||||
int last_was_space = 0, c = 0, i;
|
||||
int len = strlen(s);
|
||||
|
||||
r = mem_alloc(len + 1);
|
||||
if (!r) return NULL;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
if (isspace(s[i])) {
|
||||
if (count == 1) continue;
|
||||
else count = 1;
|
||||
}
|
||||
else count = 0;
|
||||
|
||||
if (s[i] == '\n' || s[i] == '\t')
|
||||
/* Recognize only the whitespace characters listed
|
||||
* in section 2.3 of XML 1.1. U+0085 and U+2028 need
|
||||
* not be recognized here because section 2.11 says
|
||||
* the XML processor must translate them to U+000A.
|
||||
* Do not use isspace() because the string is in UTF-8
|
||||
* and individual bytes might not be characters at
|
||||
* all. */
|
||||
switch (s[i]) {
|
||||
case '\x20': case '\x09': case '\x0D': case '\x0A':
|
||||
if (last_was_space) continue;
|
||||
last_was_space = 1;
|
||||
r[c++] = ' ';
|
||||
else r[c++] = s[i];
|
||||
break;
|
||||
default:
|
||||
last_was_space = 0;
|
||||
r[c++] = s[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
r[c] = '\0';
|
||||
@ -370,7 +366,8 @@ on_text(void *data, const XML_Char *text, int len)
|
||||
/* xbel_tree_to_bookmarks_list: returns 0 on fail,
|
||||
* 1 on success */
|
||||
static int
|
||||
xbeltree_to_bookmarks_list(struct tree_node *node,
|
||||
xbeltree_to_bookmarks_list(const struct read_bookmarks_xbel *preload,
|
||||
struct tree_node *node,
|
||||
struct bookmark *current_parent)
|
||||
{
|
||||
struct bookmark *tmp;
|
||||
@ -384,8 +381,7 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
|
||||
title = get_child(node, "title");
|
||||
href = get_attribute_value(node, "href");
|
||||
|
||||
/** @todo Bugs 153, 1066: add_bookmark()
|
||||
* expects UTF-8. */
|
||||
intl_set_charset_by_index(preload->utf8_cp);
|
||||
tmp = add_bookmark(current_parent, 0,
|
||||
/* The <title> element is optional */
|
||||
title && title->text ? title->text
|
||||
@ -408,7 +404,7 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
|
||||
|
||||
title = get_child(node, "title");
|
||||
|
||||
/** @todo Bug 153: add_bookmark() expects UTF-8. */
|
||||
intl_set_charset_by_index(preload->utf8_cp);
|
||||
tmp = add_bookmark(current_parent, 0,
|
||||
title && title->text ? title->text
|
||||
: (unsigned char *) gettext("No title"),
|
||||
@ -434,14 +430,18 @@ xbeltree_to_bookmarks_list(struct tree_node *node,
|
||||
|
||||
if (node->children) {
|
||||
int ret;
|
||||
struct bookmark *parent_for_nested;
|
||||
|
||||
/* If this node is a <folder> element, current parent
|
||||
* changes */
|
||||
ret = (!strcmp(node->name, "folder") ?
|
||||
xbeltree_to_bookmarks_list(node->children,
|
||||
lastbm) :
|
||||
xbeltree_to_bookmarks_list(node->children,
|
||||
current_parent));
|
||||
if (!strcmp(node->name, "folder"))
|
||||
parent_for_nested = lastbm;
|
||||
else
|
||||
parent_for_nested = current_parent;
|
||||
|
||||
ret = xbeltree_to_bookmarks_list(preload,
|
||||
node->children,
|
||||
parent_for_nested);
|
||||
/* Out of memory */
|
||||
if (!ret) return 0;
|
||||
}
|
||||
|
@ -54,14 +54,13 @@ static struct option_info bookmark_options_info[] = {
|
||||
"file_format", 0, 0, 1, 0,
|
||||
N_("File format for bookmarks (affects both reading and saving):\n"
|
||||
"0 is the default native ELinks format\n"
|
||||
"1 is XBEL universal XML bookmarks format (ELinks bug 153: NO NATIONAL CHARS SUPPORT!)")),
|
||||
"1 is XBEL universal XML bookmarks format")),
|
||||
#else
|
||||
INIT_OPT_INT("bookmarks", N_("File format"),
|
||||
"file_format", 0, 0, 1, 0,
|
||||
N_("File format for bookmarks (affects both reading and saving):\n"
|
||||
"0 is the default native ELinks format\n"
|
||||
"1 is XBEL universal XML bookmarks format (ELinks bug 153: NO NATIONAL CHARS SUPPORT!)"
|
||||
" (DISABLED)")),
|
||||
"1 is XBEL universal XML bookmarks format (DISABLED)")),
|
||||
#endif
|
||||
|
||||
INIT_OPT_BOOL("bookmarks", N_("Save folder state"),
|
||||
|
@ -45,6 +45,10 @@ n_(unsigned char *msg1, unsigned char *msg2, unsigned long int n, struct termina
|
||||
return gettext_noop(msg1);
|
||||
}
|
||||
|
||||
/* Do-nothing variant of intl_set_charset_by_index(), defined in the
 * branch of this header that precedes the #else. It takes the charset
 * index and ignores it.
 * NOTE(review): presumably this is the build without native-language
 * support, and the stub exists so that callers (such as the XBEL reader)
 * can call it without their own #ifdefs -- confirm against the enclosing
 * #if, which is not visible here. */
static inline void
|
||||
intl_set_charset_by_index(int new_charset)
|
||||
{
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@ -59,10 +63,8 @@ extern int current_charset;
|
||||
/* #define DEBUG_IT */
|
||||
|
||||
static inline void
|
||||
intl_set_charset(struct terminal *term)
|
||||
intl_set_charset_by_index(int new_charset)
|
||||
{
|
||||
int new_charset = get_terminal_codepage(term);
|
||||
|
||||
/* Prevent useless switching. */
|
||||
if (current_charset != new_charset) {
|
||||
bind_textdomain_codeset( /* PACKAGE */ "elinks",
|
||||
@ -71,6 +73,14 @@ intl_set_charset(struct terminal *term)
|
||||
}
|
||||
}
|
||||
|
||||
/* Switch the gettext charset to the one used by @term.
 * Looks up the terminal's codepage index with get_terminal_codepage()
 * and delegates to intl_set_charset_by_index(), which only rebinds the
 * text domain codeset when the index differs from current_charset. */
static inline void
|
||||
intl_set_charset(struct terminal *term)
|
||||
{
|
||||
/* Codepage index of the terminal, as reported by get_terminal_codepage(). */
int new_charset = get_terminal_codepage(term);
|
||||
|
|
||||
intl_set_charset_by_index(new_charset);
|
||||
}
|
||||
|
||||
/* TODO: Ideally, we should internally work only in Unicode - then the need for
|
||||
* charsets multiplexing would cease. That'll take some work yet, though.
|
||||
* --pasky */
|
||||
|
Loading…
Reference in New Issue
Block a user