Add Thai dictionary from ICU for line breaking
This commit is contained in:
parent
e344a94946
commit
59ccef98fb
26393
data/thaidict.txt
Normal file
26393
data/thaidict.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -395,6 +395,7 @@ void FontManager::shape(const std::u32string& text,
|
||||
rtl_char.resize(str.size(), false);
|
||||
breakable.resize(str.size(), false);
|
||||
LineBreakingRules::insertBreakMark(str, breakable);
|
||||
translations->insertThaiBreakMark(str, breakable);
|
||||
RTLRules::insertRTLMark(str, rtl_line, rtl_char);
|
||||
size_t count = 0;
|
||||
raqm_glyph_t* glyphs = raqm_get_glyphs(rq, &count);
|
||||
|
@ -35,6 +35,8 @@
|
||||
#include <cwchar>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "config/user_config.hpp"
|
||||
#include "io/file_manager.hpp"
|
||||
@ -62,6 +64,16 @@ Translations* translations = NULL;
|
||||
std::map<std::string, std::string> Translations::m_localized_name;
|
||||
std::map<std::string, std::map<std::string, irr::core::stringw> >
|
||||
Translations::m_localized_country_codes;
|
||||
// ============================================================================
|
||||
// Thai word dictionary consulted when inserting line-break marks. It is keyed
// by the first code point of a word; the value pairs the set of all dictionary
// words starting with that code point with the length (in code points) of the
// longest word in that set.
std::unordered_map<char32_t,
|
||||
std::pair<std::unordered_set<std::u32string>, size_t> > g_thai_dict;
|
||||
// ============================================================================
|
||||
/** Tells whether a code point belongs to the range U+0E00..U+0E7F (both ends
 *  inclusive), i.e. the range this file treats as Thai.
 *  \param c Code point to test.
 *  \return True if \a c lies inside that range.
 */
constexpr bool isThaiCP(char32_t c)
{
    return !(c < 0x0e00 || c > 0x0e7f);
}   // isThaiCP
|
||||
|
||||
// ============================================================================
|
||||
|
||||
const bool REMOVE_BOM = false;
|
||||
using namespace tinygettext;
|
||||
@ -202,6 +214,50 @@ Translations::Translations() //: m_dictionary_manager("UTF-16")
|
||||
Log::error("translation", "%s", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
// Fill the global Thai dictionary once. Every line of thaidict.txt that
// starts with a Thai code point is stored under that first code point,
// together with the length of the longest word sharing it.
if (g_thai_dict.empty())
{
    const std::string file_name = file_manager->getAsset("thaidict.txt");
    try
    {
        std::ifstream in(file_name.c_str());
        // The stream object itself always exists, so the open state of the
        // file is what has to be tested to detect a missing dictionary
        if (!in.is_open())
        {
            Log::error("translation", "error: failure opening: '%s'.",
                file_name.c_str());
        }
        else
        {
            std::string line;
            // NOTE(review): if safeGetline sets eof while reading a final
            // line without a trailing newline, that line is skipped here -
            // confirm against the helper's implementation
            while (!StringUtils::safeGetline(in, line).eof())
            {
                const std::u32string u32line = StringUtils::utf8ToUtf32(line);
                // Ignore blank lines and anything not starting with Thai
                if (u32line.empty() || !isThaiCP(u32line[0]))
                    continue;

                // operator[] creates an empty word set with a longest
                // length of 0 the first time a code point is seen
                auto& entry = g_thai_dict[u32line[0]];
                entry.first.insert(u32line);
                if (entry.second < u32line.size())
                    entry.second = u32line.size();
            }
        }
    }
    catch (const std::exception& e)
    {
        Log::error("translation", "error: failure extract Thai dictionary.");
        Log::error("translation", "%s", e.what());
    }
}
|
||||
// LC_ALL does not work, sscanf will then not always be able
|
||||
// to scan for example: s=-1.1,-2.3,-3.3 correctly, which is
|
||||
// used in driveline files.
|
||||
@ -531,4 +587,46 @@ irr::core::stringw Translations::getLocalizedCountryName(const std::string& coun
|
||||
return StringUtils::utf8ToWide(country_code);
|
||||
} // getLocalizedCountryName
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
/* Insert breakmark to thai sentence according to thai word dictionary, which
|
||||
* adds a mark in the begining of a thai vocabulary
|
||||
*/
|
||||
void Translations::insertThaiBreakMark(const std::u32string& thai,
|
||||
std::vector<bool>& breakable)
|
||||
{
|
||||
if (thai.size() < 3)
|
||||
return;
|
||||
for (size_t i = 0; i < thai.size();)
|
||||
{
|
||||
char32_t t = thai[i];
|
||||
if (i >= thai.size() - 2 || !isThaiCP(t))
|
||||
{
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
auto ret = g_thai_dict.find(t);
|
||||
if (ret == g_thai_dict.end())
|
||||
{
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
size_t checked_word = 1;
|
||||
const size_t max_checking_word = ret->second.second;
|
||||
for (size_t j = i + 1;; j++)
|
||||
{
|
||||
if (j - i > max_checking_word || j > thai.size())
|
||||
break;
|
||||
const std::u32string& ss = thai.substr(i, j - i);
|
||||
if (ret->second.first.find(ss) != ret->second.first.end())
|
||||
{
|
||||
if (ss.size() > checked_word)
|
||||
checked_word = ss.size();
|
||||
if (i != 0)
|
||||
breakable[i - 1] = true;
|
||||
}
|
||||
}
|
||||
i += checked_word;
|
||||
}
|
||||
} // insertThaiBreakMark
|
||||
|
||||
#endif
|
||||
|
@ -80,6 +80,8 @@ public:
|
||||
const std::string& getLocalizedName(const std::string& str) const;
|
||||
|
||||
irr::core::stringw getLocalizedCountryName(const std::string& country_code) const;
|
||||
|
||||
void insertThaiBreakMark(const std::u32string& thai, std::vector<bool>& breakable);
|
||||
#endif
|
||||
}; // Translations
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user