Add Thai dictionary from ICU for line breaking
This commit is contained in:
parent
e344a94946
commit
59ccef98fb
26393
data/thaidict.txt
Normal file
26393
data/thaidict.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -395,6 +395,7 @@ void FontManager::shape(const std::u32string& text,
|
|||||||
rtl_char.resize(str.size(), false);
|
rtl_char.resize(str.size(), false);
|
||||||
breakable.resize(str.size(), false);
|
breakable.resize(str.size(), false);
|
||||||
LineBreakingRules::insertBreakMark(str, breakable);
|
LineBreakingRules::insertBreakMark(str, breakable);
|
||||||
|
translations->insertThaiBreakMark(str, breakable);
|
||||||
RTLRules::insertRTLMark(str, rtl_line, rtl_char);
|
RTLRules::insertRTLMark(str, rtl_line, rtl_char);
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
raqm_glyph_t* glyphs = raqm_get_glyphs(rq, &count);
|
raqm_glyph_t* glyphs = raqm_get_glyphs(rq, &count);
|
||||||
|
@ -35,6 +35,8 @@
|
|||||||
#include <cwchar>
|
#include <cwchar>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
#include "config/user_config.hpp"
|
#include "config/user_config.hpp"
|
||||||
#include "io/file_manager.hpp"
|
#include "io/file_manager.hpp"
|
||||||
@ -62,6 +64,16 @@ Translations* translations = NULL;
|
|||||||
std::map<std::string, std::string> Translations::m_localized_name;
|
std::map<std::string, std::string> Translations::m_localized_name;
|
||||||
std::map<std::string, std::map<std::string, irr::core::stringw> >
|
std::map<std::string, std::map<std::string, irr::core::stringw> >
|
||||||
Translations::m_localized_country_codes;
|
Translations::m_localized_country_codes;
|
||||||
|
// ============================================================================
|
||||||
|
// Thai word lookup table, populated from thaidict.txt. The key is the first
// code point of a dictionary word; the mapped pair holds every dictionary word
// that begins with that code point and the length (in code points) of the
// longest of those words, which bounds how far a match has to be searched.
std::unordered_map<char32_t,
|
||||||
|
std::pair<std::unordered_set<std::u32string>, size_t> > g_thai_dict;
|
||||||
|
// ============================================================================
|
||||||
|
/** Tells whether a UTF-32 code point belongs to the Unicode Thai block.
 *  \param c Code point to classify.
 *  \return true for U+0E00 through U+0E7F (both inclusive), false otherwise.
 */
constexpr bool isThaiCP(char32_t c)
{
    return !(c < 0x0e00 || c > 0x0e7f);
}   // isThaiCP
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
const bool REMOVE_BOM = false;
|
const bool REMOVE_BOM = false;
|
||||||
using namespace tinygettext;
|
using namespace tinygettext;
|
||||||
@ -202,6 +214,50 @@ Translations::Translations() //: m_dictionary_manager("UTF-16")
|
|||||||
Log::error("translation", "%s", e.what());
|
Log::error("translation", "%s", e.what());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (g_thai_dict.empty())
{
    // Build the Thai word table once; later passes through this code find
    // g_thai_dict already filled and skip the file read.
    const std::string file_name = file_manager->getAsset("thaidict.txt");
    try
    {
        std::ifstream in(file_name.c_str());
        // Test the stream itself: a pointer obtained from new is never null,
        // so checking the pointer could not detect a missing file.
        if (!in.is_open())
        {
            Log::error("translation", "error: failure opening: '%s'.",
                file_name.c_str());
        }
        else
        {
            std::string line;
            while (!StringUtils::safeGetline(in, line).eof())
            {
                const std::u32string u32line = StringUtils::utf8ToUtf32(line);
                // Ignore blank lines and anything not starting with a Thai
                // code point; emptiness is checked before reading [0].
                if (u32line.empty() || !isThaiCP(u32line[0]))
                    continue;

                // operator[] creates an empty set with a longest length of 0
                // the first time a leading code point is seen.
                auto& entry = g_thai_dict[u32line[0]];
                entry.first.insert(u32line);
                // Track the longest word for this leading code point.
                if (entry.second < u32line.size())
                    entry.second = u32line.size();
            }
        }
    }
    catch (const std::exception& e)
    {
        Log::error("translation", "error: failure extract Thai dictionary.");
        Log::error("translation", "%s", e.what());
    }
}
|
||||||
// LC_ALL does not work, sscanf will then not always be able
|
// LC_ALL does not work, sscanf will then not always be able
|
||||||
// to scan for example: s=-1.1,-2.3,-3.3 correctly, which is
|
// to scan for example: s=-1.1,-2.3,-3.3 correctly, which is
|
||||||
// used in driveline files.
|
// used in driveline files.
|
||||||
@ -531,4 +587,46 @@ irr::core::stringw Translations::getLocalizedCountryName(const std::string& coun
|
|||||||
return StringUtils::utf8ToWide(country_code);
|
return StringUtils::utf8ToWide(country_code);
|
||||||
} // getLocalizedCountryName
|
} // getLocalizedCountryName
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
/* Insert breakmark to thai sentence according to thai word dictionary, which
|
||||||
|
* adds a mark in the begining of a thai vocabulary
|
||||||
|
*/
|
||||||
|
void Translations::insertThaiBreakMark(const std::u32string& thai,
|
||||||
|
std::vector<bool>& breakable)
|
||||||
|
{
|
||||||
|
if (thai.size() < 3)
|
||||||
|
return;
|
||||||
|
for (size_t i = 0; i < thai.size();)
|
||||||
|
{
|
||||||
|
char32_t t = thai[i];
|
||||||
|
if (i >= thai.size() - 2 || !isThaiCP(t))
|
||||||
|
{
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto ret = g_thai_dict.find(t);
|
||||||
|
if (ret == g_thai_dict.end())
|
||||||
|
{
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
size_t checked_word = 1;
|
||||||
|
const size_t max_checking_word = ret->second.second;
|
||||||
|
for (size_t j = i + 1;; j++)
|
||||||
|
{
|
||||||
|
if (j - i > max_checking_word || j > thai.size())
|
||||||
|
break;
|
||||||
|
const std::u32string& ss = thai.substr(i, j - i);
|
||||||
|
if (ret->second.first.find(ss) != ret->second.first.end())
|
||||||
|
{
|
||||||
|
if (ss.size() > checked_word)
|
||||||
|
checked_word = ss.size();
|
||||||
|
if (i != 0)
|
||||||
|
breakable[i - 1] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += checked_word;
|
||||||
|
}
|
||||||
|
} // insertThaiBreakMark
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -80,6 +80,8 @@ public:
|
|||||||
const std::string& getLocalizedName(const std::string& str) const;
|
const std::string& getLocalizedName(const std::string& str) const;
|
||||||
|
|
||||||
irr::core::stringw getLocalizedCountryName(const std::string& country_code) const;
|
irr::core::stringw getLocalizedCountryName(const std::string& country_code) const;
|
||||||
|
|
||||||
|
void insertThaiBreakMark(const std::u32string& thai, std::vector<bool>& breakable);
|
||||||
#endif
|
#endif
|
||||||
}; // Translations
|
}; // Translations
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user