This function, ironically, is being removed. This is one of only two users, and it's pretty hard to use correctly. In this case, using one of the existing constructors to keep a prefix seems clearer even in a world where remove() was better implemented. Test: treehugger Change-Id: Icdf02d9fcc059c396c13787d9bb4e8bda5658656
399 lines
12 KiB
C++
399 lines
12 KiB
C++
#include "pseudolocalize.h"
|
|
|
|
using namespace std;
|
|
|
|
// String basis to generate expansion
|
|
static const String16 k_expansion_string = String16("one two three "
|
|
"four five six seven eight nine ten eleven twelve thirteen "
|
|
"fourteen fiveteen sixteen seventeen nineteen twenty");
|
|
|
|
// Special unicode characters to override directionality of the words
|
|
static const String16 k_rlm = String16("\xe2\x80\x8f");
|
|
static const String16 k_rlo = String16("\xE2\x80\xae");
|
|
static const String16 k_pdf = String16("\xE2\x80\xac");
|
|
|
|
// Placeholder marks
|
|
static const String16 k_placeholder_open = String16("\xc2\xbb");
|
|
static const String16 k_placeholder_close = String16("\xc2\xab");
|
|
|
|
static const char16_t k_arg_start = '{';
|
|
static const char16_t k_arg_end = '}';
|
|
|
|
/**
 * Builds a pseudolocalizer with no implementation and a brace depth of
 * zero, then lets setMethod() install the implementation for `m`.
 */
Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m)
    : mImpl(nullptr),
      mLastDepth(0)
{
    setMethod(m);
}
|
|
|
|
/**
 * Replaces the current implementation object with one matching `m`.
 *
 * PSEUDO_ACCENTED selects PseudoMethodAccent, PSEUDO_BIDI selects
 * PseudoMethodBidi and every other value selects PseudoMethodNone.
 */
void Pseudolocalizer::setMethod(PseudolocalizationMethod m) {
    // Deleting a null pointer is a no-op, so no guard is required.
    delete mImpl;

    switch (m) {
        case PSEUDO_ACCENTED:
            mImpl = new PseudoMethodAccent();
            break;
        case PSEUDO_BIDI:
            mImpl = new PseudoMethodBidi();
            break;
        default:
            mImpl = new PseudoMethodNone();
            break;
    }
}
|
|
|
|
/**
 * Pseudolocalizes `text`, splitting it into segments at '{' / '}' boundaries.
 *
 * The brace nesting depth is tracked in mLastDepth, which persists between
 * calls. Segments at an even depth are handed to mImpl->text(); segments at
 * an odd depth are copied through, or handed to mImpl->placeholder() when
 * the segment both starts with '{' and ends with '}'. mImpl->start() and
 * mImpl->end() are emitted when a nested message opens or closes.
 *
 * A single quote makes the following character literal: neither the quote
 * nor that character affects the depth (presumably ICU MessageFormat
 * quoting -- confirm).
 *
 * @param text the string to process.
 * @return the processed string.
 */
String16 Pseudolocalizer::text(const String16& text) {
    String16 out;
    size_t depth = mLastDepth;
    size_t lastpos = 0;
    const size_t length = text.size();
    const char16_t* str = text.string();
    bool escaped = false;

    for (size_t pos = 0; pos < length; pos++) {
        const char16_t c = str[pos];
        const bool is_last = (pos + 1 == length);

        // `literal` marks a quote or the character it quotes; such
        // characters never change the nesting depth.
        bool literal = false;
        if (escaped) {
            escaped = false;
            literal = true;
        } else if (c == '\'') {
            escaped = true;
            literal = true;
        } else if (c == k_arg_start) {
            depth++;
        } else if (c == k_arg_end && depth) {
            depth--;
        }

        // A literal character normally needs no further work. It must not
        // skip the flush below when it is the final character, though:
        // otherwise everything buffered since `lastpos` would be dropped
        // (e.g. "don't" would produce an empty result).
        if (literal && !is_last) {
            continue;
        }

        if (mLastDepth != depth || is_last) {
            const bool pseudo = ((mLastDepth % 2) == 0);

            // The current character belongs to this segment unless a
            // pseudolocalized segment is being ended by a depth change.
            size_t nextpos = pos;
            if (!pseudo || depth == mLastDepth) {
                nextpos++;
            }

            const size_t size = nextpos - lastpos;
            if (size) {
                String16 chunk = String16(text, size, lastpos);
                if (pseudo) {
                    chunk = mImpl->text(chunk);
                } else if (!literal &&
                           str[lastpos] == k_arg_start &&
                           str[nextpos - 1] == k_arg_end) {
                    // A quoted '}' does not close the argument, hence the
                    // !literal guard.
                    chunk = mImpl->placeholder(chunk);
                }
                out.append(chunk);
            }

            if (pseudo && depth < mLastDepth) { // End of message
                out.append(mImpl->end());
            } else if (!pseudo && depth > mLastDepth) { // Start of message
                out.append(mImpl->start());
            }

            lastpos = nextpos;
            mLastDepth = depth;
        }
    }
    return out;
}
|
|
|
|
/**
 * Returns the UTF-8 replacement for an ASCII letter or for one of the
 * symbols '!', '?' and '$'.
 *
 * @param c the character to look up.
 * @return a pointer to a static UTF-8 string, or NULL when `c` has no
 *         replacement (this includes 'F', which has no entry).
 */
static const char*
pseudolocalize_char(const char16_t c)
{
    // Replacements for 'a'..'z', indexed by (c - 'a').
    static const char* const kLower[26] = {
        "\xc3\xa5",     // a
        "\xc9\x93",     // b
        "\xc3\xa7",     // c
        "\xc3\xb0",     // d
        "\xc3\xa9",     // e
        "\xc6\x92",     // f
        "\xc4\x9d",     // g
        "\xc4\xa5",     // h
        "\xc3\xae",     // i
        "\xc4\xb5",     // j
        "\xc4\xb7",     // k
        "\xc4\xbc",     // l
        "\xe1\xb8\xbf", // m
        "\xc3\xb1",     // n
        "\xc3\xb6",     // o
        "\xc3\xbe",     // p
        "\x51",         // q
        "\xc5\x95",     // r
        "\xc5\xa1",     // s
        "\xc5\xa3",     // t
        "\xc3\xbb",     // u
        "\x56",         // v
        "\xc5\xb5",     // w
        "\xd1\x85",     // x
        "\xc3\xbd",     // y
        "\xc5\xbe",     // z
    };
    // Replacements for 'A'..'Z', indexed by (c - 'A').
    static const char* const kUpper[26] = {
        "\xc3\x85",     // A
        "\xce\xb2",     // B
        "\xc3\x87",     // C
        "\xc3\x90",     // D
        "\xc3\x89",     // E
        NULL,           // F (no replacement)
        "\xc4\x9c",     // G
        "\xc4\xa4",     // H
        "\xc3\x8e",     // I
        "\xc4\xb4",     // J
        "\xc4\xb6",     // K
        "\xc4\xbb",     // L
        "\xe1\xb8\xbe", // M
        "\xc3\x91",     // N
        "\xc3\x96",     // O
        "\xc3\x9e",     // P
        "\x71",         // Q
        "\xc5\x94",     // R
        "\xc5\xa0",     // S
        "\xc5\xa2",     // T
        "\xc3\x9b",     // U
        "\xce\xbd",     // V
        "\xc5\xb4",     // W
        "\xc3\x97",     // X
        "\xc3\x9d",     // Y
        "\xc5\xbd",     // Z
    };

    if (c >= 'a' && c <= 'z') {
        return kLower[c - 'a'];
    }
    if (c >= 'A' && c <= 'Z') {
        return kUpper[c - 'A'];
    }
    if (c == '!') {
        return "\xc2\xa1";
    }
    if (c == '?') {
        return "\xc2\xbf";
    }
    if (c == '$') {
        return "\xe2\x82\xac";
    }
    return NULL;
}
|
|
|
|
/**
 * Tells whether `c` is one of the characters that end a '%' placeholder:
 * s S c C d o x X f e E g G a A b B h H % n.
 */
static bool is_possible_normal_placeholder_end(const char16_t c) {
    static const char16_t kTerminators[] = {
        's', 'S', 'c', 'C', 'd', 'o', 'x', 'X', 'f', 'e', 'E',
        'g', 'G', 'a', 'A', 'b', 'B', 'h', 'H', '%', 'n',
    };
    const size_t count = sizeof(kTerminators) / sizeof(kTerminators[0]);
    for (size_t idx = 0; idx < count; ++idx) {
        if (kTerminators[idx] == c) {
            return true;
        }
    }
    return false;
}
|
|
|
|
/**
 * Builds filler text of roughly `length` characters from k_expansion_string.
 *
 * When `length` exceeds the basis string, the whole basis plus a space is
 * used and the remainder is generated recursively. Otherwise the basis is
 * cut at the first space found strictly after index `length`, so the result
 * ends on a whole word; if no such space exists the whole basis is returned.
 *
 * @param length the requested number of characters.
 * @return the filler text; it may be longer than `length` because words are
 *         never split.
 */
static String16 pseudo_generate_expansion(const unsigned int length) {
    String16 result = k_expansion_string;
    const size_t base_size = result.size();

    if (base_size < length) {
        result += String16(" ");
        // result.size() is now base_size + 1, which cannot exceed length.
        result += pseudo_generate_expansion(
                static_cast<unsigned int>(length - result.size()));
        return result;
    }

    // Should contain only whole words, so looking for a space
    const char16_t* s = result.string();
    size_t keep = base_size;
    for (size_t i = static_cast<size_t>(length) + 1; i < base_size; ++i) {
        if (s[i] == ' ') {
            keep = i;
            break;
        }
    }

    // Previously, when no space followed `length`, the cut landed one
    // character short of the end and split the last word ("twent").
    // Keeping the full basis string in that case preserves whole words.
    if (keep < base_size) {
        result = String16(result, keep);
    }
    return result;
}
|
|
|
|
/** Tells whether `c` is a space, a tab or a newline. */
static bool is_space(const char16_t c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
            return true;
        default:
            return false;
    }
}
|
|
|
|
/**
 * Opens a message: resets the word and length counters and increments the
 * nesting depth.
 *
 * @return "[" when the depth was zero before the call, otherwise an empty
 *         string.
 */
String16 PseudoMethodAccent::start() {
    const bool outermost = (mDepth == 0);

    mLength = 0;
    mWordCount = 0;
    ++mDepth;

    if (outermost) {
        return String16(String8("["));
    }
    return String16();
}
|
|
|
|
/**
 * Closes a message: emits the expansion text, resets the word and length
 * counters and decrements the nesting depth.
 *
 * When any characters were counted, the result starts with a space followed
 * by filler text sized from mLength (the full length when more than three
 * words were counted, half of it otherwise).
 *
 * @return the expansion text, followed by "]" when the depth drops to zero.
 */
String16 PseudoMethodAccent::end() {
    String16 closing;

    if (mLength != 0) {
        closing.append(String16(String8(" ")));
        closing.append(pseudo_generate_expansion(
                mWordCount > 3 ? mLength : mLength / 2));
    }

    mLength = 0;
    mWordCount = 0;
    --mDepth;

    if (mDepth == 0) {
        closing.append(String16(String8("]")));
    }
    return closing;
}
|
|
|
|
/**
|
|
* Converts characters so they look like they've been localized.
|
|
*
|
|
* Note: This leaves escape sequences untouched so they can later be
|
|
* processed by ResTable::collectString in the normal way.
|
|
*/
|
|
String16 PseudoMethodAccent::text(const String16& source)
|
|
{
|
|
const char16_t* s = source.string();
|
|
String16 result;
|
|
const size_t I = source.size();
|
|
bool lastspace = true;
|
|
for (size_t i=0; i<I; i++) {
|
|
char16_t c = s[i];
|
|
if (c == '\\') {
|
|
// Escape syntax, no need to pseudolocalize
|
|
if (i<I-1) {
|
|
result += String16("\\");
|
|
i++;
|
|
c = s[i];
|
|
switch (c) {
|
|
case 'u':
|
|
// this one takes up 5 chars
|
|
result += String16(s+i, 5);
|
|
i += 4;
|
|
break;
|
|
case 't':
|
|
case 'n':
|
|
case '#':
|
|
case '@':
|
|
case '?':
|
|
case '"':
|
|
case '\'':
|
|
case '\\':
|
|
default:
|
|
result.append(&c, 1);
|
|
break;
|
|
}
|
|
} else {
|
|
result.append(&c, 1);
|
|
}
|
|
} else if (c == '%') {
|
|
// Placeholder syntax, no need to pseudolocalize
|
|
String16 chunk;
|
|
bool end = false;
|
|
chunk.append(&c, 1);
|
|
while (!end && i < I) {
|
|
++i;
|
|
c = s[i];
|
|
chunk.append(&c, 1);
|
|
if (is_possible_normal_placeholder_end(c)) {
|
|
end = true;
|
|
} else if (c == 't') {
|
|
++i;
|
|
c = s[i];
|
|
chunk.append(&c, 1);
|
|
end = true;
|
|
}
|
|
}
|
|
// Treat chunk as a placeholder unless it ends with %.
|
|
result += ((c == '%') ? chunk : placeholder(chunk));
|
|
} else if (c == '<' || c == '&') {
|
|
// html syntax, no need to pseudolocalize
|
|
bool tag_closed = false;
|
|
while (!tag_closed && i < I) {
|
|
if (c == '&') {
|
|
String16 escape_text;
|
|
escape_text.append(&c, 1);
|
|
bool end = false;
|
|
size_t htmlCodePos = i;
|
|
while (!end && htmlCodePos < I) {
|
|
++htmlCodePos;
|
|
c = s[htmlCodePos];
|
|
escape_text.append(&c, 1);
|
|
// Valid html code
|
|
if (c == ';') {
|
|
end = true;
|
|
i = htmlCodePos;
|
|
}
|
|
// Wrong html code
|
|
else if (!((c == '#' ||
|
|
(c >= 'a' && c <= 'z') ||
|
|
(c >= 'A' && c <= 'Z') ||
|
|
(c >= '0' && c <= '9')))) {
|
|
end = true;
|
|
}
|
|
}
|
|
result += escape_text;
|
|
if (escape_text != String16("<")) {
|
|
tag_closed = true;
|
|
}
|
|
continue;
|
|
}
|
|
if (c == '>') {
|
|
tag_closed = true;
|
|
result.append(&c, 1);
|
|
continue;
|
|
}
|
|
result.append(&c, 1);
|
|
i++;
|
|
c = s[i];
|
|
}
|
|
} else {
|
|
// This is a pure text that should be pseudolocalized
|
|
const char* p = pseudolocalize_char(c);
|
|
if (p != NULL) {
|
|
result += String16(p);
|
|
} else {
|
|
bool space = is_space(c);
|
|
if (lastspace && !space) {
|
|
mWordCount++;
|
|
}
|
|
lastspace = space;
|
|
result.append(&c, 1);
|
|
}
|
|
// Count only pseudolocalizable chars and delimiters
|
|
mLength++;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
/**
 * Wraps a placeholder between k_placeholder_open and k_placeholder_close.
 *
 * @param source the placeholder text.
 * @return the wrapped placeholder.
 */
String16 PseudoMethodAccent::placeholder(const String16& source) {
    String16 wrapped(k_placeholder_open);
    wrapped += source;
    wrapped += k_placeholder_close;
    return wrapped;
}
|
|
|
|
/**
 * Wraps every word of `source` in directionality overrides.
 *
 * k_rlm + k_rlo is inserted before the first character of a word and
 * k_pdf + k_rlm after its last one. Words are separated by is_space()
 * characters or by the escape sequences "\n" and "\t". Escape sequences
 * are copied through unchanged, backslash included.
 *
 * @param source the text to convert.
 * @return the converted text.
 */
String16 PseudoMethodBidi::text(const String16& source)
{
    const char16_t* s = source.string();
    const size_t count = source.size();
    const char16_t ESCAPE_CHAR = '\\';
    String16 result;
    bool lastspace = true;
    bool escape = false;

    for (size_t i = 0; i < count; i++) {
        char16_t c = s[i];
        if (!escape && c == ESCAPE_CHAR) {
            // Hold the backslash back until the escaped character is known.
            escape = true;
            continue;
        }

        const bool space = escape ? (c == 'n' || c == 't') : is_space(c);
        if (lastspace && !space) {
            // Word start
            result += k_rlm + k_rlo;
        } else if (!lastspace && space) {
            // Word end
            result += k_pdf + k_rlm;
        }
        lastspace = space;

        if (escape) {
            result.append(&ESCAPE_CHAR, 1);
            escape = false;
        }
        result.append(&c, 1);
    }

    if (escape) {
        // The input ended on a backslash with nothing after it. It used to
        // be dropped silently; keep it as an ordinary word character, the
        // way PseudoMethodAccent::text() does.
        if (lastspace) {
            result += k_rlm + k_rlo;
            lastspace = false;
        }
        result.append(&ESCAPE_CHAR, 1);
    }

    if (!lastspace) {
        // End of last word
        result += k_pdf + k_rlm;
    }
    return result;
}
|
|
|
|
/**
 * Wraps a placeholder in the directionality change sequence:
 * k_rlm, k_rlo, the placeholder, k_pdf, k_rlm.
 *
 * @param source the placeholder text.
 * @return the wrapped placeholder.
 */
String16 PseudoMethodBidi::placeholder(const String16& source) {
    String16 wrapped(k_rlm);
    wrapped += k_rlo;
    wrapped += source;
    wrapped += k_pdf;
    wrapped += k_rlm;
    return wrapped;
}
|
|
|