2015-12-17 13:03:11 -08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2015 The Android Open Source Project
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "compile/Pseudolocalizer.h"
|
|
|
|
#include "util/Util.h"
|
|
|
|
|
|
|
|
namespace aapt {
|
|
|
|
|
|
|
|
// Seed phrase from which the expansion text is generated.
static const std::string k_expansion_string(
    "one two three "
    "four five six seven eight nine ten eleven twelve thirteen "
    "fourteen fiveteen sixteen seventeen nineteen twenty");

// Unicode control characters (U+200F, U+202E, U+202C) used to override the
// directionality of words.
static const std::string k_rlm("\u200f");
static const std::string k_rlo("\u202e");
static const std::string k_pdf("\u202c");

// Marks put around a placeholder (U+00BB and U+00AB).
static const std::string k_placeholder_open("\u00bb");
static const std::string k_placeholder_close("\u00ab");

// Characters that open and close an argument.
static const char k_arg_start('{');
static const char k_arg_end('}');
|
2015-12-17 13:03:11 -08:00
|
|
|
|
|
|
|
class PseudoMethodNone : public PseudoMethodImpl {
|
|
|
|
public:
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string text(const StringPiece& text) override { return text.toString(); }
|
|
|
|
std::string placeholder(const StringPiece& text) override { return text.toString(); }
|
2015-12-17 13:03:11 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
class PseudoMethodBidi : public PseudoMethodImpl {
|
|
|
|
public:
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string text(const StringPiece& text) override;
|
|
|
|
std::string placeholder(const StringPiece& text) override;
|
2015-12-17 13:03:11 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
class PseudoMethodAccent : public PseudoMethodImpl {
|
|
|
|
public:
|
|
|
|
PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {}
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string start() override;
|
|
|
|
std::string end() override;
|
|
|
|
std::string text(const StringPiece& text) override;
|
|
|
|
std::string placeholder(const StringPiece& text) override;
|
2015-12-17 13:03:11 -08:00
|
|
|
private:
|
|
|
|
size_t mDepth;
|
|
|
|
size_t mWordCount;
|
|
|
|
size_t mLength;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Starts at nesting depth zero and installs the implementation that matches
// the requested method.
Pseudolocalizer::Pseudolocalizer(Method method)
    : mLastDepth(0) {
  setMethod(method);
}
|
|
|
|
|
|
|
|
// Replaces the active implementation with the one matching `method`.
// A value that matches none of the three known methods leaves the current
// implementation untouched.
void Pseudolocalizer::setMethod(Method method) {
  if (method == Method::kNone) {
    mImpl = util::make_unique<PseudoMethodNone>();
  } else if (method == Method::kAccent) {
    mImpl = util::make_unique<PseudoMethodAccent>();
  } else if (method == Method::kBidi) {
    mImpl = util::make_unique<PseudoMethodBidi>();
  }
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string Pseudolocalizer::text(const StringPiece& text) {
|
|
|
|
std::string out;
|
2015-12-17 13:03:11 -08:00
|
|
|
size_t depth = mLastDepth;
|
|
|
|
size_t lastpos, pos;
|
|
|
|
const size_t length = text.size();
|
2016-07-08 15:00:32 -07:00
|
|
|
const char* str = text.data();
|
2015-12-17 13:03:11 -08:00
|
|
|
bool escaped = false;
|
|
|
|
for (lastpos = pos = 0; pos < length; pos++) {
|
|
|
|
char16_t c = str[pos];
|
|
|
|
if (escaped) {
|
|
|
|
escaped = false;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == '\'') {
|
|
|
|
escaped = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (c == k_arg_start) {
|
|
|
|
depth++;
|
|
|
|
} else if (c == k_arg_end && depth) {
|
|
|
|
depth--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mLastDepth != depth || pos == length - 1) {
|
|
|
|
bool pseudo = ((mLastDepth % 2) == 0);
|
|
|
|
size_t nextpos = pos;
|
|
|
|
if (!pseudo || depth == mLastDepth) {
|
|
|
|
nextpos++;
|
|
|
|
}
|
|
|
|
size_t size = nextpos - lastpos;
|
|
|
|
if (size) {
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string chunk = text.substr(lastpos, size).toString();
|
2015-12-17 13:03:11 -08:00
|
|
|
if (pseudo) {
|
|
|
|
chunk = mImpl->text(chunk);
|
|
|
|
} else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) {
|
|
|
|
chunk = mImpl->placeholder(chunk);
|
|
|
|
}
|
|
|
|
out.append(chunk);
|
|
|
|
}
|
|
|
|
if (pseudo && depth < mLastDepth) { // End of message
|
|
|
|
out.append(mImpl->end());
|
|
|
|
} else if (!pseudo && depth > mLastDepth) { // Start of message
|
|
|
|
out.append(mImpl->start());
|
|
|
|
}
|
|
|
|
lastpos = nextpos;
|
|
|
|
mLastDepth = depth;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
// Returns the replacement string for `c`, or nullptr when `c` has none.
// Letters are looked up in per-case tables; '!', '?' and '$' are handled
// individually.
static const char* pseudolocalizeChar(const char c) {
  // Replacements for 'a'..'z', indexed by distance from 'a'.
  static const char* const kLowercase[26] = {
      "\u00e5", "\u0253", "\u00e7", "\u00f0", "\u00e9", "\u0192", "\u011d",
      "\u0125", "\u00ee", "\u0135", "\u0137", "\u013c", "\u1e3f", "\u00f1",
      "\u00f6", "\u00fe", "\u0051", "\u0155", "\u0161", "\u0163", "\u00fb",
      "\u0056", "\u0175", "\u0445", "\u00fd", "\u017e",
  };
  // Replacements for 'A'..'Z', indexed by distance from 'A'. 'F' has no
  // replacement, so its slot is empty.
  static const char* const kUppercase[26] = {
      "\u00c5", "\u03b2", "\u00c7", "\u00d0", "\u00c9", nullptr,  "\u011c",
      "\u0124", "\u00ce", "\u0134", "\u0136", "\u013b", "\u1e3e", "\u00d1",
      "\u00d6", "\u00de", "\u0071", "\u0154", "\u0160", "\u0162", "\u00db",
      "\u03bd", "\u0174", "\u00d7", "\u00dd", "\u017d",
  };

  if (c >= 'a' && c <= 'z') {
    return kLowercase[c - 'a'];
  }
  if (c >= 'A' && c <= 'Z') {
    return kUppercase[c - 'A'];
  }
  if (c == '!') {
    return "\u00a1";
  }
  if (c == '?') {
    return "\u00bf";
  }
  if (c == '$') {
    return "\u20ac";
  }
  return nullptr;
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
// Returns true when `c` is one of the characters accepted as the end of a
// '%' placeholder.
static bool isPossibleNormalPlaceholderEnd(const char c) {
  static const char kTerminators[] = {
      's', 'S', 'c', 'C', 'd', 'o', 'x', 'X', 'f', 'e', 'E',
      'g', 'G', 'a', 'A', 'b', 'B', 'h', 'H', '%', 'n',
  };
  for (const char terminator : kTerminators) {
    if (terminator == c) {
      return true;
    }
  }
  return false;
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
static std::string pseudoGenerateExpansion(const unsigned int length) {
|
|
|
|
std::string result = k_expansion_string;
|
|
|
|
const char* s = result.data();
|
2015-12-17 13:03:11 -08:00
|
|
|
if (result.size() < length) {
|
2016-07-08 15:00:32 -07:00
|
|
|
result += " ";
|
2015-12-17 13:03:11 -08:00
|
|
|
result += pseudoGenerateExpansion(length - result.size());
|
|
|
|
} else {
|
|
|
|
int ext = 0;
|
|
|
|
// Should contain only whole words, so looking for a space
|
|
|
|
for (unsigned int i = length + 1; i < result.size(); ++i) {
|
|
|
|
++ext;
|
|
|
|
if (s[i] == ' ') {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
result = result.substr(0, length + ext);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
// Opens a message: resets the word and length counters and increases the
// nesting depth. Returns "[" for the outermost message and an empty string
// for nested ones.
std::string PseudoMethodAccent::start() {
  std::string prefix = (mDepth == 0) ? "[" : "";
  mLength = 0;
  mWordCount = 0;
  ++mDepth;
  return prefix;
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string PseudoMethodAccent::end() {
|
|
|
|
std::string result;
|
2015-12-17 13:03:11 -08:00
|
|
|
if (mLength) {
|
2016-07-08 15:00:32 -07:00
|
|
|
result += " ";
|
2015-12-17 13:03:11 -08:00
|
|
|
result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2);
|
|
|
|
}
|
|
|
|
mWordCount = mLength = 0;
|
|
|
|
mDepth--;
|
|
|
|
if (mDepth == 0) {
|
2016-07-08 15:00:32 -07:00
|
|
|
result += "]";
|
2015-12-17 13:03:11 -08:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Converts characters so they look like they've been localized.
|
|
|
|
*
|
|
|
|
* Note: This leaves placeholder syntax untouched.
|
|
|
|
*/
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string PseudoMethodAccent::text(const StringPiece& source)
|
2015-12-17 13:03:11 -08:00
|
|
|
{
|
2016-07-08 15:00:32 -07:00
|
|
|
const char* s = source.data();
|
|
|
|
std::string result;
|
2015-12-17 13:03:11 -08:00
|
|
|
const size_t I = source.size();
|
|
|
|
bool lastspace = true;
|
|
|
|
for (size_t i = 0; i < I; i++) {
|
2016-07-08 15:00:32 -07:00
|
|
|
char c = s[i];
|
2015-12-17 13:03:11 -08:00
|
|
|
if (c == '%') {
|
|
|
|
// Placeholder syntax, no need to pseudolocalize
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string chunk;
|
2015-12-17 13:03:11 -08:00
|
|
|
bool end = false;
|
|
|
|
chunk.append(&c, 1);
|
2016-06-23 13:03:15 -07:00
|
|
|
while (!end && i + 1 < I) {
|
2015-12-17 13:03:11 -08:00
|
|
|
++i;
|
|
|
|
c = s[i];
|
|
|
|
chunk.append(&c, 1);
|
|
|
|
if (isPossibleNormalPlaceholderEnd(c)) {
|
|
|
|
end = true;
|
2016-06-23 13:03:15 -07:00
|
|
|
} else if (i + 1 < I && c == 't') {
|
2015-12-17 13:03:11 -08:00
|
|
|
++i;
|
|
|
|
c = s[i];
|
|
|
|
chunk.append(&c, 1);
|
|
|
|
end = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Treat chunk as a placeholder unless it ends with %.
|
|
|
|
result += ((c == '%') ? chunk : placeholder(chunk));
|
|
|
|
} else if (c == '<' || c == '&') {
|
|
|
|
// html syntax, no need to pseudolocalize
|
|
|
|
bool tag_closed = false;
|
|
|
|
while (!tag_closed && i < I) {
|
|
|
|
if (c == '&') {
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string escapeText;
|
2015-12-17 13:03:11 -08:00
|
|
|
escapeText.append(&c, 1);
|
|
|
|
bool end = false;
|
|
|
|
size_t htmlCodePos = i;
|
|
|
|
while (!end && htmlCodePos < I) {
|
|
|
|
++htmlCodePos;
|
|
|
|
c = s[htmlCodePos];
|
|
|
|
escapeText.append(&c, 1);
|
|
|
|
// Valid html code
|
|
|
|
if (c == ';') {
|
|
|
|
end = true;
|
|
|
|
i = htmlCodePos;
|
|
|
|
}
|
|
|
|
// Wrong html code
|
|
|
|
else if (!((c == '#' ||
|
|
|
|
(c >= 'a' && c <= 'z') ||
|
|
|
|
(c >= 'A' && c <= 'Z') ||
|
|
|
|
(c >= '0' && c <= '9')))) {
|
|
|
|
end = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
result += escapeText;
|
2016-07-08 15:00:32 -07:00
|
|
|
if (escapeText != "<") {
|
2015-12-17 13:03:11 -08:00
|
|
|
tag_closed = true;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == '>') {
|
|
|
|
tag_closed = true;
|
|
|
|
result.append(&c, 1);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
result.append(&c, 1);
|
|
|
|
i++;
|
|
|
|
c = s[i];
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// This is a pure text that should be pseudolocalized
|
2016-07-08 15:00:32 -07:00
|
|
|
const char* p = pseudolocalizeChar(c);
|
2015-12-17 13:03:11 -08:00
|
|
|
if (p != nullptr) {
|
|
|
|
result += p;
|
|
|
|
} else {
|
2016-07-08 15:00:32 -07:00
|
|
|
bool space = isspace(c);
|
2015-12-17 13:03:11 -08:00
|
|
|
if (lastspace && !space) {
|
|
|
|
mWordCount++;
|
|
|
|
}
|
|
|
|
lastspace = space;
|
|
|
|
result.append(&c, 1);
|
|
|
|
}
|
|
|
|
// Count only pseudolocalizable chars and delimiters
|
|
|
|
mLength++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string PseudoMethodAccent::placeholder(const StringPiece& source) {
|
2015-12-17 13:03:11 -08:00
|
|
|
// Surround a placeholder with brackets
|
|
|
|
return k_placeholder_open + source.toString() + k_placeholder_close;
|
|
|
|
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string PseudoMethodBidi::text(const StringPiece& source) {
|
|
|
|
const char* s = source.data();
|
|
|
|
std::string result;
|
2015-12-17 13:03:11 -08:00
|
|
|
bool lastspace = true;
|
|
|
|
bool space = true;
|
|
|
|
for (size_t i = 0; i < source.size(); i++) {
|
2016-07-08 15:00:32 -07:00
|
|
|
char c = s[i];
|
|
|
|
space = isspace(c);
|
2015-12-17 13:03:11 -08:00
|
|
|
if (lastspace && !space) {
|
|
|
|
// Word start
|
|
|
|
result += k_rlm + k_rlo;
|
|
|
|
} else if (!lastspace && space) {
|
|
|
|
// Word end
|
|
|
|
result += k_pdf + k_rlm;
|
|
|
|
}
|
|
|
|
lastspace = space;
|
|
|
|
result.append(&c, 1);
|
|
|
|
}
|
|
|
|
if (!lastspace) {
|
|
|
|
// End of last word
|
|
|
|
result += k_pdf + k_rlm;
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2016-07-08 15:00:32 -07:00
|
|
|
std::string PseudoMethodBidi::placeholder(const StringPiece& source) {
|
2015-12-17 13:03:11 -08:00
|
|
|
// Surround a placeholder with directionality change sequence
|
|
|
|
return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace aapt
|