Igor Viarheichyk 4fb6516a55 Improved word break for BiDi pseudolocalizer.
Character sequences \n and \t are now treated as word separators
by BiDi pseudolocalizer. This solves issues when text rendering
engine breaks a line in the middle of a text chunk marked with
RLM+RLO and PDF+RLM sequences.

Bug:34064580
Change-Id: I52e6018785fae25479fa167440f24c534b0e3253
Fixes:34064580
Test: make aapt2_tests
Test: Run aapt2_tests binary
2017-07-06 15:41:47 -07:00

484 lines
11 KiB
C++

/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "compile/Pseudolocalizer.h"
#include "util/Util.h"
using android::StringPiece;
namespace aapt {
// Word list used as filler when an accent-pseudolocalized message is expanded
// (read by PseudoGenerateExpansion, which repeats and truncates it as needed).
// NOTE(review): "fiveteen" and the absence of "eighteen" look like typos, but
// this text is emitted verbatim at runtime, so it is deliberately left as-is.
static const std::string kExpansionString =
"one two three "
"four five six seven eight nine ten eleven twelve thirteen "
"fourteen fiveteen sixteen seventeen nineteen twenty";
// Unicode directional formatting characters used by the BiDi method to force
// each word to render right-to-left:
//   kRlm = U+200F RIGHT-TO-LEFT MARK
//   kRlo = U+202E RIGHT-TO-LEFT OVERRIDE
//   kPdf = U+202C POP DIRECTIONAL FORMATTING
static const std::string kRlm = "\u200f";
static const std::string kRlo = "\u202e";
static const std::string kPdf = "\u202c";
// Markers the accent method wraps around placeholders:
//   kPlaceholderOpen  = U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
//   kPlaceholderClose = U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
static const std::string kPlaceholderOpen = "\u00bb";
static const std::string kPlaceholderClose = "\u00ab";
// Delimiters whose nesting depth Pseudolocalizer::Text tracks to tell
// translatable text apart from argument sections.
static const char kArgStart = '{';
static const char kArgEnd = '}';
// Pass-through method: both hooks hand their input back as an owned string
// without modifying it.
class PseudoMethodNone : public PseudoMethodImpl {
 public:
  // Returns a copy of the placeholder text, unchanged.
  std::string Placeholder(const StringPiece& text) override {
    return text.to_string();
  }

  // Returns a copy of the message text, unchanged.
  std::string Text(const StringPiece& text) override {
    return text.to_string();
  }
};
// BiDi method: wraps words and placeholders in directional formatting
// characters (see the out-of-line definitions of Text and Placeholder).
class PseudoMethodBidi : public PseudoMethodImpl {
 public:
  // Wraps a whole placeholder in a right-to-left override sequence.
  std::string Placeholder(const StringPiece& text) override;

  // Wraps every word of `text` in a right-to-left override sequence.
  std::string Text(const StringPiece& text) override;
};
// Accent method: replaces characters with accented look-alikes, brackets the
// outermost message and appends filler text to lengthen it.
class PseudoMethodAccent : public PseudoMethodImpl {
 public:
  // All counters start at zero (see the member initializers below).
  PseudoMethodAccent() = default;

  // Opens a message; emits "[" when this is the outermost one.
  std::string Start() override;
  // Closes a message; emits the expansion text and, at the outermost level, "]".
  std::string End() override;
  // Accents the translatable characters of `text`.
  std::string Text(const StringPiece& text) override;
  // Surrounds a placeholder with the placeholder markers.
  std::string Placeholder(const StringPiece& text) override;

 private:
  // Number of Start() calls not yet matched by End().
  size_t depth_ = 0;
  // Word-start tally accumulated by Text(); reset by Start() and End().
  size_t word_count_ = 0;
  // Count of characters seen by Text() outside placeholder/HTML syntax; reset
  // by Start() and End().
  size_t length_ = 0;
};
Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) {
SetMethod(method);
}
// Replaces the active implementation with a fresh instance matching `method`.
// A value that is none of the three handled enumerators leaves impl_ untouched.
void Pseudolocalizer::SetMethod(Method method) {
  if (method == Method::kNone) {
    impl_ = util::make_unique<PseudoMethodNone>();
  } else if (method == Method::kAccent) {
    impl_ = util::make_unique<PseudoMethodAccent>();
  } else if (method == Method::kBidi) {
    impl_ = util::make_unique<PseudoMethodBidi>();
  }
}
// Pseudolocalizes `text`, splitting it into chunks by '{' / '}' nesting depth.
//
// Chunks at an even depth are handed to impl_->Text(); chunks at an odd depth
// are copied through, or passed to impl_->Placeholder() when they are exactly
// one "{...}" group. impl_->Start() / impl_->End() output is inserted where the
// depth moves from odd to even and back. An apostrophe quotes the character
// that follows it, so a quoted brace does not change the depth.
//
// The depth reached at the end of the call is kept in last_depth_, so a message
// may be fed through several consecutive calls.
std::string Pseudolocalizer::Text(const StringPiece& text) {
  std::string out;
  size_t depth = last_depth_;
  size_t lastpos, pos;
  const size_t length = text.size();
  const char* str = text.data();
  bool escaped = false;
  for (lastpos = pos = 0; pos < length; pos++) {
    const char c = str[pos];
    // Quote handling must not skip the flush check below: previously a
    // `continue` here meant that text ending in a quote or a quoted character
    // never had its final chunk emitted.
    if (escaped) {
      escaped = false;
    } else if (c == '\'') {
      escaped = true;
    } else if (c == kArgStart) {
      depth++;
    } else if (c == kArgEnd && depth) {
      depth--;
    }

    // Flush the pending chunk when the depth changes or the input ends.
    if (last_depth_ != depth || pos == length - 1) {
      // Even depth means translatable text, odd depth means argument syntax.
      bool pseudo = ((last_depth_ % 2) == 0);
      size_t nextpos = pos;
      // The current character belongs to this chunk unless it is the brace
      // that terminates a translatable run.
      if (!pseudo || depth == last_depth_) {
        nextpos++;
      }
      size_t size = nextpos - lastpos;
      if (size) {
        std::string chunk = text.substr(lastpos, size).to_string();
        if (pseudo) {
          chunk = impl_->Text(chunk);
        } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) {
          chunk = impl_->Placeholder(chunk);
        }
        out.append(chunk);
      }
      if (pseudo && depth < last_depth_) {  // End of message
        out.append(impl_->End());
      } else if (!pseudo && depth > last_depth_) {  // Start of message
        out.append(impl_->Start());
      }
      lastpos = nextpos;
      last_depth_ = depth;
    }
  }
  return out;
}
// Maps an ASCII character to the UTF-8 text of its accented look-alike.
// Returns nullptr for characters that have no replacement; note that 'F' is
// the one letter without an entry.
static const char* PseudolocalizeChar(const char c) {
  // Replacements for 'a'..'z', indexed by distance from 'a'.
  static const char* const kLowerCase[26] = {
      "\u00e5", "\u0253", "\u00e7", "\u00f0", "\u00e9", "\u0192", "\u011d",
      "\u0125", "\u00ee", "\u0135", "\u0137", "\u013c", "\u1e3f", "\u00f1",
      "\u00f6", "\u00fe", "\u0051", "\u0155", "\u0161", "\u0163", "\u00fb",
      "\u0056", "\u0175", "\u0445", "\u00fd", "\u017e",
  };
  // Replacements for 'A'..'Z', indexed by distance from 'A'.
  static const char* const kUpperCase[26] = {
      "\u00c5", "\u03b2", "\u00c7", "\u00d0", "\u00c9", nullptr,  "\u011c",
      "\u0124", "\u00ce", "\u0134", "\u0136", "\u013b", "\u1e3e", "\u00d1",
      "\u00d6", "\u00de", "\u0071", "\u0154", "\u0160", "\u0162", "\u00db",
      "\u03bd", "\u0174", "\u00d7", "\u00dd", "\u017d",
  };

  if (c >= 'a' && c <= 'z') {
    return kLowerCase[c - 'a'];
  }
  if (c >= 'A' && c <= 'Z') {
    return kUpperCase[c - 'A'];
  }
  if (c == '!') {
    return "\u00a1";
  }
  if (c == '?') {
    return "\u00bf";
  }
  if (c == '$') {
    return "\u20ac";
  }
  return nullptr;
}
// Returns true when `c` is one of the characters that PseudoMethodAccent::Text
// accepts as the final character of a '%' placeholder.
static bool IsPossibleNormalPlaceholderEnd(const char c) {
  // Declared as a char list (not a string literal) so no NUL entry is present.
  static const char kTerminators[] = {
      's', 'S', 'c', 'C', 'd', 'o', 'x', 'X', 'f', 'e', 'E',
      'g', 'G', 'a', 'A', 'b', 'B', 'h', 'H', '%', 'n',
  };
  for (const char terminator : kTerminators) {
    if (terminator == c) {
      return true;
    }
  }
  return false;
}
// Builds filler text of roughly `length` characters out of kExpansionString.
//
// Whole copies of the word list, each followed by a space, are emitted while
// the requested length exceeds one copy. The final piece is a prefix of the
// word list extended forward to the next space (scanning from index
// length + 1), so the result ends on a word boundary.
static std::string PseudoGenerateExpansion(const unsigned int length) {
  const size_t list_size = kExpansionString.size();
  std::string expansion;
  size_t remaining = length;

  // Each full copy consumes the list itself plus the separating space.
  while (list_size < remaining) {
    expansion += kExpansionString;
    expansion += " ";
    remaining -= list_size + 1;
  }

  // Extend the cut to the next space so that only whole words are kept.
  size_t cut = remaining;
  for (size_t idx = remaining + 1; idx < list_size; ++idx) {
    ++cut;
    if (kExpansionString[idx] == ' ') {
      break;
    }
  }
  expansion.append(kExpansionString, 0, cut);
  return expansion;
}
// Begins a message: resets the per-message counters, increases the nesting
// depth and returns "[" when the message is the outermost one (empty otherwise).
std::string PseudoMethodAccent::Start() {
  const bool outermost = (depth_ == 0);
  ++depth_;
  length_ = 0;
  word_count_ = 0;
  return outermost ? std::string("[") : std::string();
}
// Ends a message. When any characters were counted, returns a space followed
// by expansion text sized to the full counted length if more than three words
// were tallied, or to half of it otherwise. Resets the counters, decreases the
// nesting depth and appends "]" once the outermost message is closed.
std::string PseudoMethodAccent::End() {
  std::string suffix;
  if (length_ != 0) {
    const size_t expansion_length = (word_count_ > 3) ? length_ : length_ / 2;
    suffix.append(" ");
    suffix.append(PseudoGenerateExpansion(expansion_length));
  }
  length_ = 0;
  word_count_ = 0;
  if (--depth_ == 0) {
    suffix.append("]");
  }
  return suffix;
}
/**
 * Converts characters so they look like they've been localized.
 *
 * Three kinds of input are distinguished:
 *  - '%' starts printf-style placeholder syntax. It is copied verbatim and
 *    wrapped with Placeholder(), unless the chunk ends in '%'.
 *  - '<' and '&' start HTML tag / entity syntax, which is copied verbatim.
 *  - Everything else is replaced through PseudolocalizeChar() where a mapping
 *    exists and counted in length_.
 *
 * Note: This leaves placeholder syntax untouched.
 *
 * Reads never go past source.size(); the view is not assumed to be
 * NUL-terminated.
 */
std::string PseudoMethodAccent::Text(const StringPiece& source) {
  const char* s = source.data();
  std::string result;
  const size_t I = source.size();
  bool lastspace = true;
  for (size_t i = 0; i < I; i++) {
    char c = s[i];
    if (c == '%') {
      // Placeholder syntax, no need to pseudolocalize
      std::string chunk;
      bool end = false;
      chunk.append(&c, 1);
      while (!end && i + 1 < I) {
        ++i;
        c = s[i];
        chunk.append(&c, 1);
        if (IsPossibleNormalPlaceholderEnd(c)) {
          end = true;
        } else if (i + 1 < I && c == 't') {
          // 't' is followed by one more conversion character; take it as well.
          ++i;
          c = s[i];
          chunk.append(&c, 1);
          end = true;
        }
      }
      // Treat chunk as a placeholder unless it ends with %.
      result += ((c == '%') ? chunk : Placeholder(chunk));
    } else if (c == '<' || c == '&') {
      // html syntax, no need to pseudolocalize
      bool tag_closed = false;
      while (!tag_closed && i < I) {
        if (c == '&') {
          std::string escape_text;
          escape_text.append(&c, 1);
          bool end = false;
          size_t html_code_pos = i;
          // Stop before the index would reach I: the original condition
          // (html_code_pos < I) read one character past the end of the view.
          while (!end && html_code_pos + 1 < I) {
            ++html_code_pos;
            c = s[html_code_pos];
            escape_text.append(&c, 1);
            // Valid html code
            if (c == ';') {
              end = true;
              i = html_code_pos;
            }
            // Wrong html code
            else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
                        (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
              end = true;
            }
          }
          // NOTE(review): when the entity is not terminated by ';', `i` is not
          // advanced, so the characters copied here are visited again by the
          // outer loop. Kept as-is to preserve existing output.
          result += escape_text;
          // "&lt;" opens an escaped tag, so keep copying until it is closed.
          if (escape_text != "&lt;") {
            tag_closed = true;
          }
          continue;
        }
        if (c == '>') {
          tag_closed = true;
          result.append(&c, 1);
          continue;
        }
        result.append(&c, 1);
        i++;
        // Only fetch the next character while it is still inside the view.
        if (i < I) {
          c = s[i];
        }
      }
    } else {
      // This is a pure text that should be pseudolocalized
      const char* p = PseudolocalizeChar(c);
      if (p != nullptr) {
        result += p;
      } else {
        // isspace() requires a value representable as unsigned char; bytes of
        // multi-byte UTF-8 sequences are negative when char is signed.
        bool space = isspace(static_cast<unsigned char>(c));
        // NOTE(review): word starts are only detected on characters without
        // an accent mapping, because this branch is skipped for mapped ones.
        if (lastspace && !space) {
          word_count_++;
        }
        lastspace = space;
        result.append(&c, 1);
      }
      // Count only pseudolocalizable chars and delimiters
      length_++;
    }
  }
  return result;
}
// Returns `source` enclosed between kPlaceholderOpen and kPlaceholderClose.
std::string PseudoMethodAccent::Placeholder(const StringPiece& source) {
  std::string bracketed(kPlaceholderOpen);
  bracketed += source.to_string();
  bracketed += kPlaceholderClose;
  return bracketed;
}
// Wraps every word of `source` in RLM+RLO ... PDF+RLM so that it renders
// right-to-left. Words are separated by whitespace characters and by the
// two-character escape sequences \n and \t; all input characters are copied
// to the output unchanged.
std::string PseudoMethodBidi::Text(const StringPiece& source) {
  const char* s = source.data();
  const size_t length = source.size();
  std::string result;
  bool lastspace = true;
  bool space = true;
  bool escape = false;
  const char ESCAPE_CHAR = '\\';
  for (size_t i = 0; i < length; i++) {
    char c = s[i];
    // A backslash starts an escape sequence only when another character
    // follows. A lone trailing backslash is treated as ordinary text; it used
    // to be dropped from the output.
    if (!escape && c == ESCAPE_CHAR && i + 1 < length) {
      escape = true;
      continue;
    }
    // isspace() requires a value representable as unsigned char; bytes of
    // multi-byte UTF-8 sequences are negative when char is signed.
    space = (!escape && isspace(static_cast<unsigned char>(c))) ||
            (escape && (c == 'n' || c == 't'));
    if (lastspace && !space) {
      // Word start
      result += kRlm + kRlo;
    } else if (!lastspace && space) {
      // Word end
      result += kPdf + kRlm;
    }
    lastspace = space;
    if (escape) {
      // Re-emit the backslash that was held back, after any word marker.
      result.append(&ESCAPE_CHAR, 1);
      escape = false;
    }
    result.append(&c, 1);
  }
  if (!lastspace) {
    // End of last word
    result += kPdf + kRlm;
  }
  return result;
}
// Returns `source` preceded by RLM+RLO and followed by PDF+RLM, i.e. the whole
// placeholder is enclosed in a single directionality override.
std::string PseudoMethodBidi::Placeholder(const StringPiece& source) {
  std::string wrapped(kRlm);
  wrapped += kRlo;
  wrapped += source.to_string();
  wrapped += kPdf;
  wrapped += kRlm;
  return wrapped;
}
} // namespace aapt