382 lines
14 KiB
C++
Raw Normal View History

/*
* Copyright (C) 2011 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A service that exchanges time synchronization information between
* a master that defines a timeline and clients that follow the timeline.
*/
#define __STDC_LIMIT_MACROS
#define LOG_TAG "common_time"
#include <utils/Log.h>
#include <stdint.h>
#include <common_time/local_clock.h>
#include <assert.h>
#include "clock_recovery.h"
#include "common_clock.h"
#ifdef TIME_SERVICE_DEBUG
#include "diag_thread.h"
#endif
namespace android {
// Builds a clock recovery loop which disciplines |common_clock| using
// |local_clock| as its reference.
//
// Both pointers must be non-NULL; this is only enforced by assert.  The
// constructor probes whether the local clock can be slewed directly, computes
// the PID controller parameters and then resets both the position and the
// frequency state of the loop.  When TIME_SERVICE_DEBUG is defined it also
// creates and starts the diagnostic thread; failures there are logged but are
// not fatal.
ClockRecoveryLoop::ClockRecoveryLoop(LocalClock* local_clock,
                                     CommonClock* common_clock) {
    assert(NULL != local_clock);
    assert(NULL != common_clock);

    local_clock_  = local_clock;
    common_clock_ = common_clock;

    // The local clock is only considered slewable if it initialized properly
    // and accepts a request for a slew of zero.
    local_clock_can_slew_ = false;
    if (local_clock_->initCheck())
        local_clock_can_slew_ = (local_clock_->setLocalSlew(0) == OK);

    computePIDParams();
    reset(true, true);

#ifdef TIME_SERVICE_DEBUG
    diag_thread_ = new DiagThread(common_clock_, local_clock_);
    const bool diag_allocated = (diag_thread_ != NULL);
    if (!diag_allocated) {
        LOGW("Failed to allocate diagnostic thread.");
    } else if (diag_thread_->startWorkThread() != OK) {
        LOGW("Failed to start A@H clock recovery diagnostic thread.");
    }
#endif
}
// Tears down the recovery loop.  In TIME_SERVICE_DEBUG builds this stops the
// diagnostic work thread, if one was successfully allocated.
ClockRecoveryLoop::~ClockRecoveryLoop() {
#ifdef TIME_SERVICE_DEBUG
    // The constructor tolerates a failed diagnostic thread allocation (it only
    // logs a warning), so the pointer must be checked before it is used here.
    if (diag_thread_ != NULL)
        diag_thread_->stopWorkThread();
#endif
}
// Public, thread-safe entry point for resetting the loop.  Acquires lock_ and
// forwards to reset_l().
//
// position  - when true, the common clock basis and the startup filter are
//             reset.
// frequency - when true, the controller state and the current correction are
//             reset.
void ClockRecoveryLoop::reset(bool position, bool frequency) {
    Mutex::Autolock autolock(&lock_);

    reset_l(position, frequency);
}
// Returns the index of the entry with the smallest rtt among the first |count|
// entries of |data|.  When several entries share the smallest rtt, the lowest
// index wins.  Returns 0 (without touching |data|) when |count| is 0 or 1.
uint32_t ClockRecoveryLoop::findMinRTTNdx(DisciplineDataPoint* data,
                                          uint32_t count) {
    uint32_t best = 0;
    uint32_t candidate = 1;

    while (candidate < count) {
        // Strictly-less comparison so that earlier entries win ties.
        if (data[candidate].rtt < data[best].rtt)
            best = candidate;
        ++candidate;
    }

    return best;
}
bool ClockRecoveryLoop::pushDisciplineEvent(int64_t local_time,
int64_t nominal_common_time,
int64_t rtt) {
Mutex::Autolock lock(&lock_);
// If we have not defined a basis for common time, then we need to use these
// initial points to do so. In order to avoid significant initial error
// from a particularly bad startup data point, we collect the first N data
// points and choose the best of them before moving on.
if (!common_clock_->isValid()) {
if (startup_filter_wr_ < kStartupFilterSize) {
DisciplineDataPoint& d = startup_filter_data_[startup_filter_wr_];
d.local_time = local_time;
d.nominal_common_time = nominal_common_time;
d.rtt = rtt;
startup_filter_wr_++;
}
if (startup_filter_wr_ == kStartupFilterSize) {
uint32_t min_rtt = findMinRTTNdx(startup_filter_data_,
kStartupFilterSize);
common_clock_->setBasis(
startup_filter_data_[min_rtt].local_time,
startup_filter_data_[min_rtt].nominal_common_time);
}
return true;
}
int64_t observed_common;
int64_t delta;
int32_t delta32;
int32_t correction_cur;
int32_t correction_cur_P = 0;
int32_t correction_cur_I = 0;
int32_t correction_cur_D = 0;
if (OK != common_clock_->localToCommon(local_time, &observed_common)) {
// Since we just checked to make certain that this conversion was valid,
// and no one else in the system should be messing with it, if this
// conversion is suddenly invalid, it is a good reason to panic.
LOGE("Failed to convert local time to common time in %s:%d",
__PRETTY_FUNCTION__, __LINE__);
return false;
}
// Implement a filter which should match NTP filtering behavior when a
// client is associated with only one peer of lower stratum. Basically,
// always use the best of the N last data points, where best is defined as
// lowest round trip time. NTP uses an N of 8; we use a value of 6.
//
// TODO(johngro) : experiment with other filter strategies. The goal here
// is to mitigate the effects of high RTT data points which typically have
// large asymmetries in the TX/RX legs. Downside of the existing NTP
// approach (particularly because of the PID controller we are using to
// produce the control signal from the filtered data) are that the rate at
// which discipline events are actually acted upon becomes irregular and can
// become drawn out (the time between actionable event can go way up). If
// the system receives a strong high quality data point, the proportional
// component of the controller can produce a strong correction which is left
// in place for too long causing overshoot. In addition, the integral
// component of the system currently is an approximation based on the
// assumption of a more or less homogeneous sampling of the error. Its
// unclear what the effect of undermining this assumption would be right
// now.
// Two ideas which come to mind immediately would be to...
// 1) Keep a history of more data points (32 or so) and ignore data points
// whose RTT is more than a certain number of standard deviations outside
// of the norm.
// 2) Eliminate the PID controller portion of this system entirely.
// Instead, move to a system which uses a very wide filter (128 data
// points or more) with a sum-of-least-squares line fitting approach to
// tracking the long term drift. This would take the place of the I
// component in the current PID controller. Also use a much more narrow
// outlier-rejector filter (as described in #1) to drive a short term
// correction factor similar to the P component of the PID controller.
assert(filter_wr_ < kFilterSize);
filter_data_[filter_wr_].local_time = local_time;
filter_data_[filter_wr_].observed_common_time = observed_common;
filter_data_[filter_wr_].nominal_common_time = nominal_common_time;
filter_data_[filter_wr_].rtt = rtt;
filter_data_[filter_wr_].point_used = false;
filter_wr_ = (filter_wr_ + 1) % kFilterSize;
if (!filter_wr_)
filter_full_ = true;
// Scan the accumulated data for the point with the minimum RTT. If that
// point has never been used before, go ahead and use it now, otherwise just
// do nothing.
uint32_t scan_end = filter_full_ ? kFilterSize : filter_wr_;
uint32_t min_rtt = findMinRTTNdx(filter_data_, scan_end);
if (filter_data_[min_rtt].point_used)
return true;
local_time = filter_data_[min_rtt].local_time;
observed_common = filter_data_[min_rtt].observed_common_time;
nominal_common_time = filter_data_[min_rtt].nominal_common_time;
filter_data_[min_rtt].point_used = true;
// Compute the error then clamp to the panic threshold. If we ever exceed
// this amt of error, its time to panic and reset the system. Given that
// the error in the measurement of the error could be as high as the RTT of
// the data point, we don't actually panic until the implied error (delta)
// is greater than the absolute panic threashold plus the RTT. IOW - we
// don't panic until we are absoluely sure that our best case sync is worse
// than the absolute panic threshold.
int64_t effective_panic_thresh = panic_thresh_ + filter_data_[min_rtt].rtt;
delta = nominal_common_time - observed_common;
if ((delta > effective_panic_thresh) || (delta < -effective_panic_thresh)) {
// PANIC!!!
//
// TODO(johngro) : need to report this to the upper levels of
// code.
reset_l(false, true);
return false;
} else
delta32 = delta;
// Accumulate error into the integrated error, then clamp.
integrated_error_ += delta32;
if (integrated_error_ > pid_params_.integrated_delta_max)
integrated_error_ = pid_params_.integrated_delta_max;
else if (integrated_error_ < pid_params_.integrated_delta_min)
integrated_error_ = pid_params_.integrated_delta_min;
// Compute the difference in error between last time and this time, then
// update last_delta_
int32_t input_D = last_delta_valid_ ? delta32 - last_delta_ : 0;
last_delta_valid_ = true;
last_delta_ = delta32;
// Compute the various components of the correction value.
correction_cur_P = doGainScale(pid_params_.gain_P, delta32);
correction_cur_I = doGainScale(pid_params_.gain_I, integrated_error_);
// TODO(johngro) : the differential portion of this code used to rely
// upon a completely homogeneous discipline frequency. Now that the
// discipline frequency may not be homogeneous, its probably important
// to divide by the amt of time between discipline events during the
// gain calculation.
correction_cur_D = doGainScale(pid_params_.gain_D, input_D);
// Compute the final correction value and clamp.
correction_cur = correction_cur_P + correction_cur_I + correction_cur_D;
if (correction_cur < pid_params_.correction_min)
correction_cur = pid_params_.correction_min;
else if (correction_cur > pid_params_.correction_max)
correction_cur = pid_params_.correction_max;
// If there was a change in the amt of correction to use, update the
// system.
if (correction_cur_ != correction_cur) {
correction_cur_ = correction_cur;
applySlew();
}
LOGV("rtt %lld observed %lld nominal %lld delta = %5lld "
"int = %7d correction %5d (P %5d, I %5d, D %5d)\n",
filter_data_[min_rtt].rtt,
observed_common,
nominal_common_time,
nominal_common_time - observed_common,
integrated_error_,
correction_cur,
correction_cur_P,
correction_cur_I,
correction_cur_D);
#ifdef TIME_SERVICE_DEBUG
diag_thread_->pushDisciplineEvent(
local_time,
observed_common,
nominal_common_time,
correction_cur,
correction_cur_P,
correction_cur_I,
correction_cur_D);
#endif
return true;
}
Implement new common_time service functionality. Major re-factor of the common_time (formerly aah_timesrv) service in preparation for up-integration into Android master. This work includes bug fixes, new features, and general code cleanup. High points are listed below. + CommonClock interface has been enhanced to allow querying of many more low level synchronization details; mostly for debugging, but in theory useful to an application as well. + CommonTimeConfig interface has been implemented. This allows a management process to configure a number of different parameters (many of them new) to control the behavior of the common_time service. Most importantly, the time service can be bound to a specific network interface and should only operate on that interface and no others. + Enhance log messages to be more useful in determining what the time service state machine is doing and why. + Enhance information provided by dumpsys to provide many more details about the quality of time sync and the network conditions which gave rise to the current quality conditions. Features, features, features.... + Add a feature which lets the high level choose a different master election endpoint so that multiple time synchronization domains can co-exist on the same subnet (mostly to support a potential use case of multiple home domains in a multiple dwelling environment like a hotel, dormitory or apartment complex). + Add a feature which lets the high level assign a 64-bit group ID which allows partitioning of time synchronization domains even when the master election endpoint is shared (as it might be if broadcast is being used instead of multicast) + Add an auto-disable feature which lets the time service drop into network-less mode when there are no active clients of the common_time service in the device. Mostly for phones, this allows phones to not consume network/battery resources when they don't need to maintain common time. 
+ Add a feature which lets the high level choose the priority of the common_time service in the master election protocol. This allows high level decisions about things like mobile vs non-mobile, wired ethernet vs WiFi to affect who ends up with the job of master on a given network. Priority overrides at the low level also allow clients coming in from network-less mode to lower their effective priority as they join a new network so as to not disrupt any stable long-running timeline which may already be active on the network. + Add the ability to control some of the core parameters of the time sync service which affect network load (like the sync polling interval and the master announce interval) Change-Id: I71af15a83cfa5ef0417b406928967fb9e02f55c6
2012-01-20 12:12:59 -08:00
// Returns the most recent error (delta) acted upon by the loop, or
// ICommonClock::kErrorEstimateUnknown if no delta has been recorded since the
// frequency state was last reset.  Thread-safe; takes lock_.
int32_t ClockRecoveryLoop::getLastErrorEstimate() {
    Mutex::Autolock autolock(&lock_);

    if (!last_delta_valid_)
        return ICommonClock::kErrorEstimateUnknown;

    return last_delta_;
}
void ClockRecoveryLoop::computePIDParams() {
// TODO(johngro) : add the ability to fetch parameters from the driver/board
// level in case they have a HW clock discipline solution with parameters
// tuned specifically for it.
// Correction factor is limited to MIN/MAX_INT_16
pid_params_.correction_min = -0x8000;
pid_params_.correction_max = 0x7FFF;
// Default proportional gain to 2^15:1000. (max proportional drive at 1mSec
// of instantaneous error)
memset(&pid_params_.gain_P, 0, sizeof(pid_params_.gain_P));
pid_params_.gain_P.a_to_b_numer = 0x8000;
pid_params_.gain_P.a_to_b_denom = 1000;
// Set the integral gain to 2^15:5000
memset(&pid_params_.gain_I, 0, sizeof(pid_params_.gain_I));
pid_params_.gain_I.a_to_b_numer = 0x8000;
pid_params_.gain_I.a_to_b_denom = 5000;
// Default controller is just a PI controller. Right now, the network based
// measurements of the error are way to noisy to feed into the differential
// component of a PID controller. Someday we might come back and add some
// filtering of the error channel, but until then leave the controller as a
// simple PI controller.
memset(&pid_params_.gain_D, 0, sizeof(pid_params_.gain_D));
// Don't let the integral component of the controller wind up to
// the point where it would want to drive the correction factor
// past saturation.
int64_t tmp;
pid_params_.gain_I.doReverseTransform(pid_params_.correction_min, &tmp);
pid_params_.integrated_delta_min = static_cast<int32_t>(tmp);
pid_params_.gain_I.doReverseTransform(pid_params_.correction_max, &tmp);
pid_params_.integrated_delta_max = static_cast<int32_t>(tmp);
// By default, panic when are certain that the sync error is > 20mSec;
panic_thresh_ = 20000;
}
// Resets loop state; the caller is expected to hold lock_ (see reset()).
//
// position  - when true, the common clock basis is reset and the startup
//             filter is emptied.
// frequency - when true, the controller state (last delta, integrated error)
//             is cleared and a correction of zero is applied.
//
// The steady state filter is emptied regardless of the arguments.
void ClockRecoveryLoop::reset_l(bool position, bool frequency) {
    assert(NULL != common_clock_);

    // Unconditional: throw away whatever is in the steady state filter.
    filter_full_ = false;
    filter_wr_ = 0;

    if (position) {
        common_clock_->resetBasis();
        startup_filter_wr_ = 0;
    }

    if (frequency) {
        integrated_error_ = 0;
        last_delta_ = 0;
        last_delta_valid_ = false;

        // Push the zeroed correction out to the clock.
        correction_cur_ = 0;
        applySlew();
    }
}
// Scales |val| by |gain| and returns the result saturated to the int32_t range.
//
// Returns 0 when the gain's numerator or denominator is zero, or when |val| is
// zero.  If the transform itself reports failure, the result saturates in the
// direction of the sign of |val|.  Every saturation is logged as a warning.
int32_t ClockRecoveryLoop::doGainScale(const LinearTransform& gain,
                                       int32_t val) {
    const bool gain_is_usable = gain.a_to_b_numer && gain.a_to_b_denom;
    if (!gain_is_usable || (0 == val))
        return 0;

    int64_t wide_val = val;
    int64_t scaled;
    if (!gain.doForwardTransform(wide_val, &scaled)) {
        LOGW("Overflow/Underflow while scaling %d in %s",
             val, __PRETTY_FUNCTION__);
        return (val < 0) ? INT32_MIN : INT32_MAX;
    }

    // The transform succeeded in 64 bits; now squeeze the result into 32.
    if (scaled > INT32_MAX) {
        LOGW("Overflow while scaling %d in %s", val, __PRETTY_FUNCTION__);
        return INT32_MAX;
    }

    if (scaled < INT32_MIN) {
        LOGW("Underflow while scaling %d in %s", val, __PRETTY_FUNCTION__);
        return INT32_MIN;
    }

    return static_cast<int32_t>(scaled);
}
// Pushes the current correction (correction_cur_) out to whichever clock is
// doing the slewing: the local clock when it supports slewing, otherwise the
// common clock's software slew.
void ClockRecoveryLoop::applySlew() {
    if (local_clock_can_slew_) {
        local_clock_->setLocalSlew(correction_cur_);
        return;
    }

    // The SW clock recovery implemented by the common clock class expects
    // values expressed in PPM.  Map the MIN/MAX_INT_16 drive range to +/-
    // 100ppm.
    const int drive_span = pid_params_.correction_max -
                           pid_params_.correction_min;
    int ppm = correction_cur_ - pid_params_.correction_min;
    ppm = (ppm * 200) / drive_span;
    ppm -= 100;

    common_clock_->setSlew(local_clock_->getLocalTime(), ppm);
}
} // namespace android