Skip to content

Commit 51c926a

Browse files
committed Oct 14, 2016
mds: fix false "failing to respond to cache pressure" warning
The false warning happens in the following sequence of events:
- The MDS has cache pressure and sends recall state messages to clients.
- A client does not trim as many caps as the MDS expected, so the MDS does not reset session->recalled_at.
- The MDS no longer has cache pressure, so it stops sending recall state messages to clients.
- The client does not release its caps, so session->recalled_at in the MDS stays unchanged.
As a result, the stale session->recalled_at eventually falls behind the recall timeout cutoff and the warning is reported even though no recall is in progress.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
1 parent a4ce1f5 commit 51c926a

File tree

5 files changed

+26
-10
lines changed

5 files changed

+26
-10
lines changed
 

‎src/mds/Beacon.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,10 @@ void Beacon::notify_health(MDSRank const *mds)
385385
{
386386
set<Session*> sessions;
387387
mds->sessionmap.get_client_session_set(sessions);
388+
388389
utime_t cutoff = ceph_clock_now(g_ceph_context);
389390
cutoff -= g_conf->mds_recall_state_timeout;
391+
utime_t last_recall = mds->mdcache->last_recall_state;
390392

391393
std::list<MDSHealthMetric> late_recall_metrics;
392394
std::list<MDSHealthMetric> large_completed_requests_metrics;
@@ -396,7 +398,10 @@ void Beacon::notify_health(MDSRank const *mds)
396398
dout(20) << "Session servicing RECALL " << session->info.inst
397399
<< ": " << session->recalled_at << " " << session->recall_release_count
398400
<< "/" << session->recall_count << dendl;
399-
if (session->recalled_at < cutoff) {
401+
if (last_recall < cutoff || session->last_recall_sent < last_recall) {
402+
dout(20) << " no longer recall" << dendl;
403+
session->clear_recalled_at();
404+
} else if (session->recalled_at < cutoff) {
400405
dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
401406
std::ostringstream oss;
402407
oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";

‎src/mds/MDCache.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7334,10 +7334,11 @@ void MDCache::check_memory_usage()
73347334

73357335
if (num_inodes_with_caps > g_conf->mds_cache_size) {
73367336
float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
7337-
if (ratio < 1.0)
7337+
if (ratio < 1.0) {
7338+
last_recall_state = ceph_clock_now(g_ceph_context);
73387339
mds->server->recall_client_state(ratio);
7340+
}
73397341
}
7340-
73417342
}
73427343

73437344

‎src/mds/MDCache.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,8 @@ class MDCache {
693693
void trim_client_leases();
694694
void check_memory_usage();
695695

696+
utime_t last_recall_state;
697+
696698
// shutdown
697699
void shutdown_start();
698700
void shutdown_check();

‎src/mds/SessionMap.cc

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -789,11 +789,8 @@ void Session::notify_cap_release(size_t n_caps)
789789
{
790790
if (!recalled_at.is_zero()) {
791791
recall_release_count += n_caps;
792-
if (recall_release_count >= recall_count) {
793-
recalled_at = utime_t();
794-
recall_count = 0;
795-
recall_release_count = 0;
796-
}
792+
if (recall_release_count >= recall_count)
793+
clear_recalled_at();
797794
}
798795
}
799796

@@ -808,13 +805,22 @@ void Session::notify_recall_sent(int const new_limit)
808805
if (recalled_at.is_zero()) {
809806
// Entering recall phase, set up counters so we can later
810807
// judge whether the client has respected the recall request
811-
recalled_at = ceph_clock_now(g_ceph_context);
808+
recalled_at = last_recall_sent = ceph_clock_now(g_ceph_context);
812809
assert (new_limit < caps.size()); // Behaviour of Server::recall_client_state
813810
recall_count = caps.size() - new_limit;
814811
recall_release_count = 0;
812+
} else {
813+
last_recall_sent = ceph_clock_now(g_ceph_context);
815814
}
816815
}
817816

817+
void Session::clear_recalled_at()
818+
{
819+
recalled_at = last_recall_sent = utime_t();
820+
recall_count = 0;
821+
recall_release_count = 0;
822+
}
823+
818824
void Session::set_client_metadata(map<string, string> const &meta)
819825
{
820826
info.client_metadata = meta;

‎src/mds/SessionMap.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ class Session : public RefCountedObject {
129129

130130
// Ephemeral state for tracking progress of capability recalls
131131
utime_t recalled_at; // When was I asked to SESSION_RECALL?
132+
utime_t last_recall_sent;
132133
uint32_t recall_count; // How many caps was I asked to SESSION_RECALL?
133134
uint32_t recall_release_count; // How many caps have I actually revoked?
134135

@@ -148,6 +149,7 @@ class Session : public RefCountedObject {
148149

149150
void notify_cap_release(size_t n_caps);
150151
void notify_recall_sent(int const new_limit);
152+
void clear_recalled_at();
151153

152154
inodeno_t next_ino() {
153155
if (info.prealloc_inos.empty())
@@ -315,7 +317,7 @@ class Session : public RefCountedObject {
315317

316318
Session() :
317319
state(STATE_CLOSED), state_seq(0), importing_count(0),
318-
recalled_at(), recall_count(0), recall_release_count(0),
320+
recall_count(0), recall_release_count(0),
319321
auth_caps(g_ceph_context),
320322
connection(NULL), item_session_list(this),
321323
requests(0), // member_offset passed to front() manually

0 commit comments

Comments
 (0)
Please sign in to comment.