openbsd-ports/sysutils/fam/files/IMonKQueue.c++

460 lines
17 KiB
C++
Raw Normal View History

// $OpenBSD: IMonKQueue.c++,v 1.1.1.1 2007/04/27 22:00:55 jasper Exp $
// $NetBSD: IMonKQueue.c++,v 1.3 2005/01/05 16:21:06 jmmv Exp $
//
// Copyright (c) 2004, 2005 Julio M. Merino Vidal.
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of version 2 of the GNU General Public License as
// published by the Free Software Foundation.
//
// This program is distributed in the hope that it would be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Further, any
// license provided herein, whether implied or otherwise, is limited to
// this program in accordance with the express provisions of the GNU
// General Public License. Patent licenses, if any, provided herein do not
// apply to combinations of this program with other product or programs, or
// any other product whatsoever. This program is distributed without any
// warranty that the program is delivered free of the rightful claim of any
// third person by way of infringement or the like. See the GNU General
// Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, write the Free Software Foundation, Inc., 59
// Temple Place - Suite 330, Boston MA 02111-1307, USA.
// ------------------------------------------------------------------------
// imon emulation through kqueue
// -----------------------------
//
// The code in this file provides an imon-like interface to FAM using kqueue,
// the kernel event notification mechanism found in FreeBSD, NetBSD and
// OpenBSD.
//
// The idea is the following: a thread, kqueue_monitor, simulates the kernel
// part of imon. This thread can receive commands (ioctl(2)s) and produces
// notifications when there is something to notify. The thread is constantly
// running in the background, calling kevent(2) to see if there are new
// events in the monitored files since the last call.
//
// Communication with kqueue_monitor is accomplished by using two pipes.
// pipe_out is used by the monitor to provide notifications; i.e., it is the
// same as the read end of the regular /dev/imon device, and produces
// compatible messages. On the other hand we have pipe_in, which is used
// to give commands to the monitor (express and revoke); we can't emulate
// ioctl(2)s from user space, so we have to go this route.
//
// Why do we use pipe_in to give commands to the thread instead of
// mutexes? If we used mutexes, we'd have to give kevent(2) a timeout to
// let it "reload" the list of changes to be monitored in case it was
// externally modified. By using a pipe, we can tell kqueue(2) to monitor
// it for us, and let kevent(2) return immediately when there is a command
// to process.
//
// However, there is a little problem when using kqueue instead of imon or
// polling. kqueue(2) works by monitoring open file descriptors, instead
// of inodes on the disk. Therefore we must keep all files being monitored
// open, and the number of open files can quickly rise in some environments.
// This is why the code unlimits the number of open files in imon_open and
// sets a reasonable maximum based on kern.maxfiles (to avoid overflowing
// it quickly). If we overflow this limit, the poller will enter the game
// (because we will return an error).
//
// Known problem: if we receive *lots* of events quickly, famd may end up
// locked. To reproduce, run the test program provided by fam against a
// local directory, say /tmp/foo, and do the following:
// cd /tmp/foo; for f in $(jot 1000); do touch $(jot 100); rm *; done
// You should receive some messages like:
// famd[21058]: kqueue can't revoke "75", dev = 0, ino = 1113421
// while the test is running (not a lot), and it will eventually lock up.
//
// Having said all this, let's go to the code...
// ------------------------------------------------------------------------
#include "IMon.h"
#include "Log.h"
#include "config.h"
#include "imon-compat.h"
#include <sys/event.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>
#include <map>
// ------------------------------------------------------------------------
// devino is a structure that holds a device/inode pair. It is used as an
// identifier of files managed by imon.
struct devino {
    dev_t di_dev;   // device number
    ino_t di_ino;   // inode number

    // Orders pairs by device first and by inode second, which lets the
    // structure serve as a std::map key.
    bool operator<(const struct devino& other) const
    {
        if (di_dev != other.di_dev)
            return di_dev < other.di_dev;
        return di_ino < other.di_ino;
    }
};
// imon_cmd simulates commands thrown to imon as ioctl(2)s (but remember
// we use a pipe).
struct imon_cmd {
#define IMON_CMD_EXPRESS 0
#define IMON_CMD_REVOKE 1
int ic_type;
// imon identifies files through a device/inode pair.
struct devino ic_di;
// A pipe that will be used to receive the result of the command
// (asynchronously).
int ic_stat[2];
// If this is an 'express' command, we need the descriptor to monitor.
int ic_fd;
};
// ------------------------------------------------------------------------
static int max_changes;
static int last_change;
static int kqueue_fd;
static int pipe_in[2], pipe_out[2];
static pthread_t kevent_thread;
static struct kevent *changes;
typedef std::map<struct devino, int> DEVINOFD_MAP;
static DEVINOFD_MAP devino_to_fd;
typedef std::map<int, struct devino> FDDEVINO_MAP;
static FDDEVINO_MAP fd_to_devino;
// ------------------------------------------------------------------------
static void *kqueue_monitor(void *data);
static void process_command(void);
// ------------------------------------------------------------------------
int
IMon::imon_open(void)
{
// Get the kernel event queue. We only need one during all the life
// of famd.
kqueue_fd = kqueue();
if (kqueue_fd == -1)
return -1;
// Create "emulation" pipes.
if (pipe(pipe_in) == -1) {
close(kqueue_fd);
return -1;
}
if (pipe(pipe_out) == -1) {
close(kqueue_fd);
close(pipe_in[0]); close(pipe_in[1]);
return -1;
}
// Get the maximum number of files we can open and use it to set a
// limit of the files we can monitor.
size_t len = sizeof(max_changes);
int mib[2];
mib[0] = CTL_KERN;
mib[1] = KERN_MAXFILES;
if (sysctl(mib, 2, &max_changes, &len, NULL, 0) == -1)
max_changes = 128;
else
max_changes /= 2;
// Unlimit maximum number of open files. We don't go to RLIM_INFINITY
// to avoid possible open descriptor leaks produce a system DoS. 75%
// of the system limit seems a good number (we request more than the
// number calculated previously to leave room for temporary pipes).
// We need to be root to do this.
uid_t olduid = geteuid();
seteuid(0);
struct rlimit rlp;
rlp.rlim_cur = rlp.rlim_max = max_changes * 3 / 2;
if (setrlimit(RLIMIT_NOFILE, &rlp) == -1)
Log::error("can't unlimit number of open files");
seteuid(olduid);
changes = new struct kevent[max_changes];
// We must monitor pipe_in for any commands that may alter the actual
// set of files being monitored.
EV_SET(&changes[0], pipe_in[0], EVFILT_READ,
EV_ADD | EV_ENABLE | EV_ONESHOT, 0, 0, 0);
last_change = 1;
// Create a thread that will run the kevent(2) function continuously.
if (pthread_create(&kevent_thread, NULL, kqueue_monitor, NULL) != 0) {
close(kqueue_fd);
close(pipe_in[0]); close(pipe_in[1]);
close(pipe_out[0]); close(pipe_out[1]);
return -1;
}
return pipe_out[0];
}
// ------------------------------------------------------------------------
// Expresses interest in the file 'name'.
//
// The file is lstat(2)ed into 'status' (a local buffer is used when
// 'status' is NULL), opened read-only, and an 'express' command carrying
// the descriptor is sent to the kqueue_monitor thread through pipe_in.
// The call then waits for the thread's answer on a per-command pipe.
//
// Returns OK when the monitor thread accepted the command, BAD otherwise.
IMon::Status
IMon::imon_express(const char *name, struct stat *status)
{
    // Get file information.
    struct stat sb;
    if (status == NULL)
        status = &sb;
    if (lstat(name, status) == -1)
        return BAD;

    // Open the file to be monitored; kqueue only works with open descriptors
    // so we have to keep this descriptor during the life of this 'interest'.
    int fd = open(name, O_RDONLY);
    if (fd == -1)
        return BAD;

    // Construct a command to 'express' interest in a file. This will be
    // handled by the kqueue_monitor thread as soon as possible. The
    // structure is zeroed first so no indeterminate bytes go down the pipe.
    struct imon_cmd cmd;
    memset(&cmd, 0, sizeof(cmd));
    cmd.ic_type = IMON_CMD_EXPRESS;
    cmd.ic_di.di_dev = status->st_dev;
    cmd.ic_di.di_ino = status->st_ino;
    cmd.ic_fd = fd;
    if (pipe(cmd.ic_stat) == -1) {
        close(fd);
        return BAD;
    }

    // Send the command, retrying if a signal interrupts the call.
    ssize_t n;
    do {
        n = write(pipe_in[1], &cmd, sizeof(cmd));
    } while (n == -1 && errno == EINTR);
    if (n != static_cast<ssize_t>(sizeof(cmd))) {
        Log::perror("write");
        close(cmd.ic_stat[0]); close(cmd.ic_stat[1]);
        close(fd);
        return BAD;
    }

    // Wait for a result from the previous operation.
    bool result = false;
    do {
        n = read(cmd.ic_stat[0], &result, sizeof(result));
    } while (n == -1 && errno == EINTR);
    close(cmd.ic_stat[0]); close(cmd.ic_stat[1]);
    if (n != static_cast<ssize_t>(sizeof(result))) {
        // The command was delivered but no answer came back, so we cannot
        // tell whether the monitor thread took the descriptor. It is left
        // open on purpose: closing a descriptor the thread still tracks
        // would leave its tables pointing at a reusable descriptor number.
        Log::error("kqueue gave no answer when asked to monitor \"%s\"",
                   name);
        return BAD;
    }
    if (!result) {
        close(fd);
        Log::error("kqueue can't monitor more than %d files", max_changes);
        return BAD;
    }

    // dev_t and ino_t vary in width between systems; cast them so the
    // arguments always match the conversions in the format string.
    Log::debug("told kqueue to monitor \"%s\", descriptor = %d, dev = %lld, "
               "ino = %lld", name, cmd.ic_fd,
               static_cast<long long>(cmd.ic_di.di_dev),
               static_cast<long long>(cmd.ic_di.di_ino));
    return OK;
}
// ------------------------------------------------------------------------
// Revokes interest in the file identified by the 'dev'/'ino' pair.
//
// A 'revoke' command is sent to the kqueue_monitor thread through pipe_in
// and the call waits for the thread's answer on a per-command pipe.
// 'name' is only used in log messages.
//
// Returns OK when the monitor thread reports success, BAD otherwise.
IMon::Status
IMon::imon_revoke(const char *name, dev_t dev, ino_t ino)
{
    // Construct a command to 'revoke' interest from a file. This will be
    // handled by the kqueue_monitor thread as soon as possible. The
    // structure is zeroed first so no indeterminate bytes go down the pipe.
    struct imon_cmd cmd;
    memset(&cmd, 0, sizeof(cmd));
    cmd.ic_type = IMON_CMD_REVOKE;
    cmd.ic_di.di_dev = dev;
    cmd.ic_di.di_ino = ino;
    cmd.ic_fd = -1;     // a revoke carries no descriptor
    if (pipe(cmd.ic_stat) == -1)
        return BAD;

    // Send the command, retrying if a signal interrupts the call.
    ssize_t n;
    do {
        n = write(pipe_in[1], &cmd, sizeof(cmd));
    } while (n == -1 && errno == EINTR);
    if (n != static_cast<ssize_t>(sizeof(cmd))) {
        Log::perror("write");
        close(cmd.ic_stat[0]); close(cmd.ic_stat[1]);
        return BAD;
    }

    // Wait for a result from the previous operation. 'result' stays false
    // if no answer can be read, so a failed read counts as a failed revoke.
    bool result = false;
    do {
        n = read(cmd.ic_stat[0], &result, sizeof(result));
    } while (n == -1 && errno == EINTR);
    close(cmd.ic_stat[0]); close(cmd.ic_stat[1]);
    if (n != static_cast<ssize_t>(sizeof(result)))
        result = false;

    // dev_t and ino_t vary in width between systems; cast them so the
    // arguments always match the conversions in the format strings.
    const long long log_dev = static_cast<long long>(cmd.ic_di.di_dev);
    const long long log_ino = static_cast<long long>(cmd.ic_di.di_ino);
    if (!result) {
        Log::error("kqueue can't revoke \"%s\", dev = %lld, ino = %lld", name,
                   log_dev, log_ino);
        return BAD;
    }
    Log::debug("told kqueue to forget \"%s\", dev = %lld, ino = %lld", name,
               log_dev, log_ino);
    return OK;
}
// ------------------------------------------------------------------------
static void *
kqueue_monitor(void *data)
{
struct kevent event;
for (;;) {
int nev = kevent(kqueue_fd, changes, last_change, &event, 1, NULL);
if (nev == -1)
Log::perror("kevent");
else if (nev > 0) {
assert(nev == 1);
if (event.flags & EV_ERROR) {
int fd = event.ident;
FDDEVINO_MAP::const_iterator iter = fd_to_devino.find(fd);
assert(iter != fd_to_devino.end());
struct devino di = iter->second;
Log::error("kqueue returned error for fd = %d, dev = %d, "
"ino = %d", fd, di.di_dev, di.di_ino);
// Remove offending entry from the mappings.
assert(devino_to_fd.find(di) != devino_to_fd.end());
devino_to_fd.erase(di);
assert(devino_to_fd.find(di) == devino_to_fd.end());
assert(fd_to_devino.find(fd) != fd_to_devino.end());
fd_to_devino.erase(fd);
assert(fd_to_devino.find(fd) == fd_to_devino.end());
// Remove the entry associated to the descriptor from the list
// of changes monitored by kqueue.
int i;
for (i = 1; i < last_change; i++)
if (changes[i].ident == fd)
break;
for (int j = i; j < last_change - 1; j++)
changes[j] = changes[j + 1];
last_change--;
close(fd);
continue;
}
if (event.ident == pipe_in[0]) {
// We have got a control command, so process it.
process_command();
} else {
// One of the descriptors we are monitoring has got activity.
FDDEVINO_MAP::const_iterator iter =
fd_to_devino.find(event.ident);
if (iter != fd_to_devino.end()) {
qelem_t elem;
// Set device/inode identifier on imon element.
const struct devino &di = (*iter).second;
elem.qe_dev = di.di_dev;
elem.qe_inode = di.di_ino;
// Convert the modification flags reported by kqueue to
// flags understood by imon.
elem.qe_what = 0;
if (event.fflags & NOTE_DELETE)
elem.qe_what |= IMON_DELETE;
if (event.fflags & NOTE_RENAME)
elem.qe_what |= IMON_RENAME;
if (event.fflags & NOTE_ATTRIB or event.fflags & NOTE_LINK)
elem.qe_what |= IMON_ATTRIBUTE;
if (event.fflags & NOTE_WRITE or event.fflags & NOTE_EXTEND)
elem.qe_what |= IMON_CONTENT;
// Deliver the element.
write(pipe_out[1], &elem, sizeof(qelem_t));
} else
Log::error("got an event from an unhandled device/inode "
"pair");
}
}
}
}
// ------------------------------------------------------------------------
static void
process_command(void)
{
bool result = false;
struct imon_cmd cmd;
// Read the command from the control pipe.
read(pipe_in[0], &cmd, sizeof(struct imon_cmd));
if (cmd.ic_type == IMON_CMD_EXPRESS) {
Log::debug("process_command: express, dev = %d, ino = %d",
cmd.ic_di.di_dev, cmd.ic_di.di_ino);
if (devino_to_fd.find(cmd.ic_di) != devino_to_fd.end()) {
// The file is already being monitored.
close(cmd.ic_fd);
result = true;
} else if (fd_to_devino.find(cmd.ic_fd) != fd_to_devino.end()) {
// We can't receive a new interest of a descriptor that is
// already being monitored. If this happens, there is an
// inconsistency in the data somewhere.
assert(false);
} else if (last_change < max_changes) {
// Add the new descriptor to the list of changes to monitor.
// We watch for any change that happens on it.
EV_SET(&changes[last_change], cmd.ic_fd, EVFILT_VNODE,
EV_ADD | EV_ENABLE | EV_ONESHOT,
NOTE_DELETE | NOTE_WRITE | NOTE_EXTEND | NOTE_ATTRIB |
NOTE_LINK | NOTE_RENAME | NOTE_REVOKE,
0, 0);
last_change++;
// Map the device/inode pair to the file descriptor associated
// to it and viceversa. We will need this information during
// 'revoke' and when we receive events. We use two different
// maps to speed up searches in both directions later.
assert(devino_to_fd.find(cmd.ic_di) == devino_to_fd.end());
devino_to_fd.insert
(DEVINOFD_MAP::value_type(cmd.ic_di, cmd.ic_fd));
assert(devino_to_fd.find(cmd.ic_di) != devino_to_fd.end());
assert(fd_to_devino.find(cmd.ic_fd) == fd_to_devino.end());
fd_to_devino.insert
(FDDEVINO_MAP::value_type(cmd.ic_fd, cmd.ic_di));
assert(fd_to_devino.find(cmd.ic_fd) != fd_to_devino.end());
result = true;
}
} else if (cmd.ic_type == IMON_CMD_REVOKE) {
Log::debug("process_command: revoke, dev = %d, ino = %d",
cmd.ic_di.di_dev, cmd.ic_di.di_ino);
DEVINOFD_MAP::const_iterator iter = devino_to_fd.find(cmd.ic_di);
if (iter != devino_to_fd.end()) {
// Get the descriptor associated to the given device/inode pair
// and remove the mapping from the required structure.
int fd = (*iter).second;
assert(devino_to_fd.find(cmd.ic_di) != devino_to_fd.end());
devino_to_fd.erase(cmd.ic_di);
assert(devino_to_fd.find(cmd.ic_di) == devino_to_fd.end());
assert(fd_to_devino.find(fd) != fd_to_devino.end());
fd_to_devino.erase(fd);
assert(fd_to_devino.find(fd) == fd_to_devino.end());
// Remove the entry associated to the descriptor from the list
// of changes monitored by kqueue.
int i;
for (i = 1; i < last_change; i++)
if (changes[i].ident == fd)
break;
for (int j = i; j < last_change - 1; j++)
changes[j] = changes[j + 1];
last_change--;
close(fd);
result = true;
}
} else {
// Huh? Unknown command received.
assert(false);
}
// Deliver the result of the operation.
write(cmd.ic_stat[1], &result, sizeof(bool));
}