diff -Nru pdns-2.9.21.orig/config.h.in pdns-2.9.21/config.h.in --- pdns-2.9.21.orig/config.h.in 2007-04-21 15:56:44.000000000 +0200 +++ pdns-2.9.21/config.h.in 2007-11-05 15:50:50.000000000 +0100 @@ -45,6 +45,9 @@ /* Define to 1 if you have the `select' function. */ #undef HAVE_SELECT +/* Define to 1 if you have the `poll' function. */ +#undef HAVE_POLL + /* Define to 1 if you have the `socket' function. */ #undef HAVE_SOCKET diff -Nru pdns-2.9.21.orig/configure.in pdns-2.9.21/configure.in --- pdns-2.9.21.orig/configure.in 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/configure.in 2007-11-05 15:55:52.000000000 +0100 @@ -30,7 +30,7 @@ dnl Checks for library functions. AC_TYPE_SIGNAL -AC_CHECK_FUNCS(gethostname gettimeofday mkdir mktime select socket strerror) +AC_CHECK_FUNCS(gethostname gettimeofday mkdir mktime select poll socket strerror) # Check for libdl diff -Nru pdns-2.9.21.orig/pdns/Makefile.am pdns-2.9.21/pdns/Makefile.am --- pdns-2.9.21.orig/pdns/Makefile.am 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/Makefile.am 2007-11-06 13:21:37.000000000 +0100 @@ -33,6 +33,7 @@ dnsproxy.hh randombackend.cc unix_utility.cc common_startup.cc \ utility.hh iputils.hh common_startup.hh unix_semaphore.cc \ backends/bind/bindbackend2.cc \ +backends/bind/bind2keepalive.cc backends/bind/socket_helper.cc \ backends/bind/bindparser.cc backends/bind/bindlexer.c \ backends/gsql/gsqlbackend.cc \ backends/gsql/gsqlbackend.hh backends/gsql/ssql.hh \ diff -Nru pdns-2.9.21.orig/pdns/backends/bind/Makefile.am pdns-2.9.21/pdns/backends/bind/Makefile.am --- pdns-2.9.21.orig/pdns/backends/bind/Makefile.am 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/backends/bind/Makefile.am 2007-11-06 16:08:46.000000000 +0100 @@ -4,6 +4,7 @@ libbind2backend_la_SOURCES=bindbackend2.cc bindbackend2.hh bindparser.yy \ bindlexer.l ../../zoneparser-tng.cc ../../misc.cc \ +bind2keepalive.cc bind2keepalive.hh socket_helper.cc socket_helper.hh \ bindparser.hh ../../unix_utility.cc libbind2backend_la_CXXFLAGS=$(AM_CXXFLAGS) diff -Nru pdns-2.9.21.orig/pdns/backends/bind/README.bind2keepalive pdns-2.9.21/pdns/backends/bind/README.bind2keepalive --- pdns-2.9.21.orig/pdns/backends/bind/README.bind2keepalive 1970-01-01 01:00:00.000000000 +0100 +++ pdns-2.9.21/pdns/backends/bind/README.bind2keepalive 2007-11-05 15:56:51.000000000 +0100 @@ -0,0 +1,131 @@ +Keepalive functionality for BindBackend2 of PowerDNS + +Copyright (C) 2006 Daniel Bilik (daniel.bilik@neosystem.cz) + +1. Overview + +The patch extends original BindBackend capabilities in several ways: + +- A records are served not all at once, but one by one + +- A records are "weighted", ie. some can be served more often than + others + +- special TXT records are identified and keepalive data for + associated A record are parsed from them + +- extra thread is being run that periodically walks through parsed + keepalive data and does all the "keepalive work", ie. checks if + an IP is alive and disables/enables it for serving + +2. Examples + +Let have domain.com: + +--- +$TTL 3600 +@ IN SOA ns.domain.com. root.ns.domain.com. ( + 2006032501; + 3600; + 900; + 3600; + 3600; ) + IN NS ns +ns IN A 192.168.1.1 +www IN A 192.168.1.2 +mail IN A 192.168.1.3 +--- + +We have now launched two more servers in a different location and +we make records for them: + +--- +www IN A 192.168.2.2 +mail IN A 192.168.2.3 +--- + +At this point we have classic DNS round-robin mechanism for serving +web and receiving mail. But we want something more... + +We need to drive 2/3 of all the traffic to the first location. Also +we'd like the outages (be it server or network related) to be taken +care of automatically, and not to serve records for unreachable +servers. For this purpose we'll use special TXT record of this syntax: + +"magic IP weight ka_type ka_port ka_freq ka_tout ka_maxfail ka_data" + +magic - must be 'wwrr', used to identify the record +IP - content of an A record to associate with +weight - how many times to serve associated A record +ka_type - method to use for keepalive checks + ('tcp' or 'http', 'icmp' not yet implemented) +ka_port - remote port to use for keepalive checks +ka_freq - how often to do keepalive checks in seconds +ka_tout - timeout in second for each one keepalive check +ka_maxfail - how many times the keepalive check must fail + before associated A record is disabled +ka_data - additional data for use in keepalive check + (for 'tcp' it can be a string to be read from host, + in case of 'http' it can be custom URL to request) + +Fields can be separated either by space, tab or '|'. + +So, in our example we add these records to the zone definition: + +--- +www IN TXT "wwrr 192.168.1.2 2 http 80 30 5 3" +www IN TXT "wwrr 192.168.2.2 1 http 80 30 5 3" +--- + +This tells the nameserver to serve A record for www.domain.com this +way: two times the record with content '192.168.1.2', then once the +second one with content '192.168.2.2', then again two times the first +one and so on. +There is also keepalive info which instructs the keepalive guard to +make HTTP request for "/" on port 80 each 30 seconds, waiting at +most 5 seconds to be finished successfully, and to disable an +A record for serving if there are 3 consecutive failures for the +host. + +Similar we can define keepalive info for our mail servers: + +--- +mail IN TXT "wwrr 192.168.1.3 2 tcp 25 60 5 3 220" +mail IN TXT "wwrr 192.168.2.3 1 tcp 25 60 5 3 220" +--- + +Again we are balancing the traffic 2:1 between locations. Keepalive +guard is instructed to connect to port 25 each minute. Additionaly +it will read data from remote host that should include string "220". +If this does not happen in 5 seconds, host is considered failed. + +When the keepalive configuration is in place, we can enable the +functionality in pdns.conf: + +--- +bind-wwrr=yes +--- + +After restarting nameserver, we should see every keepalive guard's +activity in the log file. +Status overview for all keepalive entries is accessible through +PowerDNS internal webserver by selecting 'bind' next to 'Backend +statistics' on the main page. + +3. Notes + +Be aware that the "weighted round-robin" mechanism of serving records +is influenced by the PowerDNS cache, so it's advised either to set +cache-ttl very low (1 or 2 seconds) or disable it completely. +You will also want to decrease TTL of the A records for the failover +to be effective on the client side as soon as possible. Values of +5 to 10 minutes should work fine. + +4. Bugs + +None I'm aware of. + +5. Plans + +Implement 'icmp' and 'none' (for loadbalancing only) type for keepalive +checks. diff -Nru pdns-2.9.21.orig/pdns/backends/bind/bind2keepalive.cc pdns-2.9.21/pdns/backends/bind/bind2keepalive.cc --- pdns-2.9.21.orig/pdns/backends/bind/bind2keepalive.cc 1970-01-01 01:00:00.000000000 +0100 +++ pdns-2.9.21/pdns/backends/bind/bind2keepalive.cc 2007-11-07 10:20:07.000000000 +0100 @@ -0,0 +1,719 @@ +/* + Keepalive functionality for BindBackend2 of PowerDNS (http://www.powerdns.com/) + + Copyright (C) 2006-2007 Daniel Bilik (daniel.bilik@neosystem.cz) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "dns.hh" +#include "logger.hh" +#include "dnsbackend.hh" +#include "bind2keepalive.hh" +#include "bindbackend2.hh" +#include "socket_helper.hh" + +vector keepalive_target_state; + +void *gthread(void *i) +{ + unsigned int a; + useconds_t time_sleep; + Bind2Backend::Keepalive *me = (Bind2Backend::Keepalive *)i; + + if (pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0) + L << "Keepalive thread becoming candidate for immortal: unable to pthread_setcancelstate()." << endl; + else if (pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL) != 0) + L << "Keepalive thread becoming candidate for immortal: unable to pthread_setcanceltype()." << endl; + + L << "Keepalive thread launched." << endl; + + while (me->status != 9) { + time_sleep = 5; + switch (me->command) { + case 0: /* sleep */ + if (me->status != 0) { + L << "Keepalive thread falling asleep." << endl; + me->status = 0; + } + break; + case 1: /* run */ + /* try locking watch_pool */ + if (pthread_mutex_trylock(&me->watch_pool_lock) != 0) + /* already locked, probably maintenance in progress due to zone reload, skip this round */ + break; + if (me->status != 1) { + L << "Keepalive thread awaking." << endl; + me->status = 1; + } + /* run the timer (clears timedout connections and marks entries to be checked now) */ + me->watch_pool_timer(); + /* check opened sockets for activity (marks entries in watch_pool that can continue in processing) */ + socket_pool_check(1); + /* walk through all entries in watch_pool */ + for (a = 0; a < me->watch_count; a++) { + /* if the entry failed previously but has not reached max fails yet, run next check immediately */ + if ((me->watch_pool[a].fail_count > 0) && (me->watch_pool[a].fail_count < me->watch_pool[a].fail_max)) + if (me->watch_pool[a].state == 0) + me->watch_pool[a].last_check.tv_sec = 0; + /* if the entry is marked to start check or there was socket activity for active check, do what is needed */ + if ((me->watch_pool[a].last_check.tv_sec == 0) || (me->watch_pool[a].target_socket_ready)) + me->watch_pool_check(a); + } + pthread_mutex_unlock(&me->watch_pool_lock); + break; + case 9: /* exit */ + me->status = 9; + L << "Keepalive thread exiting." << endl; + time_sleep = 0; + break; + } + usleep(time_sleep * 1000); + } + + return(NULL); +} + +Bind2Backend::Keepalive::Keepalive(bool active) +{ + int returned = -1; + unsigned int fail_count = 0, check_count = 0; + + /* initialize things */ + watch_pool = NULL; + watch_count = 0; + guard_active = active; + /* if we were instructed not to use keepalive guard, just return here */ + if (!guard_active) + return; + while ((returned != 0) && (fail_count < 3)) { + command = 0; + status = 10; + /* try to launch guard thread */ + if ((returned = pthread_create(&guard_thread, 0, gthread, (void *)this)) != 0) { + /* thread creation failed, we'll try in the next round */ + fail_count++; + usleep(1000); + continue; + } + /* wait for the new thread to show some activity, this is a little hack for rare 'suspended after created' problems */ + check_count = 0; + while ((status > 0) && (check_count < 50)) { + check_count++; + usleep(1000); + } + /* if our thread was unable to do anything... */ + if (status > 0) { + /* ... take it and try again in the next round */ + if (pthread_cancel(guard_thread) == 0) + pthread_join(guard_thread, NULL); + fail_count++; + returned = -1; + usleep(1000); + /* thread seems to be up and running, try to finish the work by initializing mutex */ + } else if ((returned = pthread_mutex_init(&watch_pool_lock, NULL)) != 0) { + command = 9; + pthread_join(guard_thread, NULL); + fail_count++; + usleep(1000); + } + } + if (returned != 0) { + guard_active = 0; + L << "Keepalive guard disabled: unable to initialize new thread." << endl; + } + keepalive_target_state.push_back("dead"); + keepalive_target_state.push_back("alive"); + keepalive_target_state.push_back("testing"); +} + +Bind2Backend::Keepalive::~Keepalive() +{ + if (!guard_active) + return; + /* make sure we can start safely destroying things */ + pthread_mutex_lock(&watch_pool_lock); + /* tell the guard thread to finish */ + command = 9; + /* take it... */ + pthread_join(guard_thread, NULL); + /* ... and clean everything */ + watch_pool_flush(); + pthread_mutex_destroy(&watch_pool_lock); +} + +void +Bind2Backend::Keepalive::initialize(id_zone_map_t &zone_map) +{ + id_zone_map_t::const_iterator a; + vector::iterator b; + + if (!guard_active) + return; + /* we are going to touch watch_pool, so get lock first */ + pthread_mutex_lock(&watch_pool_lock); + + watch_pool_flush(); + + /* walk through all zones, find A records that have additional keepalive info, and add them to watch_pool */ + for(a = zone_map.begin(); a != zone_map.end(); a++) + for (b = a->second.d_records->begin(); b != a->second.d_records->end(); b++) + if ((b->qtype == QType::A) && (b->wwrr.type != 0)) + watch_pool_add(*b, a->second.d_id); + + /* do the (re)sizing of socket_pool */ + socket_pool_init(watch_count); + + pthread_mutex_unlock(&watch_pool_lock); + + L << "Keepalive guard complete initialization: got " << watch_count << " watch entries." << endl; +} + +void +Bind2Backend::Keepalive::initialize(id_zone_map_t &zone_map, unsigned int domain_id) +{ + BB2DomainInfo a = (zone_map.find(domain_id))->second; + vector::iterator b; + unsigned int c = 0; + + if (!guard_active) + return; + /* same as above... */ + pthread_mutex_lock(&watch_pool_lock); + + /* ... but flush only entries matching the given domain_id... */ + while (c < watch_count) + if (watch_pool[c].domain_id == domain_id) + watch_pool_del(c); + else + c++; + + c = 0; + /* ... and walk through records of just this one zone */ + for (b = a.d_records->begin(); b != a.d_records->end(); b++) + if ((b->qtype == QType::A) && (b->wwrr.type != 0)) { + watch_pool_add(*b, domain_id); + c++; + } + + socket_pool_init(watch_count); + + pthread_mutex_unlock(&watch_pool_lock); + + L << "Keepalive guard initialization of domain '" << a.d_name << "': got " << c << " watch entries." << endl; +} + +void +Bind2Backend::Keepalive::suspend() +{ + if (!guard_active) + return; + /* get lock here to be sure the guard is not running the main loop */ + pthread_mutex_lock(&watch_pool_lock); + /* tell the guard to fall asleep */ + command = 0; + pthread_mutex_unlock(&watch_pool_lock); + + L << "Keepalive guard suspended." << endl; +} + +void +Bind2Backend::Keepalive::activate() +{ + if (!guard_active) + return; + /* tell the guard to wake up */ + command = 1; + + L << "Keepalive guard activated." << endl; +} + +void +Bind2Backend::Keepalive::watch_pool_timer() +{ + unsigned int a; + useconds_t b; + struct timeval now; + + /* just to be safe ;-) */ + if (watch_pool == NULL) + return; + + gettimeofday(&now, NULL); + + for (a = 0; a < watch_count; a++) { + /* count the time passed from the last check of the entry */ + b = ((now.tv_sec - watch_pool[a].last_check.tv_sec) * 1000000) + now.tv_usec - watch_pool[a].last_check.tv_usec; + b = b / 1000000; + /* if there is no check running for the entry... */ + if (watch_pool[a].state == 0) { + /* ... and it's reached checktime... */ + if (b > watch_pool[a].frequency) + /* ... mark it for check */ + watch_pool[a].last_check.tv_sec = 0; + /* if there is check running for the entry that has just timedout... */ + } else if (b > watch_pool[a].timeout) { + /* ... record the failure and the time */ + watch_pool_fail(&now, a, ETIMEDOUT); + watch_pool[a].last_check = now; + } + } +} + +void +Bind2Backend::Keepalive::watch_pool_check(unsigned int entry_index) +{ + int a, b, returned; + char socket_state = SOCKET_STATE_WRITE; + struct timeval now; + + if (watch_pool == NULL) + return; + + gettimeofday(&now, NULL); + + if (watch_pool[entry_index].state == 0) + watch_pool[entry_index].last_check = now; + + switch (watch_pool[entry_index].state) { + case 0: /* not connected */ + switch (watch_pool[entry_index].type) { + case KEEPALIVE_TYPE_ICMP: + /* not yet implemented, pretend success */ + watch_pool_success(&now, entry_index); + break; + case KEEPALIVE_TYPE_TCP: + /* if there are data expected to be read() after connect, change desired socket_state */ + if (watch_pool[entry_index].data != NULL) + socket_state = SOCKET_STATE_READ; + case KEEPALIVE_TYPE_HTTP: + /* create new socket and make the connection */ + if ((a = socket_connect_tcp(&watch_pool[entry_index].target_address, sizeof(watch_pool[entry_index].target_address))) == -1) + watch_pool_fail(&now, entry_index, errno); + else { + watch_pool[entry_index].target_socket = a; + /* add new socket to socket_pool to be watched for activity */ + if (socket_pool_add(a, &watch_pool[entry_index].target_socket_ready, socket_state) == -1) + watch_pool_fail(&now, entry_index, EMFILE); + else + watch_pool[entry_index].state = 1; + } + break; + } + break; + case 1: /* connected, ready for read() and/or write() */ + a = watch_pool[entry_index].target_socket; + switch (watch_pool[entry_index].type) { + case KEEPALIVE_TYPE_TCP: + /* check for socket errors */ + if (socket_error(a) < 0) + watch_pool_fail(&now, entry_index, errno); + /* connect() done, no need to read anything from remote, mark it success */ + else if (watch_pool[entry_index].data == NULL) + watch_pool_success(&now, entry_index); + else { + /* read() data from socket to the end of buffer and compare whole buffer with given string... */ + returned = socket_read(a, &watch_pool[entry_index].receive_buffer, &watch_pool[entry_index].receive_len); + switch (returned) { + case 0: + errno = ECONNABORTED; + case -1: + watch_pool_fail(&now, entry_index, errno); + break; + default: + if (watch_pool[entry_index].receive_buffer != NULL) + /* ... and in case of match, mark it success */ + if (strstr(watch_pool[entry_index].receive_buffer, watch_pool[entry_index].data) != NULL) + watch_pool_success(&now, entry_index); + } + } + break; + case KEEPALIVE_TYPE_HTTP: + /* check for socket errors */ + if (socket_error(a) < 0) + watch_pool_fail(&now, entry_index, errno); + /* send simplified HTTP request... */ + else if ((b = http_request(a, watch_pool[entry_index].data)) == -1) + watch_pool_fail(&now, entry_index, errno); + else if (b == 1) { + /* ... and in case of success remove socket from socket_pool... */ + socket_pool_del(a); + /* ... and re-insert it with changed desired socket_state */ + if (socket_pool_add(a, &watch_pool[entry_index].target_socket_ready, SOCKET_STATE_READ) == -1) + watch_pool_fail(&now, entry_index, EMFILE); + else + watch_pool[entry_index].state = 2; + } + break; + } + break; + case 2: /* request sent, response data ready on the socket for read() */ + a = watch_pool[entry_index].target_socket; + switch (watch_pool[entry_index].type) { + case KEEPALIVE_TYPE_HTTP: + /* check for socket errors */ + if (socket_error(a) < 0) + watch_pool_fail(&now, entry_index, errno); + else { + /* read() data from socket to the end of buffer... */ + returned = socket_read(a, &watch_pool[entry_index].receive_buffer, &watch_pool[entry_index].receive_len); + switch (returned) { + case 0: + errno = ECONNABORTED; + case -1: + watch_pool_fail(&now, entry_index, errno); + break; + default: + /* ... parse HTTP response in buffer, it returns 0 in case the response is not complete yet... */ + if ((b = http_response(watch_pool[entry_index].receive_buffer)) == -1) + /* ... -1 in case of HTTP protocol error... */ + watch_pool_fail(&now, entry_index, b); + /* or received HTTP response code */ + else if ((b > 0) && (b < 400)) { + watch_pool[entry_index].check_http_code = b; + watch_pool_success(&now, entry_index); + } else if (b >= 400) { + watch_pool[entry_index].check_http_code = b; + watch_pool_fail(&now, entry_index, 0); + } + } + } + break; + } + break; + } + + if (watch_pool[entry_index].state == 0) + watch_pool[entry_index].last_check = now; +} + +void +Bind2Backend::Keepalive::watch_pool_flush() +{ + unsigned int a; + + if (watch_pool == NULL) + return; + + /* walk through the watch_pool... */ + for (a = 0; a < watch_count; a++) { + /* ... free any allocated buffers... */ + if (watch_pool[a].receive_buffer != NULL) + free(watch_pool[a].receive_buffer); + /* ... and close any opened sockets */ + if (watch_pool[a].target_socket != -1) { + socket_pool_del(watch_pool[a].target_socket); + shutdown(watch_pool[a].target_socket, SHUT_RDWR); + close(watch_pool[a].target_socket); + } + } + + /* free the rest of allocated memory and reset variables */ + free(watch_pool); + watch_pool = NULL; + watch_count = 0; +} + +void +Bind2Backend::Keepalive::watch_pool_add(Bind2DNSRecord &record, unsigned int domain_id) +{ + watch_pool = (watch_entry_t *)realloc(watch_pool, ((watch_count + 1) * sizeof(watch_entry_t))); + + if (watch_pool == NULL) { + watch_count = 0; + return; + } + + /* take values found in given Bind2DNSRecord and assign them to next free entry in watch_pool */ + watch_pool[watch_count].domain_id = domain_id; + watch_pool[watch_count].state = 0; + watch_pool[watch_count].lever = &record.alive; + inet_pton(AF_INET, record.content.c_str(), &watch_pool[watch_count].target_address.sin_addr); + watch_pool[watch_count].target_address.sin_family = AF_INET; + watch_pool[watch_count].target_address.sin_port = htons(record.wwrr.port); + watch_pool[watch_count].target_socket = -1; + watch_pool[watch_count].target_socket_ready = 0; + watch_pool[watch_count].type = record.wwrr.type; + watch_pool[watch_count].frequency = record.wwrr.frequency; + watch_pool[watch_count].timeout = record.wwrr.timeout; + watch_pool[watch_count].fail_max = record.wwrr.maxfailures; + watch_pool[watch_count].fail_count = 0; + watch_pool[watch_count].last_check.tv_sec = 0; + watch_pool[watch_count].check_duration = 0; + watch_pool[watch_count].check_error = 0; + watch_pool[watch_count].check_http_code = 0; + if (record.wwrr.data == "") + watch_pool[watch_count].data = NULL; + else + watch_pool[watch_count].data = record.wwrr.data.c_str(); + watch_pool[watch_count].receive_buffer = NULL; + watch_pool[watch_count].receive_len = 0; + + watch_count++; +} + +void +Bind2Backend::Keepalive::watch_pool_del(unsigned int entry_index) +{ + if (watch_pool == NULL) + return; + + /* free any allocated buffer */ + if (watch_pool[entry_index].receive_buffer != NULL) + free(watch_pool[entry_index].receive_buffer); + + /* close any opened socket */ + if (watch_pool[entry_index].target_socket != -1) { + socket_pool_del(watch_pool[entry_index].target_socket); + shutdown(watch_pool[entry_index].target_socket, SHUT_RDWR); + close(watch_pool[entry_index].target_socket); + } + + watch_count--; + + /* if the deleted entry was not the last one in watch_pool... */ + if (entry_index < watch_count) { + /* move the last one to its place... */ + memcpy(&watch_pool[entry_index], &watch_pool[watch_count], sizeof(watch_entry_t)); + /* ... and if it has opened socket... */ + if (watch_pool[entry_index].target_socket != -1) + /* update socket_pool entry accordingly */ + socket_pool_update(watch_pool[entry_index].target_socket, &watch_pool[entry_index].target_socket_ready); + } +} + +void +Bind2Backend::Keepalive::watch_pool_success(struct timeval *now, unsigned int entry_index) +{ + char ip[16]; + + /* free any allocated buffer */ + if (watch_pool[entry_index].receive_buffer != NULL) { + free(watch_pool[entry_index].receive_buffer); + watch_pool[entry_index].receive_buffer = NULL; + watch_pool[entry_index].receive_len = 0; + } + + /* close opened socket if any */ + if (watch_pool[entry_index].target_socket != -1) { + socket_pool_del(watch_pool[entry_index].target_socket); + shutdown(watch_pool[entry_index].target_socket, SHUT_RDWR); + close(watch_pool[entry_index].target_socket); + watch_pool[entry_index].target_socket = -1; + } + + /* if the associated record was previsously disabled because of failures... */ + if (*watch_pool[entry_index].lever == 0) { + memset(&ip, 0, 16); + inet_ntop(AF_INET, &watch_pool[entry_index].target_address.sin_addr, ip, 15); + /* ... report the fact it is enabled again */ + L << "Keepalive guard: " << ip << " enabled." << endl; + } + + /* statistics */ + watch_pool[entry_index].check_duration = ((now->tv_sec - watch_pool[entry_index].last_check.tv_sec) * 1000000) + now->tv_usec - watch_pool[entry_index].last_check.tv_usec; + watch_pool[entry_index].check_error = 0; + + /* mark associated record enabled and reset counters */ + *watch_pool[entry_index].lever = 1; + watch_pool[entry_index].fail_count = 0; + + watch_pool[entry_index].state = 0; +} + +void +Bind2Backend::Keepalive::watch_pool_fail(struct timeval *now, unsigned int entry_index, int error_number) +{ + char ip[16]; + uint16_t port; + + /* free any allocated buffer */ + if (watch_pool[entry_index].receive_buffer != NULL) { + free(watch_pool[entry_index].receive_buffer); + watch_pool[entry_index].receive_buffer = NULL; + watch_pool[entry_index].receive_len = 0; + } + + /* close opened socket if any */ + if (watch_pool[entry_index].target_socket != -1) { + socket_pool_del(watch_pool[entry_index].target_socket); + shutdown(watch_pool[entry_index].target_socket, SHUT_RDWR); + close(watch_pool[entry_index].target_socket); + watch_pool[entry_index].target_socket = -1; + } + + /* if entry has not reached maximum failures yet... */ + if (watch_pool[entry_index].fail_count < watch_pool[entry_index].fail_max) { + /* ... increase the counter... */ + watch_pool[entry_index].fail_count++; + memset(&ip, 0, 16); + inet_ntop(AF_INET, &watch_pool[entry_index].target_address.sin_addr, ip, 15); + port = ntohs(watch_pool[entry_index].target_address.sin_port); + /* ... report it... */ + L << "Keepalive guard: " << ip << ":" << port << " failed (" << + watch_pool[entry_index].fail_count << " of " << watch_pool[entry_index].fail_max << ")." << endl; + /* ... and if it was the last step to reach maximum failures... */ + if (watch_pool[entry_index].fail_count == watch_pool[entry_index].fail_max) { + /* ... report the fact the record is disabled... */ + L << "Keepalive guard: " << ip << " disabled." << endl; + /* ... and mark the record disabled */ + *watch_pool[entry_index].lever = 0; + } + } + + /* statistics */ + watch_pool[entry_index].check_duration = ((now->tv_sec - watch_pool[entry_index].last_check.tv_sec) * 1000000) + now->tv_usec - watch_pool[entry_index].last_check.tv_usec; + watch_pool[entry_index].check_error = error_number; + + watch_pool[entry_index].state = 0; +} + +string +Bind2Backend::Keepalive::stats(const map &varmap) +{ + unsigned int a, b, c; + uint32_t host_a = 0, host_b = 0; + char ip[16]; + uint16_t port; + struct tm last_check; + char check_time[20]; + char check_detail[60]; + double check_duration; + unsigned int status_watch_count = 0; + watch_entry_t *status_watch_pool = NULL; + int *watch_pool_index = NULL; + int watch_pool_sort = 0; + map page_params = varmap; + ostringstream returned; + + /* if there are any entries we can gather statistics for... */ + if (watch_count > 0) { + /* ... grab the lock to make sure that somebody else is not touching the pool */ + pthread_mutex_lock(&watch_pool_lock); + /* allocate required amount of memory for local copy of the pool and its index array */ + status_watch_count = watch_count; + status_watch_pool = (watch_entry_t *)malloc(sizeof(watch_entry_t) * status_watch_count); + watch_pool_index = (int *)malloc(sizeof(int) * status_watch_count); + /* if we were not given needed memory... */ + if ((status_watch_pool == NULL) || (watch_pool_index == NULL)) { + /* ... unlock the pool... */ + pthread_mutex_unlock(&watch_pool_lock); + /* ... report the failure instead of statistics... */ + returned << "Keepalive guard statistics could not be gathered: unable to allocate memory." << endl; + /* ... free any memory we got so far... */ + if (status_watch_pool != NULL) + free(status_watch_pool); + if (watch_pool_index != NULL) + free(watch_pool_index); + /* ... and return the report */ + return returned.str(); + } + /* got all needed memory so make a local copy of the pool... */ + memcpy(status_watch_pool, watch_pool, (sizeof(watch_entry_t) * status_watch_count)); + + /* ... dereference pointers now to avoid races... */ + for (a = 0; a < status_watch_count; a++) + if (status_watch_pool[a].state > 0) + status_watch_pool[a].state = 2; + else + status_watch_pool[a].state = *status_watch_pool[a].lever; + + /* ... and release the lock */ + pthread_mutex_unlock(&watch_pool_lock); + } + + returned << "" << endl; + returned << "" << endl; + returned << ""; + returned << ""; + returned << ""; + returned << ""; + returned << ""; + returned << "" << endl; + + /* check if sorting is desired */ + if (page_params["sort"] == "address") + watch_pool_sort = 1; + if (page_params["sort"] == "state") + watch_pool_sort = 2; + + /* initialize index */ + for (a = 0; a < status_watch_count; a++) + watch_pool_index[a] = a; + + /* sort entries as requested */ + if (watch_pool_sort > 0) + for (a = 0; a < status_watch_count; a++) + for (b = a; b < status_watch_count; b++) { + switch (watch_pool_sort) { + case 1: + host_a = ntohl(*(uint32_t *)&status_watch_pool[watch_pool_index[a]].target_address.sin_addr); + host_b = ntohl(*(uint32_t *)&status_watch_pool[watch_pool_index[b]].target_address.sin_addr); + break; + case 2: + host_a = (uint32_t)status_watch_pool[watch_pool_index[a]].state; + host_b = (uint32_t)status_watch_pool[watch_pool_index[b]].state; + break; + } + if (host_b < host_a) { + c = watch_pool_index[b]; + watch_pool_index[b] = watch_pool_index[a]; + watch_pool_index[a] = c; + } + } + + /* form entries info and add it to the statistics table */ + for (a = 0; a < status_watch_count; a++) { + c = watch_pool_index[a]; + inet_ntop(AF_INET, &status_watch_pool[c].target_address.sin_addr, ip, 15); + port = ntohs(status_watch_pool[c].target_address.sin_port); + b = (unsigned int)status_watch_pool[c].state; + if (b == 2) + sprintf(check_time, "now"); + else { + localtime_r((time_t *)&status_watch_pool[c].last_check.tv_sec, &last_check); + strftime(check_time, 20, "%H:%M:%S %d-%m-%Y", &last_check); + } + returned << ""; + returned << ""; + returned << ""; + returned << ""; + returned << ""; + returned << "" << endl; + } + + /* release all allocated memory */ + if (status_watch_pool != NULL) + free(status_watch_pool); + if (watch_pool_index != NULL) + free(watch_pool_index); + + /* and return it */ + return returned.str(); +} diff -Nru pdns-2.9.21.orig/pdns/backends/bind/bind2keepalive.hh pdns-2.9.21/pdns/backends/bind/bind2keepalive.hh --- pdns-2.9.21.orig/pdns/backends/bind/bind2keepalive.hh 1970-01-01 01:00:00.000000000 +0100 +++ pdns-2.9.21/pdns/backends/bind/bind2keepalive.hh 2007-11-05 15:57:22.000000000 +0100 @@ -0,0 +1,44 @@ +/* + Keepalive functionality for BindBackend2 of PowerDNS (http://www.powerdns.com/) + + Copyright (C) 2006 Daniel Bilik (daniel.bilik@neosystem.cz) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#define KEEPALIVE_TYPE_ICMP 1 +#define KEEPALIVE_TYPE_TCP 2 +#define KEEPALIVE_TYPE_HTTP 3 + +typedef struct { + unsigned int domain_id; + uint8_t state; + uint8_t *lever; + struct sockaddr_in target_address; + int target_socket; + char target_socket_ready; + uint8_t type; + uint16_t frequency; + uint8_t timeout; + uint8_t fail_max; + char fail_count; + struct timeval last_check; + useconds_t check_duration; + int check_error; + int check_http_code; + /* 0 = tested, 1 = connecting, 2 = sending, 3 = receiving */ + const char *data; + char *receive_buffer; + unsigned int receive_len; +} watch_entry_t; diff -Nru pdns-2.9.21.orig/pdns/backends/bind/bindbackend2.cc pdns-2.9.21/pdns/backends/bind/bindbackend2.cc --- pdns-2.9.21.orig/pdns/backends/bind/bindbackend2.cc 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/backends/bind/bindbackend2.cc 2007-11-08 11:38:23.000000000 +0100 @@ -32,6 +32,7 @@ #include "dns.hh" #include "dnsbackend.hh" +#include "bind2keepalive.hh" #include "bindbackend2.hh" #include "dnspacket.hh" #include "zoneparser-tng.hh" @@ -42,6 +43,8 @@ #include "misc.hh" #include "dynlistener.hh" #include "lock.hh" +#include "webserver.hh" +#include "ws.hh" using namespace std; /** new scheme of things: @@ -126,6 +129,64 @@ d_ctime=buf.st_ctime; } +uint8_t BB2DomainInfo::WWRRType(const string &s) +{ + /* parse given string to find known keywords and return according values or zero */ + switch (s[0]) { + case 'i': + if (strcmp(s.c_str(), "icmp") == 0) + return KEEPALIVE_TYPE_ICMP; + break; + case 't': + if (strcmp(s.c_str(), "tcp") == 0) + return KEEPALIVE_TYPE_TCP; + break; + case 'h': + if (strcmp(s.c_str(), "http") == 0) + return KEEPALIVE_TYPE_HTTP; + break; + } + return 0; +} + +void BB2DomainInfo::WWRRPair() +{ + vector &records = *d_records; + vector keepalive_data; + vector::iterator i, j; + + /* walk through all records of the zone */ + for (j = records.begin(); j != records.end(); ++j) + /* if TXT record is found... */ + if (j->qtype == QType::TXT) { + keepalive_data.clear(); + /* ... parse it... */ + stringtok(keepalive_data, j->content, " ,\t\""); + /* ... and if it seems to be 'watched and weighted round robin' information... */ + if (strcmp(keepalive_data[0].c_str(), "wwrr") || (keepalive_data.size() < 8)) + continue; + /* ... try to find according A record */ + for (i = records.begin(); i != records.end(); ++i) + /* if one is found... */ + if (i->qtype == QType::A) + /* ... see if it can be associated... */ + if ((i->qname == j->qname) && (i->content == keepalive_data[1])) { + /* ... and fill in data parsed from TXT record */ + i->weight = atoi(keepalive_data[2].c_str()); + i->wwrr.type = WWRRType(keepalive_data[3]); + i->wwrr.port = atoi(keepalive_data[4].c_str()); + i->wwrr.frequency = atoi(keepalive_data[5].c_str()); + i->wwrr.timeout = atoi(keepalive_data[6].c_str()); + i->wwrr.maxfailures = atoi(keepalive_data[7].c_str()); + if (keepalive_data.size() > 8) + i->wwrr.data = keepalive_data[8]; + else + i->wwrr.data = ""; + break; + } + } +} + void Bind2Backend::setNotified(uint32_t id, uint32_t serial) { Lock l(&s_state_lock); @@ -138,7 +199,7 @@ s_state->id_zone_map[domain_id].d_lastcheck=time(0); } -bool Bind2Backend::startTransaction(const string &qname, int id) +bool Bind2Backend::startTransaction(const string &qname, int id, const string &master) { shared_ptr state = s_state; // is only read from @@ -154,7 +215,7 @@ } *d_of<<"; Written by PowerDNS, don't edit!"< contents; +static Bind2Backend::Keepalive *keepalive_guard; +static bool keepalive_guard_used; + /** THIS IS AN INTERNAL FUNCTION! It does moadnsparser prio impedence matching This function adds a record to a domain with a certain id. Much of the complication is due to the efforts to benefit from std::string reference counting copy on write semantics */ @@ -373,6 +437,18 @@ bdr.ttl=ttl; bdr.priority=prio; + /* if an A record is to be inserted... */ + if (bdr.qtype == QType::A) { + /* ... fill in default 'watched and weighted round-robin' values... */ + bdr.weight = 1; + bdr.token = 1; + bdr.alive = 1; + bdr.wwrr.type = 0; + /* ... and create lock for it */ + if (pthread_mutex_init(&bdr.token_lock, NULL) != 0) + throw AhuException("Unable to pthread_mutex_init(), name='"+bdr.qname+"', zone='"+s_state->id_zone_map[id].d_name+"'"); + } + records.push_back(bdr); } @@ -457,6 +533,15 @@ } s_first=0; s_state = shared_ptr(new State); + + /* launch keepalive guard */ + keepalive_guard_used = mustDo("wwrr"); + keepalive_guard = new Keepalive(keepalive_guard_used); + + extern StatWebServer sws; + string status_page = "bind"; + sws.registerPage(status_page, &status); + loadConfig(); extern DynListener *dl; @@ -623,8 +708,10 @@ for(id_zone_map_t::const_iterator j=s_state->id_zone_map.begin();j != s_state->id_zone_map.end();++j) { oldnames.insert(j->second.d_name); } - for(id_zone_map_t::const_iterator j=staging->id_zone_map.begin(); j!= staging->id_zone_map.end(); ++j) { + for(id_zone_map_t::iterator j=staging->id_zone_map.begin(); j!= staging->id_zone_map.end(); ++j) { newnames.insert(j->second.d_name); + /* for new zones do pairing of 'watched and weighted round-robin' TXT records with according A records */ + j->second.WWRRPair(); } vector diff; @@ -657,8 +744,13 @@ set_difference(newnames.begin(), newnames.end(), oldnames.begin(), oldnames.end(), back_inserter(diff2)); newdomains=diff2.size(); + keepalive_guard->suspend(); + s_state.swap(staging); // and boy do we hope this is a threadsafe operation! + keepalive_guard->initialize(s_state->id_zone_map); + keepalive_guard->activate(); + // report ostringstream msg; msg<<" Done parsing domains, "<d_loaded=0; // block further access + + /* walk through all the records... */ + for (vector::iterator j = bbd->d_records->begin(); j != bbd->d_records->end(); ++j) + /* ... to find any A records... */ + if (j->qtype == QType::A) { + /* ... because we must destroy their locks first */ + pthread_mutex_lock(&j->token_lock); + pthread_mutex_destroy(&j->token_lock); + } + bbd->d_records = shared_ptr > (new vector); } @@ -685,6 +787,8 @@ // we reload *now* for the time being + keepalive_guard->suspend(); + try { nukeZoneRecords(bbd); // ? do we need this? staging->id_zone_map[bbd->d_id]=s_state->id_zone_map[bbd->d_id]; @@ -701,8 +805,13 @@ contents.clear(); + /* pair the 'WWRR' TXT and A records for the parsed zone */ + staging->id_zone_map[bbd->d_id].WWRRPair(); + s_state->id_zone_map[bbd->d_id]=staging->id_zone_map[bbd->d_id]; // move over + keepalive_guard->initialize(s_state->id_zone_map, bbd->d_id); + bbd->setCtime(); // and raise d_loaded again! bbd->d_loaded=1; @@ -720,6 +829,8 @@ msg<<" error at "+nowTime()+" parsing '"<d_name<<"' from file '"<d_filename<<"': "<d_status=msg.str(); } + + keepalive_guard->activate(); } bool operator<(const Bind2DNSRecord &a, const string &b) @@ -787,23 +898,15 @@ if(d_handle.d_records->empty()) DLOG(L<<"Query with no results"<::const_iterator, vector::const_iterator> range; + pair::iterator, vector::iterator> range; - // cout<<"starting equal range for: '"<begin(), d_handle.d_records->end(), lname); - if(range.first==range.second) { - d_handle.d_list=false; - return; - } - else { - d_handle.d_iter=range.first; - d_handle.d_end_iter=range.second; - d_handle.mustlog = mustlog; - - } + d_handle.d_iter=d_handle.d_start_iter=range.first; + d_handle.d_end_iter=range.second; d_handle.d_list=false; } @@ -841,20 +944,57 @@ bool Bind2Backend::handle::get_normal(DNSResourceRecord &r) { - DLOG(L << "Bind2Backend get() was called for "<size()<<" available!"<::iterator d_actual_iter; + + DLOG(L << "Bind2Backend get() was called for "<size()<<" available!"<qtype == QType::A) + ; + /* the record matches the query type */ + else if (qtype.getCode() == QType::ANY || qtype.getCode() == d_iter->qtype) { + /* if it's an A record... */ + if (d_iter->qtype == QType::A) { + /* check if it has the token to be served */ + if (d_iter->token) + break; + } else + break; + } + DLOG(L<qtype).getName()<<": '"<content<<"'"<qtype==qtype.getCode())) { - DLOG(L<qtype).getName()<<": '"<content<<"'"<qtype).getCode()<qtype; r.ttl=(d_iter)->ttl; r.priority=(d_iter)->priority; - d_iter++; + + /* if keepalive guard is active and an 'WWRR' A record is to be returned, we must update token */ + if (keepalive_guard_used && d_iter->qtype == QType::A && d_iter->wwrr.type != 0) { + /* weight of the record not reached yet... */ + if (d_iter->token < d_iter->weight) + /* ... so just increase token... */ + d_iter->token++; + /* ... otherwise token needs to be passed to next A record... */ + else if (pthread_mutex_trylock(&d_iter->token_lock) == 0) { + /* lock acquired, nobody else is doing this dirty job, so it's up to us */ + d_actual_iter = d_iter; + d_iter++; + for (;;) + if (d_iter == d_end_iter) + d_iter = d_start_iter; + else { + /* find next A record marked enabled (alive) that has non-zero weight, + if we end up on an A record that was just served, leave the token on it and continue */ + if ((d_iter == d_actual_iter) || ((d_iter->qtype == QType::A) && d_iter->alive && d_iter->weight)) { + d_actual_iter->token = 0; + d_iter->token = 1; + break; + } + d_iter++; + } + pthread_mutex_unlock(&d_actual_iter->token_lock); + } + d_iter = d_end_iter; + } else + d_iter++; return true; } @@ -999,6 +1168,25 @@ return true; } +string Bind2Backend::status(const map &varmap, void *ptr, bool *custom) +{ + ostringstream returned; + + returned << "PowerDNS Operational Monitor" << endl; + + /* make the header of the statistics document */ + returned << "

Status overview for BindBackend2

" << endl; + + if (keepalive_guard_used) + returned << keepalive_guard->stats(varmap); + + /* close document */ + returned << "" << endl; + + /* and return it */ + return returned.str(); +} + class Bind2Factory : public BackendFactory { public: @@ -1012,6 +1200,7 @@ declare(suffix,"supermaster-config","Location of (part of) named.conf where pdns can write zone-statements to",""); declare(suffix,"supermasters","List of IP-addresses of supermasters",""); declare(suffix,"supermaster-destdir","Destination directory for newly added slave zones",::arg()["config-dir"]); + declare(suffix,"wwrr","Enable loadbalancing and keepalive functionality","no"); } DNSBackend *make(const string &suffix="") diff -Nru pdns-2.9.21.orig/pdns/backends/bind/bindbackend2.hh pdns-2.9.21/pdns/backends/bind/bindbackend2.hh --- pdns-2.9.21.orig/pdns/backends/bind/bindbackend2.hh 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/backends/bind/bindbackend2.hh 2007-11-06 15:57:34.000000000 +0100 @@ -33,6 +33,16 @@ using namespace std; using namespace boost; +struct Bind2WWRR +{ + uint8_t type; + uint16_t port; + uint16_t frequency; + uint8_t timeout; + uint8_t maxfailures; + string data; +}; + /** This struct is used within the Bind2Backend to store DNS information. It is almost identical to a DNSResourceRecord, but then a bit smaller and with different sorting rules, which make sure that the SOA record comes up front. */ @@ -43,6 +53,12 @@ string content; uint16_t qtype; uint16_t priority; + uint16_t weight; + uint16_t token; + pthread_mutex_t token_lock; + uint8_t alive; + Bind2WWRR wwrr; + bool operator<(const Bind2DNSRecord& rhs) const { if(qname < rhs.qname) @@ -66,6 +82,8 @@ bool current(); + void WWRRPair(); + bool d_loaded; //!< if a domain is loaded string d_status; //!< message describing status of a domain, for human consumtpion bool d_checknow; //!< if this domain has been flagged for a check @@ -85,6 +103,7 @@ private: time_t getCtime(); time_t d_checkinterval; + uint8_t WWRRType(const string &s); }; class Bind2Backend : public DNSBackend @@ -106,7 +125,7 @@ void setFresh(uint32_t domain_id); void setNotified(uint32_t id, uint32_t serial); - bool startTransaction(const string &qname, int id); + bool startTransaction(const string &qname, int id, const string &master); // bool Bind2Backend::stopTransaction(const string &qname, int id); bool feedRecord(const DNSResourceRecord &r); bool commitTransaction(); @@ -129,7 +148,34 @@ // for supermaster support bool superMasterBackend(const string &ip, const string &domain, const vector&nsset, string *account, DNSBackend **db); bool createSlaveDomain(const string &ip, const string &domain, const string &account); - + + static string status(const map &varmap, void *ptr, bool *custom); + + class Keepalive { + public: + Keepalive(bool active); + ~Keepalive(); + void initialize(id_zone_map_t &zone_map); + void initialize(id_zone_map_t &zone_map, unsigned int domain_id); + void suspend(); + void activate(); + string stats(const map &varmap); + void watch_pool_timer(); + void watch_pool_check(unsigned int entry_index); + char command, status; + unsigned int watch_count; + watch_entry_t *watch_pool; + pthread_mutex_t watch_pool_lock; + private: + void watch_pool_flush(); + void watch_pool_add(Bind2DNSRecord &record, unsigned int domain_id); + void watch_pool_del(unsigned int entry_index); + void watch_pool_success(struct timeval *now, unsigned int entry_index); + void watch_pool_fail(struct timeval *now, unsigned int entry_index, int error_number); + bool guard_active; + pthread_t guard_thread; + }; + private: class handle { @@ -145,7 +191,7 @@ handle(); shared_ptr > d_records; - vector::const_iterator d_iter, d_end_iter; + vector::iterator d_iter, d_start_iter, d_end_iter; vector::const_iterator d_qname_iter; vector::const_iterator d_qname_end; diff -Nru pdns-2.9.21.orig/pdns/backends/bind/socket_helper.cc pdns-2.9.21/pdns/backends/bind/socket_helper.cc --- pdns-2.9.21.orig/pdns/backends/bind/socket_helper.cc 1970-01-01 01:00:00.000000000 +0100 +++ pdns-2.9.21/pdns/backends/bind/socket_helper.cc 2007-11-06 11:46:35.000000000 +0100 @@ -0,0 +1,610 @@ +/* + Helper routines for multiple sockets manipulation + inspired by fd watcher seen in thttpd webserver (http://www.acme.com/software/thttpd/) + + Copyright (C) 2006 Daniel Bilik (daniel.bilik@neosystem.cz) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "socket_helper.hh" + +#ifdef HAVE_KQUEUE + +#include + +#define INIT(c) kqueue_init(c) +#define ADD(f, s) kqueue_add(f, s) +#define DEL(i) kqueue_del(i) +#define CHECK(t) kqueue_check(t) +#define GET() kqueue_get() + +int kqueue_init(unsigned int entry_count); +void kqueue_add(int target_socket, char state); +void kqueue_del(unsigned int entry_index); +int kqueue_check(useconds_t timeout); +unsigned int kqueue_get(void); + +#else + +#ifdef HAVE_POLL + +#include + +#define INIT(c) poll_init(c) +#define ADD(f, s) poll_add(f, s) +#define DEL(i) poll_del(i) +#define CHECK(t) poll_check(t) +#define GET() poll_get() + +int poll_init(unsigned int entry_count); +void poll_add(int target_socket, char state); +void poll_del(unsigned int entry_index); +int poll_check(int timeout); +unsigned int poll_get(void); + +#else + +#ifdef HAVE_SELECT + +#ifndef FD_SET +#define NFDBITS 32 +#define FD_SETSIZE 32 +#define FD_SET(n, p) ((p)->fds_bits[(n)/NFDBITS] |= (1 << ((n) % NFDBITS))) +#define FD_CLR(n, p) ((p)->fds_bits[(n)/NFDBITS] &= ~(1 << ((n) % NFDBITS))) +#define FD_ISSET(n, p) ((p)->fds_bits[(n)/NFDBITS] & (1 << ((n) % NFDBITS))) +#define FD_ZERO(p) bzero((char*)(p), sizeof(*(p))) +#endif + +#define INIT(c) select_init(c) +#define ADD(f, s) select_add(f, s) +#define DEL(i) select_del(i) +#define CHECK(t) select_check(t) +#define GET() select_get() + +int select_init(unsigned int entry_count); +void select_add(int target_socket, char state); +void select_del(unsigned int entry_index); +int select_check(useconds_t timeout); +unsigned int select_get(void); + +#endif + +#endif + +#endif + +typedef struct { + int target_socket; + char *target_socket_ready; +} socket_entry_t; + +static socket_entry_t *socket_pool = NULL; +static unsigned int socket_pool_size = 0, socket_watch_count = 0; + +int +socket_pool_init(unsigned int entry_count) +{ + unsigned int a = socket_pool_size; + int returned; + + while (a < entry_count) + a += 64; + + if (socket_pool_size < a) { + socket_pool = (socket_entry_t *)realloc(socket_pool, (sizeof(socket_entry_t) * a)); + returned = INIT(a); + if ((socket_pool == NULL) || (returned < 0)) { + socket_pool_size = 0; + socket_watch_count = 0; + return(-1); + } + socket_pool_size = a; + } + + return(0); +} + +int +socket_pool_add(int target_socket, char *target_socket_ready, char state) +{ + if (socket_watch_count == socket_pool_size) + return(-1); + *target_socket_ready = 0; + socket_pool[socket_watch_count].target_socket = target_socket; + socket_pool[socket_watch_count].target_socket_ready = target_socket_ready; + ADD(target_socket, state); + socket_watch_count++; + return(0); +} + +void +socket_pool_del(int target_socket) +{ + unsigned int a = 0; + + while (a < socket_watch_count) + if (socket_pool[a].target_socket != target_socket) + a++; + else { + socket_watch_count--; + *socket_pool[a].target_socket_ready = 0; + if (a != socket_watch_count) + memcpy(&socket_pool[a], &socket_pool[socket_watch_count], sizeof(socket_entry_t)); + DEL(a); + } +} + +void +socket_pool_update(int target_socket, char *target_socket_ready) +{ + unsigned int a; + + for (a = 0; a < socket_watch_count; a++) + if (socket_pool[a].target_socket == target_socket) { + socket_pool[a].target_socket_ready = target_socket_ready; + break; + } +} + +int +socket_pool_check(useconds_t timeout) +{ + int a; + unsigned int b; + + if ((a = CHECK(timeout)) > 0) + while ((b = GET()) < socket_watch_count) + *socket_pool[b].target_socket_ready = 1; + + return(a); +} + +#ifdef HAVE_KQUEUE + +static int kqueue_fd = -1; +static unsigned int *kqueue_fd_pool = NULL; +static struct kevent *kqueue_event_set = NULL; +static struct kevent *kqueue_event_get = NULL; +static unsigned int kqueue_event_max = 0, kqueue_event_count = 0; +static unsigned int kqueue_event_returned = 0, kqueue_event_index = 0; + +int +kqueue_init(unsigned int entry_count) +{ + struct kevent *a; + unsigned int b = entry_count * 2; + + if (kqueue_fd == -1) + if ((kqueue_fd = kqueue()) == -1) + return(-1); + + if ((kqueue_fd_pool = (unsigned int *)realloc(kqueue_fd_pool, (sizeof(unsigned int) * entry_count))) == NULL) + return(-2); + + if ((kqueue_event_get = (struct kevent *)realloc(kqueue_event_get, (sizeof(struct kevent) * entry_count))) == NULL) + return(-3); + + a = (struct kevent *)malloc(sizeof(struct kevent) * b); + + if (a == NULL) + return(-4); + + memset(a, 0, (sizeof(struct kevent) * b)); + + if (kqueue_event_set != NULL) { + memcpy(a, kqueue_event_set, (sizeof(struct kevent) * socket_pool_size)); + free(kqueue_event_set); + } + + kqueue_event_set = a; + kqueue_event_max = b; + + return(0); +} + +void +kqueue_add(int target_socket, char state) +{ + if (kqueue_event_count == kqueue_event_max) + return; + + kqueue_fd_pool[socket_watch_count] = target_socket; + + kqueue_event_set[kqueue_event_count].ident = target_socket; + kqueue_event_set[kqueue_event_count].flags = EV_ADD; + + switch(state) { + case SOCKET_STATE_READ: + kqueue_event_set[kqueue_event_count].filter = EVFILT_READ; + break; + case SOCKET_STATE_WRITE: + kqueue_event_set[kqueue_event_count].filter = EVFILT_WRITE; + break; + } + + kqueue_event_count++; +} + +void +kqueue_del(unsigned int entry_index) +{ + if (kqueue_event_count == kqueue_event_max) + return; + + kqueue_event_set[kqueue_event_count].ident = kqueue_fd_pool[entry_index]; + kqueue_event_set[kqueue_event_count].flags = EV_DELETE; + + kqueue_event_count++; + + kqueue_fd_pool[entry_index] = kqueue_fd_pool[socket_watch_count]; +} + +int +kqueue_check(useconds_t timeout) +{ + int returned; + struct timespec time_limit; + + time_limit.tv_sec = timeout / 1000; + time_limit.tv_nsec = (timeout % 1000) * 1000000; + + returned = kevent(kqueue_fd, kqueue_event_set, kqueue_event_count, kqueue_event_get, socket_pool_size, &time_limit); + + kqueue_event_returned = (returned == -1) ? 0:returned; + + kqueue_event_count = 0; + kqueue_event_index = 0; + + return(returned); +} + +unsigned int +kqueue_get(void) +{ + unsigned int a, b; + + while ((a = kqueue_event_index++) < kqueue_event_returned) + if (!(kqueue_event_get[a].flags & EV_ERROR)) + for (b = 0; b < socket_watch_count; b++) + if (kqueue_fd_pool[b] == kqueue_event_get[a].ident) + return(b); + + return(socket_watch_count); +} + +#else + +#ifdef HAVE_POLL + +static struct pollfd *poll_fd_pool = NULL; +static unsigned int poll_fd_pool_index = 0; + +int +poll_init(unsigned int entry_count) +{ + if ((poll_fd_pool = (struct pollfd *)realloc(poll_fd_pool, (sizeof(struct pollfd) * entry_count))) == NULL) + return(-1); + + return(0); +} + +void +poll_add(int target_socket, char state) +{ + poll_fd_pool[socket_watch_count].fd = target_socket; + + switch(state) { + case SOCKET_STATE_READ: + poll_fd_pool[socket_watch_count].events = POLLIN; + break; + case SOCKET_STATE_WRITE: + poll_fd_pool[socket_watch_count].events = POLLOUT; + break; + } +} + +void +poll_del(unsigned int entry_index) +{ + if (entry_index < socket_watch_count) + memcpy(&poll_fd_pool[entry_index], &poll_fd_pool[socket_watch_count], sizeof(struct pollfd)); +} + +int +poll_check(int timeout) +{ + int returned; + + returned = poll(poll_fd_pool, socket_watch_count, timeout); + + poll_fd_pool_index = 0; + + return(returned); +} + +unsigned int +poll_get(void) +{ + unsigned int a; + + while ((a = poll_fd_pool_index++) < socket_watch_count) + if (poll_fd_pool[a].revents & (POLLIN | POLLHUP | POLLNVAL)) + return(a); + else if (poll_fd_pool[a].revents & (POLLOUT | POLLHUP | POLLNVAL)) + return(a); + + return(socket_watch_count); +} + +#else + +#ifdef HAVE_SELECT + +static int *select_fd_pool = NULL; +static fd_set select_main_fds_read, select_main_fds_write; +static fd_set select_work_fds_read, select_work_fds_write; +static unsigned int select_fd_pool_index = 0; +static int select_max_fd = -1; + +int +select_init(unsigned int entry_count) +{ + if (select_fd_pool == NULL) { + FD_ZERO(&select_main_fds_read); + FD_ZERO(&select_main_fds_write); + } + + select_fd_pool = (int *)realloc(select_fd_pool, (sizeof(int) * entry_count)); + + if (select_fd_pool == NULL) + return(-1); + + return(0); +} + +void +select_add(int target_socket, char state) +{ + select_fd_pool[socket_watch_count] = target_socket; + + if (target_socket > select_max_fd) + select_max_fd = target_socket; + + switch(state) { + case SOCKET_STATE_READ: + FD_SET(target_socket, &select_main_fds_read); + break; + case SOCKET_STATE_WRITE: + FD_SET(target_socket, &select_main_fds_write); + break; + } +} + +void +select_del(unsigned int entry_index) +{ + unsigned int a; + + FD_CLR(select_fd_pool[entry_index], &select_main_fds_read); + FD_CLR(select_fd_pool[entry_index], &select_main_fds_write); + + if (select_max_fd == select_fd_pool[entry_index]) { + select_fd_pool[entry_index] = -1; + select_max_fd = -1; + for (a = 0; a < socket_watch_count; a++) + if (select_fd_pool[a] > select_max_fd) + select_max_fd = select_fd_pool[a]; + } + + select_fd_pool[entry_index] = select_fd_pool[socket_watch_count]; +} + +int +select_check(useconds_t timeout) +{ + int returned; + struct timeval time_limit; + + select_work_fds_read = select_main_fds_read; + select_work_fds_write = select_main_fds_write; + + time_limit.tv_sec = timeout / 1000; + time_limit.tv_usec = (timeout % 1000) * 1000; + + returned = select((select_max_fd + 1), &select_work_fds_read, &select_work_fds_write, NULL, &time_limit); + + select_fd_pool_index = 0; + + return(returned); +} + +unsigned int +select_get(void) +{ + unsigned int a; + + while ((a = select_fd_pool_index++) < socket_watch_count) + if (FD_ISSET(select_fd_pool[a], &select_work_fds_read)) + return(a); + else if (FD_ISSET(select_fd_pool[a], &select_work_fds_write)) + return(a); + + return(socket_watch_count); +} + +#endif + +#endif + +#endif + +int +socket_block(int target_socket, char block) +{ + int returned, socket_flags; + + returned = socket_flags = fcntl(target_socket, F_GETFL, 0); + + if (returned == -1) + return(-1); + + if (socket_flags & O_NONBLOCK) { + if (block) + socket_flags ^= O_NONBLOCK; + } else { + if (!block) + socket_flags |= O_NONBLOCK; + } + + if (returned != socket_flags) + if (fcntl(target_socket, F_SETFL, socket_flags) < 0) + return(-1); + + return(0); +} + +int +socket_connect_tcp(struct sockaddr_in *target_address, socklen_t target_len) +{ + int returned, target_socket; + + if ((returned = socket(AF_INET, SOCK_STREAM, 0)) == -1) + return(-1); + + target_socket = returned; + + if (socket_block(target_socket, 0) == -1) + return(-1); + + if (connect(target_socket, (struct sockaddr *)target_address, target_len) == 0) + return(target_socket); + else if (errno == EINPROGRESS) + return(target_socket); + else + return(-1); +} + +int +socket_error(int target_socket) +{ + int socket_state; + socklen_t a; + + a = sizeof(socket_state); + + if (getsockopt(target_socket, SOL_SOCKET, SO_ERROR, &socket_state, &a) != 0) + return(-2); + else if (socket_state != 0) { + errno = socket_state; + return(-1); + } else + return(0); +} + +int +socket_read(int target_socket, char **data, unsigned int *data_len) +{ + char *a, receive_buffer[SOCKET_BUFFER_SIZE]; + int returned; + + if ((returned = read(target_socket, receive_buffer, SOCKET_BUFFER_SIZE)) == 0) + return(0); + else if (returned == -1) { + if (errno == EAGAIN) + return(1); + else + return(-1); + } + + if ((a = (char *)malloc(*data_len + returned + 1)) == NULL) + return(-2); + + if (*data != NULL) { + memcpy(a, *data, *data_len); + free(*data); + } + + memcpy((a + *data_len), receive_buffer, returned); + + *data_len += returned; + a[*data_len] = '\0'; + *data = a; + + return(returned); +} + +/* the rest of code is quick hack for BindBackend2 keepalive funcionality, +it should probably go into separate file but I'm too lazy to do that just for those few lines :) */ + +static const char *http_default_request[] = { "HEAD ", "/", " HTTP/1.0\r\nUser-Agent: PowerDNS Keepalive Guard 0.1\r\n\r\n" }; + +int +http_request(int target_socket, const char *request_url) +{ + if (request_url == NULL) + request_url = http_default_request[1]; + + if (write(target_socket, http_default_request[0], strlen(http_default_request[0])) == -1) { + if (errno == EAGAIN) + return(0); + else + return(-1); + } + + if (write(target_socket, request_url, strlen(request_url)) == -1) + return(-1); + + if (write(target_socket, http_default_request[2], strlen(http_default_request[2])) == -1) + return(-1); + + return(1); +} + +int +http_response(char *data) +{ + int protocol_major, protocol_minor, status_code; + + if (data == NULL) + return(0); + + if (strstr(data, "\r\n\r\n") == NULL) + return(0); + + if (strlen(data) < 14) + return(-1); + + if ((strncmp(data, "HTTP/", 5) != 0) || (data[6] != '.') || (data[8] != ' ') || (data[12] != ' ')) + return(-1); + + if (sscanf(data, "HTTP/%d.%d %d ", &protocol_major, &protocol_minor, &status_code) != 3) + return(-1); + + if (protocol_major != 1) + return(-1); + + return(status_code); +} diff -Nru pdns-2.9.21.orig/pdns/backends/bind/socket_helper.hh pdns-2.9.21/pdns/backends/bind/socket_helper.hh --- pdns-2.9.21.orig/pdns/backends/bind/socket_helper.hh 1970-01-01 01:00:00.000000000 +0100 +++ pdns-2.9.21/pdns/backends/bind/socket_helper.hh 2007-11-06 11:46:35.000000000 +0100 @@ -0,0 +1,35 @@ +/* + Helper routines for multiple sockets manipulation + inspired by fd watcher seen in thttpd webserver (http://www.acme.com/software/thttpd/) + + Copyright (C) 2006 Daniel Bilik (daniel.bilik@neosystem.cz) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#define SOCKET_STATE_READ 1 +#define SOCKET_STATE_WRITE 2 + +#define SOCKET_BUFFER_SIZE 4096 + +int socket_pool_init(unsigned int entry_count); +int socket_pool_add(int target_socket, char *target_socket_ready, char state); +void socket_pool_del(int target_socket); +void socket_pool_update(int target_socket, char *target_socket_ready); +int socket_pool_check(useconds_t timeout); +int socket_connect_tcp(struct sockaddr_in *target_address, socklen_t target_len); +int socket_error(int target_socket); +int socket_read(int target_socket, char **data, unsigned int *data_len); +int http_request(int target_socket, const char *request_url); +int http_response(char *data); diff -Nru pdns-2.9.21.orig/pdns/common_startup.cc pdns-2.9.21/pdns/common_startup.cc --- pdns-2.9.21.orig/pdns/common_startup.cc 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/common_startup.cc 2007-11-06 11:54:10.000000000 +0100 @@ -30,6 +30,7 @@ UDPNameserver *N; int avg_latency; TCPNameserver *TN; +StatWebServer sws; ArgvMap &arg() { @@ -298,7 +299,6 @@ dl->go(); pthread_t qtid; - StatWebServer sws; if(arg().mustDo("webserver")) sws.go(); diff -Nru pdns-2.9.21.orig/pdns/communicator.cc pdns-2.9.21/pdns/communicator.cc --- pdns-2.9.21.orig/pdns/communicator.cc 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/communicator.cc 2007-11-06 11:56:24.000000000 +0100 @@ -78,7 +78,7 @@ while(resolver.axfrChunk(recs)) { if(first) { L<startTransaction(domain, domain_id); + di.backend->startTransaction(domain, domain_id, remote); first=false; } for(Resolver::res_t::iterator i=recs.begin();i!=recs.end();++i) { diff -Nru pdns-2.9.21.orig/pdns/dnsbackend.hh pdns-2.9.21/pdns/dnsbackend.hh --- pdns-2.9.21.orig/pdns/dnsbackend.hh 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/dnsbackend.hh 2007-11-06 11:57:43.000000000 +0100 @@ -91,7 +91,7 @@ } //! starts the transaction for updating domain qname (FIXME: what is id?) - virtual bool startTransaction(const string &qname, int id=-1) + virtual bool startTransaction(const string &qname, int id=-1, const string &master="") { return false; } diff -Nru pdns-2.9.21.orig/pdns/receiver.cc pdns-2.9.21/pdns/receiver.cc --- pdns-2.9.21.orig/pdns/receiver.cc 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/receiver.cc 2007-11-06 13:07:11.000000000 +0100 @@ -52,6 +52,7 @@ #include "statbag.hh" #include "tcpreceiver.hh" #include "packetcache.hh" +#include "webserver.hh" #include "ws.hh" #include "misc.hh" #include "dynlistener.hh" diff -Nru pdns-2.9.21.orig/pdns/webserver.cc pdns-2.9.21/pdns/webserver.cc --- pdns-2.9.21.orig/pdns/webserver.cc 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/webserver.cc 2007-11-06 16:10:31.000000000 +0100 @@ -137,7 +137,7 @@ HandlerFunction *fptr; if((fptr=d_functions[baseUrl])) { - bool custom; + bool custom = false; string ret=(*fptr)(varmap, d_that, &custom); if(!custom) { diff -Nru pdns-2.9.21.orig/pdns/ws.cc pdns-2.9.21/pdns/ws.cc --- pdns-2.9.21.orig/pdns/ws.cc 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/ws.cc 2007-11-06 13:16:02.000000000 +0100 @@ -15,8 +15,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "utility.hh" -#include "ws.hh" #include "webserver.hh" +#include "ws.hh" #include "logger.hh" #include "statbag.hh" #include "misc.hh" @@ -24,6 +24,7 @@ #include "dns.hh" extern StatBag S; +static vector registered_pages; StatWebServer::StatWebServer() { @@ -166,6 +167,14 @@ time_t passed=time(0)-s_starttime; + if (! registered_pages.empty()) { + ret << "

" << endl; + ret << "Backend statistics:" << endl; + for(vector::const_iterator i=registered_pages.begin();i!=registered_pages.end();++i) + ret << "[" << *i << "]" << endl; + ret << "

" << endl; + } + ret<<"Uptime: "; ret<setCaller(this); + ws->registerHandler("",&indexfunction); + ws->go(); } catch(...) { L<registerHandler(s, ptr); + registered_pages.push_back(s); +} diff -Nru pdns-2.9.21.orig/pdns/ws.hh pdns-2.9.21/pdns/ws.hh --- pdns-2.9.21.orig/pdns/ws.hh 2007-04-21 15:56:36.000000000 +0200 +++ pdns-2.9.21/pdns/ws.hh 2007-11-06 13:16:49.000000000 +0100 @@ -81,6 +81,7 @@ public: StatWebServer(); void go(); + void registerPage(string &s, WebServer::HandlerFunction *ptr); private: static void *threadHelper(void *); static void *statThreadHelper(void *p); @@ -89,6 +90,9 @@ void printargs(ostringstream &ret); void launch(); void statThread(); + + WebServer *ws; + pthread_t d_tid; time_t d_start;
Keepalive guard entries
target addresstarget statelast checkedcheck details
" << ip << "" << keepalive_target_state[b] << "" << check_time << ""; + if (status_watch_pool[c].check_error > 0) + returned << strerror(watch_pool[c].check_error); + else { + check_duration = (double)status_watch_pool[c].check_duration / 1000000; + switch (status_watch_pool[c].type) { + case KEEPALIVE_TYPE_TCP: + snprintf(check_detail, 60, "TCP connect on port %d in %1.2f secs", port, check_duration); + break; + case KEEPALIVE_TYPE_HTTP: + snprintf(check_detail, 60, "HTTP response code %d on port %d in %1.2f secs", status_watch_pool[c].check_http_code, port, check_duration); + break; + } + returned << check_detail; + } + returned << "