diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 7cc33efc2..97fe2e9e6 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -52,29 +52,7 @@ add_library(reactor-c) target_sources(reactor-c PRIVATE ${REACTORC_SOURCES}) lf_enable_compiler_warnings(reactor-c) -if(DEFINED LF_TRACE) - include(${LF_ROOT}/trace/api/CMakeLists.txt) - target_link_libraries(reactor-c PUBLIC lf::trace-api) - # If the user specified an external trace plugin. Find it and link with it - if (LF_TRACE_PLUGIN) - message(STATUS "Linking trace plugin library ${LF_TRACE_PLUGIN}") - find_library(TRACE_LIB NAMES ${LF_TRACE_PLUGIN} HINTS "${LF_ROOT}") - if (NOT TRACE_LIB) - message(FATAL_ERROR "The trace plugin library ${LF_TRACE_PLUGIN} not found") - endif() - # We also link with libdl because it is needed for some platforms. - # TODO: Figure out why this is the case and how to avoid it. - target_link_libraries(reactor-c PRIVATE ${TRACE_LIB} dl) - else() - # If not, use the default implementation - message(STATUS "Linking with default trace implementation") - include(${LF_ROOT}/trace/impl/CMakeLists.txt) - target_link_libraries(reactor-c PRIVATE lf::trace-impl) - endif() -else() - include(${LF_ROOT}/trace/api/types/CMakeLists.txt) - target_link_libraries(reactor-c PUBLIC lf::trace-api-types) -endif() +include(${LF_ROOT}/core/lf_trace.cmake) include(${LF_ROOT}/version/api/CMakeLists.txt) target_link_libraries(reactor-c PUBLIC lf::version-api) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 93205eab2..67582b4e2 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -46,6 +46,9 @@ const char* rti_trace_file_name = "rti.lft"; /** Indicator that normal termination of the RTI has occurred. */ bool normal_termination = false; +// RTI transient federates information file path +const char* transient_federates_file_path = NULL; + /** * Send a failed signal to the specified federate. 
*/ @@ -99,6 +102,8 @@ void usage(int argc, const char* argv[]) { lf_print(" The ID of the federation that this RTI will control.\n"); lf_print(" -n, --number_of_federates "); lf_print(" The number of federates in the federation that this RTI will control.\n"); + lf_print(" -nt, --number_of_transient_federates "); + lf_print(" The number of transient federates in the federation that this RTI will control.\n"); lf_print(" -p, --port "); lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. Default is %d.\n", UINT16_MAX, DEFAULT_PORT); @@ -206,6 +211,15 @@ int process_args(int argc, const char* argv[]) { i++; lf_print("RTI: Federation ID: %s", argv[i]); rti.federation_id = argv[i]; + } else if (strcmp(argv[i], "-tf") == 0 || strcmp(argv[i], "--tf") == 0){ + if (argc < i + 2) { + lf_print_error("-tf needs the file path argument"); + return 0; + } + i++; + transient_federates_file_path = argv[i]; + lf_print("RTI: The transient federates config file path was retrieved: %s", transient_federates_file_path); + } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--number_of_federates") == 0) { if (argc < i + 2) { lf_print_error("--number_of_federates needs an integer argument."); @@ -221,6 +235,21 @@ int process_args(int argc, const char* argv[]) { } rti.base.number_of_scheduling_nodes = (int32_t)num_federates; // FIXME: Loses numbers on 64-bit machines lf_print("RTI: Number of federates: %d", rti.base.number_of_scheduling_nodes); + } else if (strcmp(argv[i], "-nt") == 0 || strcmp(argv[i], "--number_of_transient_federates") == 0) { + if (argc < i + 2) { + lf_print_error("--number_of_transient_federates needs a valid positive argument."); + usage(argc, argv); + return 0; + } + i++; + long num_transient_federates = strtol(argv[i], NULL, 10); + if (num_transient_federates == LONG_MAX || num_transient_federates < 0) { /* Reject overflow and any negative count (covers LONG_MIN); zero is allowed. */ + lf_print_error("--number_of_transient_federates needs a valid positive or null integer argument."); +
usage(argc, argv); + return 0; + } + rti.number_of_transient_federates = (int32_t)num_transient_federates; // FIXME: Loses numbers on 64-bit machines + lf_print("RTI: Number of transient federates: %d", rti.number_of_transient_federates); } else if (strcmp(argv[i], "-p") == 0 || strcmp(argv[i], "--port") == 0) { #if defined(COMM_TYPE_TCP) || defined(COMM_TYPE_SST) || defined(COMM_TYPE_TLS) if (argc < i + 2) { @@ -303,6 +332,16 @@ int process_args(int argc, const char* argv[]) { return 0; } } + if (rti.base.number_of_scheduling_nodes == 0) { + lf_print_error("--number_of_federates needs a valid positive integer argument."); + usage(argc, argv); + return 0; + } + if (rti.number_of_transient_federates > rti.base.number_of_scheduling_nodes) { + lf_print_error("--number_of_transient_federates cannot be higher than the number of federates."); + usage(argc, argv); + return 0; + } return 1; } int main(int argc, const char* argv[]) { @@ -342,8 +381,8 @@ int main(int argc, const char* argv[]) { lf_print("Tracing the RTI execution in %s file.", rti_trace_file_name); } - lf_print("Starting RTI for %d federates in federation ID %s.", rti.base.number_of_scheduling_nodes, - rti.federation_id); + lf_print("Starting RTI for a total of %d federates, with %d being transient, in federation ID %s", + rti.base.number_of_scheduling_nodes, rti.number_of_transient_federates, rti.federation_id); assert(rti.base.number_of_scheduling_nodes < UINT16_MAX); // Allocate memory for the federates @@ -355,6 +394,10 @@ int main(int argc, const char* argv[]) { rti.base.scheduling_nodes[i] = (scheduling_node_t*)fed_info; } + // One of the options for a place to parse the transient federate config file and set the fields of the transient federates?? + // TODO: Find other safe places to parse the transeint config file?? 
+ parse_transient_federate_config(transient_federates_file_path); + if (!start_rti_server()) { wait_for_federates(); normal_termination = true; diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 15a5596c7..b1b4bbadd 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -42,6 +42,7 @@ void invalidate_min_delays() { node->flags = 0; // All flags cleared because they get set lazily. } free(rti_common->min_delays); + rti_common->min_delays = NULL; } } @@ -101,6 +102,8 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { if (lf_tag_compare(rti_common->min_delays[i * n + e->id], FOREVER_TAG) != 0) { // Node i is upstream of e with min delay rti_common->min_delays[i * n + e->id] scheduling_node_t* upstream = rti_common->scheduling_nodes[i]; + if (upstream->state == NOT_CONNECTED) + continue; // If we haven't heard from the upstream node, then assume it can send an event at the start time. if (lf_tag_compare(upstream->next_event, NEVER_TAG) == 0) { tag_t start_tag = {.time = start_time, .microstep = 0}; @@ -163,6 +166,9 @@ tag_t eimt_strict(scheduling_node_t* e) { tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { tag_advance_grant_t result = {.tag = NEVER_TAG, .is_provisional = false}; + // Check how many upstream federates are connected + int num_connected_upstream = 0; + // Find the earliest LTC of upstream scheduling_nodes (M). tag_t min_upstream_completed = FOREVER_TAG; @@ -172,6 +178,7 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { // Ignore this enclave/federate if it is not connected. if (upstream->state == NOT_CONNECTED) continue; + num_connected_upstream++; // Adjust by the "after" delay. 
// Note that "no delay" is encoded as NEVER, @@ -184,8 +191,15 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { } LF_PRINT_LOG("RTI: Minimum upstream LTC for federate/enclave %d is " PRINTF_TAG "(adjusted by after delay).", e->id, min_upstream_completed.time - start_time, min_upstream_completed.microstep); - if (lf_tag_compare(min_upstream_completed, e->last_granted) > 0 && - lf_tag_compare(min_upstream_completed, e->next_event) >= 0 // The enclave has to advance its tag + + if (num_connected_upstream == 0) { + // When none of the upstream federates is connected (case of transients), + if (lf_tag_compare(e->next_event, FOREVER_TAG) != 0) { + result.tag = e->next_event; + return result; + } + } else if (lf_tag_compare(min_upstream_completed, e->last_granted) > 0 && + lf_tag_compare(min_upstream_completed, e->next_event) >= 0 // The enclave has to advance its tag ) { result.tag = min_upstream_completed; return result; diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 622e2fd5f..437c8bef3 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -27,6 +27,8 @@ #include "rti_remote.h" #include "net_util.h" #include +#include "clock.h" // For lf_clock_cond_timedwait() +#include // Global variables defined in tag.c: extern instant_t start_time; @@ -36,6 +38,15 @@ extern instant_t start_time; */ static rti_remote_t* rti_remote; +// Referance to the federate instance to support hot swap +static federate_info_t* hot_swap_federate; + +// Indicates if a hot swap process is in progress +static bool hot_swap_in_progress = false; + +// Indicates that the old federate has stopped. 
+static bool hot_swap_old_resigned = false; + bool _lf_federate_reports_error = false; // A convenient macro for getting the `federate_info_t *` at index `_idx` @@ -43,24 +54,248 @@ bool _lf_federate_reports_error = false; #define GET_FED_INFO(_idx) (federate_info_t*)rti_remote->base.scheduling_nodes[_idx] lf_mutex_t rti_mutex; -lf_cond_t received_start_times; -lf_cond_t sent_start_time; +static lf_cond_t received_start_times; +static lf_cond_t sent_start_time; +static lf_cond_t updated_delayed_grants; extern int lf_critical_section_enter(environment_t* env) { return lf_mutex_lock(&rti_mutex); } extern int lf_critical_section_exit(environment_t* env) { return lf_mutex_unlock(&rti_mutex); } -void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || - lf_tag_compare(tag, e->last_provisionally_granted) < 0) { +// Utility functions to simplify the call of pqueue_tag routines. +// These functions mainly do the casting. +// FIXME: Should we remove the queue parameter from the functions? + +/** + * @brief Creates a priority queue of delayed grants that is sorted by tags. + * + * @param nbr_delayed_grants The size. + * @return The dynamically allocated queue or NULL. + */ +static pqueue_delayed_grants_t* pqueue_delayed_grants_init(uint16_t nbr_delayed_grants) { + return (pqueue_delayed_grants_t*)pqueue_tag_init((size_t)nbr_delayed_grants); +} + +/** + * @brief Return the size of the queue. + * + * @param q The queue. + * @return The size. + */ +static size_t pqueue_delayed_grants_size(pqueue_delayed_grants_t* q) { return pqueue_tag_size((pqueue_tag_t*)q); } + +/** + * @brief Insert an\ delayed grant element into the queue. + * + * @param q The queue. + * @param e The delayed grant element to insert. 
+ * @return 0 on success + */ +static int pqueue_delayed_grants_insert(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* d) { + return pqueue_tag_insert((pqueue_tag_t*)q, (void*)d); +} + +/** + * @brief Pop the least-tag element from the queue. + * + * @param q The queue. + * @return NULL on error, otherwise the entry + */ +static pqueue_delayed_grant_element_t* pqueue_delayed_grants_pop(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_pop((pqueue_tag_t*)q); +} + +/** + * @brief Return highest-ranking element without removing it. + * + * @param q The queue. + * @return NULL on if the queue is empty, otherwise the delayed grant element. + */ +static pqueue_delayed_grant_element_t* pqueue_delayed_grants_peek(pqueue_delayed_grants_t* q) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_peek((pqueue_tag_t*)q); +} + +/** + * @brief Free all memory used by the queue including elements that are marked dynamic. + * + * @param q The queue. + */ +static void pqueue_delayed_grants_free(pqueue_delayed_grants_t* q) { pqueue_tag_free((pqueue_tag_t*)q); } + +/** + * @brief Remove an item from the delayed grants queue. + * + * @param q The queue. + * @param e The entry to remove. + */ +static void pqueue_delayed_grants_remove(pqueue_delayed_grants_t* q, pqueue_delayed_grant_element_t* e) { + pqueue_tag_remove((pqueue_tag_t*)q, (void*)e); +} + +/** + * @brief Return the first item with the specified tag or NULL if there is none. + * @param q The queue. + * @param t The tag. + * @return An entry with the specified tag or NULL if there isn't one. + */ +pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_with_tag(pqueue_delayed_grants_t* q, tag_t t) { + return (pqueue_delayed_grant_element_t*)pqueue_tag_find_with_tag((pqueue_tag_t*)q, t); +} + +// Function that does not in pqueue_tag.c +/** + * @brief Return the first item with the specified federate id or NULL if there is none. + * @param q The queue. 
+ * @param fed_id The federate id. + * @return An entry with the specified federate if or NULL if there isn't one. + */ +static pqueue_delayed_grant_element_t* pqueue_delayed_grants_find_by_fed_id(pqueue_delayed_grants_t* q, + uint16_t fed_id) { + pqueue_delayed_grant_element_t* dge; + pqueue_t* _q = (pqueue_t*)q; + if (!q || q->size == 1) + return NULL; + for (int i = 1; i < q->size; i++) { + dge = (pqueue_delayed_grant_element_t*)q->d[i]; + if (dge) { + if (dge->fed_id == fed_id) { + return dge; + } + } + } + return NULL; +} + +/** + * @brief Insert the delayed grant into the delayed_grants queue and notify. + * + * This function assumes the caller holds the rti_mutex. + * @param fed The federate. + * @param tag The tag to grant. + * @param is_provisional State whther the grant is provisional. + */ +static void notify_grant_delayed(federate_info_t* fed, tag_t tag, bool is_provisional) { + // Check wether there is already a pending grant. + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, fed->enclave.id); + if (dge == NULL) { + pqueue_delayed_grant_element_t* dge = + (pqueue_delayed_grant_element_t*)malloc(sizeof(pqueue_delayed_grant_element_t)); + dge->base.is_dynamic = 1; + dge->base.tag = tag; + dge->fed_id = fed->enclave.id; + dge->is_provisional = is_provisional; + pqueue_delayed_grants_insert(rti_remote->delayed_grants, dge); + LF_PRINT_LOG("RTI: Inserting a delayed grant of " PRINTF_TAG " for federate %d.", dge->base.tag.time - start_time, + dge->base.tag.microstep, dge->fed_id); + lf_cond_signal(&updated_delayed_grants); + } else { + // Note that there should never be more than one pending grant for a federate. + int compare = lf_tag_compare(dge->base.tag, tag); + if (compare > 0) { + // Update the pre-existing grant. 
+ dge->base.tag = tag; + dge->is_provisional = is_provisional; + LF_PRINT_LOG("RTI: Updating a delayed grant of " PRINTF_TAG " for federate %d.", tag.time - start_time, + tag.microstep, dge->fed_id); + lf_cond_signal(&updated_delayed_grants); + } else if (compare == 0) { + if (dge->is_provisional != is_provisional) { + // Update the grant to keep the most recent is_provisional status. + dge->is_provisional = is_provisional; + LF_PRINT_LOG("RTI: Changing status of a delayed grant of " PRINTF_TAG " for federate %d to provisional: %d.", + dge->base.tag.time - start_time, dge->base.tag.microstep, dge->fed_id, is_provisional); + } + } + } +} + +/** + * Count the immediate upstream transient federates of `fed` that are not connected. + * @param fed The federate whose immediate upstreams are inspected. + * @return The number of immediate upstream federates that are transient and in state NOT_CONNECTED. + */ +static int get_num_absent_upstream_transients(federate_info_t* fed) { + int num_absent_upstream_transients = 0; + for (int j = 0; j < fed->enclave.num_immediate_upstreams; j++) { + federate_info_t* upstream = GET_FED_INFO(fed->enclave.immediate_upstreams[j]); + // Count this upstream only if it is a transient federate that is not connected. + if ((upstream->enclave.state == NOT_CONNECTED) && (upstream->is_transient)) { + num_absent_upstream_transients++; + } + } + return num_absent_upstream_transients; +} + +/** + * @brief Send MSG_TYPE_UPSTREAM_CONNECTED to the specified `destination` if it is connected to the RTI, + * telling it that the specified `connected` federate is also now connected. + * + * This function assumes that the mutex lock is already held. + * @param destination The destination federate. + * @param connected The federate that is now connected.
*/ +static void send_upstream_connected_locked(federate_info_t* destination, federate_info_t* connected) { + if (destination->enclave.state == NOT_CONNECTED) { + LF_PRINT_LOG("RTI did not send upstream connected message to federate %d, because it is not connected.", + destination->enclave.id); return; } - // Need to make sure that the destination federate's thread has already - // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { - // Need to wait here. - lf_cond_wait(&sent_start_time); + unsigned char buffer[MSG_TYPE_UPSTREAM_CONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_CONNECTED; + encode_uint16(connected->enclave.id, &buffer[1]); + if (write_to_net_close_on_error(destination->net, MSG_TYPE_UPSTREAM_CONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream connected message to federate %d.", destination->enclave.id); + } +} + +/** + * @brief Send MSG_TYPE_UPSTREAM_DISCONNECTED to the specified federate. + * + * This function assumes that the mutex lock is already held. + * @param destination The destination federate. + * @param disconnected The disconnected federate. + */ +static void send_upstream_disconnected_locked(federate_info_t* destination, federate_info_t* disconnected) { + unsigned char buffer[MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH]; + buffer[0] = MSG_TYPE_UPSTREAM_DISCONNECTED; + encode_uint16(disconnected->enclave.id, &buffer[1]); + if (write_to_net_close_on_error(destination->net, MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH, buffer)) { + lf_print_warning("RTI: Failed to send upstream disconnected message to federate %d.", destination->enclave.id); /* Log the federate the write was addressed to, not the one that disconnected. */ } +} + +/** + * @brief Mark a federate as disconnected and, if this is a transient, inform downstream federates. + * @param fed The disconnected federate. + */ +static void notify_federate_disconnected(federate_info_t* fed) { + fed->enclave.state = NOT_CONNECTED; + // Notify downstream federates. Need to hold the mutex lock to do this.
+ if (fed->is_transient) { + LF_MUTEX_LOCK(&rti_mutex); + for (int j = 0; j < fed->enclave.num_immediate_downstreams; j++) { + federate_info_t* downstream = GET_FED_INFO(fed->enclave.immediate_downstreams[j]); + // Ignore this enclave if it no longer connected. + if (downstream->enclave.state != NOT_CONNECTED) { + // Notify the downstream enclave. + send_upstream_disconnected_locked(downstream, fed); + } + } + LF_MUTEX_UNLOCK(&rti_mutex); + } +} + +/** + * Notify a tag advance grant (TAG) message to the specified federate immediately. + * + * This function will keep a record of this TAG in the enclave's last_granted + * field. + * + * @param e The enclave. + * @param tag The tag to grant. + */ +static void notify_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); unsigned char buffer[message_length]; buffer[0] = MSG_TYPE_TAG_ADVANCE_GRANT; @@ -75,7 +310,8 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // to fail. Consider a failure here a soft failure and update the federate's status. 
if (write_to_net(((federate_info_t*)e)->net, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - e->state = NOT_CONNECTED; + // Mark a federate as disconnected and inform if necessary + notify_federate_disconnected(GET_FED_INFO(e->id)); } else { e->last_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", e->id, tag.time - start_time, @@ -83,7 +319,7 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } -void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { +void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { return; @@ -94,6 +330,32 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // Need to wait here. lf_cond_wait(&sent_start_time); } + + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_grant_delayed(fed, tag, false); + } else { + notify_tag_advance_grant_immediate(e, tag); + } + } +} + +/** + * Notify a provisional tag advance grant (PTAG) message to the specified federate + * immediately. + * + * This function will keep a record of this TAG in the enclave's last_provisionally_granted + * field. + * + * @param e The scheduling node. + * @param tag The tag to grant. 
+ */ +void notify_provisional_tag_advance_grant_immediate(scheduling_node_t* e, tag_t tag) { size_t message_length = 1 + sizeof(int64_t) + sizeof(uint32_t); unsigned char buffer[message_length]; buffer[0] = MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT; @@ -108,7 +370,8 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // to fail. Consider a failure here a soft failure and update the federate's status. if (write_to_net(((federate_info_t*)e)->net, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - e->state = NOT_CONNECTED; + // Mark a federate as disconnected and inform if necessary + notify_federate_disconnected(GET_FED_INFO(e->id)); } else { e->last_provisionally_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, @@ -144,6 +407,32 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } } +void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || + lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { + return; + } + // Need to make sure that the destination federate's thread has already + // sent the starting MSG_TYPE_TIMESTAMP message. + while (e->state == PENDING) { + // Need to wait here. 
+ lf_cond_wait(&sent_start_time); + } + + // Check if sending the tag advance grant needs to be delayed or not + // Delay is needed when a federate has, at least one, absent upstream transient + federate_info_t* fed = GET_FED_INFO(e->id); + if (!fed->has_upstream_transient_federates) { + notify_provisional_tag_advance_grant_immediate(e, tag); + } else { + if (get_num_absent_upstream_transients(fed) > 0) { + notify_grant_delayed(fed, tag, true); + } else { + notify_provisional_tag_advance_grant_immediate(e, tag); + } + } +} + void notify_downstream_next_event_tag(scheduling_node_t* e, tag_t tag) { if (e->state == NOT_CONNECTED) { return; @@ -287,20 +576,22 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff // issue a TAG before this message has been forwarded. LF_MUTEX_LOCK(&rti_mutex); - // If the destination federate is no longer connected, issue a warning, - // remove the message from the network abstraction and return. + // If the destination federate is no longer connected, or it is a transient that has not started executing yet + // (the delayed intended tag is less than the effective start tag of the destination), issue a warning, remove the + // message from the socket, and return. federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", federate_id); - LF_PRINT_LOG("Fed status: next_event " PRINTF_TAG ", " - "completed " PRINTF_TAG ", " - "last_granted " PRINTF_TAG ", " - "last_provisionally_granted " PRINTF_TAG ".", - fed->enclave.next_event.time - start_time, fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep); + interval_t delay = NEVER; + for (int i = 0; i < fed->enclave.num_immediate_upstreams; i++) { + if (fed->enclave.immediate_upstreams[i] == sending_federate->enclave.id) { + delay = fed->enclave.immediate_upstream_delays[i]; + break; + } + } + if (fed->enclave.state == NOT_CONNECTED || + lf_tag_compare(lf_delay_tag(intended_tag, delay), fed->effective_start_tag) < 0) { + lf_print_warning("RTI: Destination federate %d is not connected at logical time (" PRINTF_TAG + "). Dropping message.", + federate_id, intended_tag.time - start_time, intended_tag.microstep); // If the message was larger than the buffer, we must empty out the remainder also. size_t total_bytes_read = bytes_read; while (total_bytes_read < total_bytes_to_read) { @@ -462,18 +753,21 @@ static void broadcast_stop_time_to_federates_locked() { } /** - * Mark a federate requesting stop. If the number of federates handling stop reaches the - * NUM_OF_FEDERATES, broadcast MSG_TYPE_STOP_GRANTED to every federate. + * Mark a federate requesting stop. If the number of federates handling stop reaches + * the number of persistent federates, broadcast MSG_TYPE_STOP_GRANTED to every federate. * This function assumes the _RTI.mutex is already locked. * @param fed The federate that has requested a stop. * @return 1 if stop time has been sent to all federates and 0 otherwise. 
*/ static int mark_federate_requesting_stop(federate_info_t* fed) { if (!fed->requested_stop) { - rti_remote->base.num_scheduling_nodes_handling_stop++; + // Increment the number of federates handling stop only if it is persistent + if (!fed->is_transient) + rti_remote->base.num_scheduling_nodes_handling_stop++; fed->requested_stop = true; } - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { + if (rti_remote->base.num_scheduling_nodes_handling_stop == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { // We now have information about the stop time of all // federates. broadcast_stop_time_to_federates_locked(); @@ -693,6 +987,69 @@ void handle_address_ad(uint16_t federate_id) { } } +/** + * @brief Send the global federation start time and the federate-specific starting tag to the specified federate. + * + * For persistent federates and transient federates that happen to join during federation startup, the + * `federation_start_time` will match the time in the `federate_start_tag`, and the microstep will be 0. + * For a transient federate that joins later, the time in the `federate_start_tag` will be greater than the + * federation_start_time`. + * + * + * Before sending the start time and tag, this function notifies my_fed of all upstream transient federates that are + * connected. After sending the start time and tag, and if my_fed is transient, notify federates downstream of its + * connection, ensuring proper handling of zero-delay cycles. + * + * This function assumes that the mutex lock is already held. + * + * @param my_fed the federate to send the start time to. 
+ * @param federation_start_time the federation start_time + * @param federate_start_tag the federate effective start tag + */ +static void send_start_tag_locked(federate_info_t* my_fed, instant_t federation_start_time, tag_t federate_start_tag) { + // Notify my_fed of any upstream transient federates that are connected. + // This has to occur before sending the start tag so that my_fed does not begin executing thinking that these + // upstream federates are not connected. + for (int i = 0; i < my_fed->enclave.num_immediate_upstreams; i++) { + federate_info_t* fed = GET_FED_INFO(my_fed->enclave.immediate_upstreams[i]); + if (fed->is_transient && fed->enclave.state == GRANTED) { + send_upstream_connected_locked(my_fed, fed); + } + } + + // Send back to the federate the maximum time plus an offset on a TIMESTAMP_START + // message. + // If it is a persistent federate, only the start_time is sent. If, however, it is a transient + // federate, the effective_start_tag is also sent. + size_t buffer_size = (my_fed->is_transient) ? MSG_TYPE_TIMESTAMP_TAG_LENGTH : MSG_TYPE_TIMESTAMP_LENGTH; + unsigned char start_time_buffer[buffer_size]; + start_time_buffer[0] = MSG_TYPE_TIMESTAMP; + encode_int64(swap_bytes_if_big_endian_int64(federation_start_time), &start_time_buffer[1]); + if (my_fed->is_transient) { + encode_tag(&(start_time_buffer[1 + sizeof(instant_t)]), federate_start_tag); + } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &federate_start_tag); + } + if (write_to_net(my_fed->net, buffer_size, start_time_buffer)) { + lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); + } else { + // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP_START + // message has been sent. That MSG_TYPE_TIMESTAMP_START message grants time advance to + // the federate to the federate_start_tag.time. 
+ my_fed->enclave.state = GRANTED; + lf_cond_broadcast(&sent_start_time); + LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); + + // If this is a transient federate, notify its downstream federates that it is now connected. + if (my_fed->is_transient) { + for (int i = 0; i < my_fed->enclave.num_immediate_downstreams; i++) { + send_upstream_connected_locked(GET_FED_INFO(my_fed->enclave.immediate_downstreams[i]), my_fed); + } + } + } +} + void handle_timestamp(federate_info_t* my_fed) { unsigned char buffer[sizeof(int64_t)]; // Read bytes from the network abstraction. We need 8 bytes. @@ -707,49 +1064,151 @@ void handle_timestamp(federate_info_t* my_fed) { LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); LF_MUTEX_LOCK(&rti_mutex); - rti_remote->num_feds_proposed_start++; - if (timestamp > rti_remote->max_start_time) { - rti_remote->max_start_time = timestamp; - } - if (rti_remote->num_feds_proposed_start == rti_remote->base.number_of_scheduling_nodes) { - // All federates have proposed a start time. - lf_cond_broadcast(&received_start_times); + + // Processing the TIMESTAMP depends on whether it is the startup phase. + if (rti_remote->phase == startup_phase) { + // Not all persistent federates have proposed a start time. + if (timestamp > rti_remote->max_start_time) { + rti_remote->max_start_time = timestamp; + } + // Note that if a transient federate's thread gets here during the startup phase, + // then it will be assigned the same global tag as its effective start tag and its + // timestamp will affect that start tag. + if (!my_fed->is_transient) { + rti_remote->num_feds_proposed_start++; + } + + if (rti_remote->num_feds_proposed_start == + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + // This federate is the last persistent federate to proposed a start time. 
+ lf_cond_broadcast(&received_start_times); + rti_remote->phase = execution_phase; + } else { + // Wait until all persistent federates have proposed a start time. + while (rti_remote->num_feds_proposed_start < + (rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates)) { + lf_cond_wait(&received_start_times); + } + } + // Add an offset to the maximum tag to get everyone starting together. + start_time = rti_remote->max_start_time + DELAY_START; + // Set the start_time in the RTI trace + if (rti_remote->base.tracing_enabled) { + lf_tracing_set_start_time(start_time); + } + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; + + // Notify the federate of its start tag. + // This has to be done while still holding the mutex. + send_start_tag_locked(my_fed, start_time, my_fed->effective_start_tag); + + LF_MUTEX_UNLOCK(&rti_mutex); + } else if (rti_remote->phase == shutdown_phase || !my_fed->is_transient) { + LF_MUTEX_UNLOCK(&rti_mutex); + // Send reject message if the federation is in shutdown phase or if + // it is in the execution phase but the federate is persistent. + send_reject(my_fed->net, JOINING_TOO_LATE); + return; } else { - // Some federates have not yet proposed a start time. - // wait for a notification. - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { - // FIXME: Should have a timeout here? - lf_cond_wait(&received_start_times); + // The federate is transient and we are in the execution phase. + // At this point, we already hold the mutex. + + //// Algorithm for computing the effective_start_time of a joining transient + // The effective_start_time will be the max among all the following tags: + // 1. At tag: (joining time, 0 microstep) + // 2. (start_time, 0 microstep) + // 3. The latest completed logical tag + 1 microstep + // 4. The latest granted (P)TAG + 1 microstep, of every downstream federate + // 5. 
The maximun tag of messages from the upstream federates + 1 microstep + + // Condition 1. + my_fed->effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; + + // Condition 2. + if (timestamp < start_time) { + my_fed->effective_start_tag = (tag_t){.time = start_time, .microstep = 0u}; } - } - LF_MUTEX_UNLOCK(&rti_mutex); + // Condition 3. + if (lf_tag_compare(my_fed->enclave.completed, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = my_fed->enclave.completed; + my_fed->effective_start_tag.microstep++; + } - // Send back to the federate the maximum time plus an offset on a TIMESTAMP - // message. - unsigned char start_time_buffer[MSG_TYPE_TIMESTAMP_LENGTH]; - start_time_buffer[0] = MSG_TYPE_TIMESTAMP; - // Add an offset to this start time to get everyone starting together. - start_time = rti_remote->max_start_time + DELAY_START; - lf_tracing_set_start_time(start_time); - encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); + // Condition 4. Iterate over the downstream federates + for (int j = 0; j < my_fed->enclave.num_immediate_downstreams; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.immediate_downstreams[j]); - if (rti_remote->base.tracing_enabled) { - tag_t tag = {.time = start_time, .microstep = 0}; - tracepoint_rti_to_federate(send_TIMESTAMP, my_fed->enclave.id, &tag); - } - if (write_to_net(my_fed->net, MSG_TYPE_TIMESTAMP_LENGTH, start_time_buffer)) { - lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); - } + // Get the max over the TAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_granted; + my_fed->effective_start_tag.microstep++; + } - LF_MUTEX_LOCK(&rti_mutex); - // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP - // message has been sent. 
That MSG_TYPE_TIMESTAMP message grants time advance to - // the federate to the start time. - my_fed->enclave.state = GRANTED; - lf_cond_broadcast(&sent_start_time); - LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); - LF_MUTEX_UNLOCK(&rti_mutex); + // Get the max over the PTAG of the downstreams + if (lf_tag_compare(downstream->enclave.last_provisionally_granted, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = downstream->enclave.last_provisionally_granted; + my_fed->effective_start_tag.microstep++; + } + } + + // Condition 5. + // This one is a bit subtle. Any messages from upstream federates that the RTI has + // not yet seen will be sent to this joining federate after the effective_start_tag + // because the effective_start_tag is sent while still holding the mutex. + + // Iterate over the messages from the upstream federates + for (int j = 0; j < my_fed->enclave.num_immediate_upstreams; j++) { + federate_info_t* upstream = GET_FED_INFO(my_fed->enclave.immediate_upstreams[j]); + + size_t queue_size = pqueue_tag_size(upstream->in_transit_message_tags); + if (queue_size != 0) { + tag_t max_tag = pqueue_tag_max_tag(upstream->in_transit_message_tags); + + if (lf_tag_compare(max_tag, my_fed->effective_start_tag) >= 0) { + my_fed->effective_start_tag = max_tag; + my_fed->effective_start_tag.microstep++; + } + } + } + + // For every downstream that has a pending grant that is higher than the + // effective_start_time of the federate, cancel it. + // FIXME: Should this be higher-than or equal to? + // FIXME: Also, won't the grant simply be lost? + // If the joining federate doesn't send anything, the downstream federate won't issue another NET. + for (int j = 0; j < my_fed->enclave.num_immediate_downstreams; j++) { + federate_info_t* downstream = GET_FED_INFO(my_fed->enclave.immediate_downstreams[j]); + + // Ignore this federate if it has resigned. 
+ if (downstream->enclave.state == NOT_CONNECTED) { + continue; + } + + // Check the pending grants, if any, and keep it only if it is + // sooner than the effective start tag. + pqueue_delayed_grant_element_t* dge = + pqueue_delayed_grants_find_by_fed_id(rti_remote->delayed_grants, downstream->enclave.id); + if (dge != NULL && lf_tag_compare(dge->base.tag, my_fed->effective_start_tag) >= 0) { + pqueue_delayed_grants_remove(rti_remote->delayed_grants, dge); + } + } + + my_fed->enclave.next_event = my_fed->effective_start_tag; + // Once the effective start time set, sent it to the joining transient, + // together with the start time of the federation. + + // Have to send the start tag while still holding the mutex to ensure that no message + // from an upstream federate is forwarded before the start tag. + send_start_tag_locked(my_fed, start_time, my_fed->effective_start_tag); + + // Whenver a transient joins, invalidate all federates, so that all min_delays_upstream + // get re-computed. + // FIXME: Maybe optimize it to only invalidate those affected by the transient + invalidate_min_delays(); + + LF_MUTEX_UNLOCK(&rti_mutex); + } } void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { @@ -907,12 +1366,15 @@ static void handle_federate_failed(federate_info_t* my_fed) { tracepoint_rti_from_federate(receive_FAILED, my_fed->enclave.id, NULL); } + // First, mark a federate as disconnected and inform if necessary + notify_federate_disconnected(my_fed); + + LF_MUTEX_LOCK(&rti_mutex); + // Set the flag telling the RTI to exit with an error code when it exits. _lf_federate_reports_error = true; lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - // Indicate that there will no further events from this federate. 
my_fed->enclave.next_event = FOREVER_TAG; @@ -948,14 +1410,15 @@ static void handle_federate_resign(federate_info_t* my_fed) { tracepoint_rti_from_federate(receive_RESIGN, my_fed->enclave.id, NULL); } + // First, mark a federate as disconnected and inform if necessary + notify_federate_disconnected(my_fed); lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; - // Indicate that there will no further events from this federate. my_fed->enclave.next_event = FOREVER_TAG; shutdown_net(my_fed->net, true); + my_fed->net = NULL; // Check downstream federates to see whether they should now be granted a TAG. // To handle cycles, need to create a boolean array to keep @@ -983,10 +1446,12 @@ void* federate_info_thread_TCP(void* fed) { if (read_failed) { // network abstraction is closed lf_print_error("RTI: Connection to federate %d is closed. Exiting the thread.", my_fed->enclave.id); - my_fed->enclave.state = NOT_CONNECTED; + notify_federate_disconnected(my_fed); + // Nothing more to do. Close the network abstraction and exit. // Prevent multiple threads from closing the same network abstraction at the same time. shutdown_net(my_fed->net, false); + my_fed->net = NULL; // FIXME: We need better error handling here, but do not stop execution here. break; } @@ -1006,7 +1471,7 @@ void* federate_info_thread_TCP(void* fed) { break; case MSG_TYPE_RESIGN: handle_federate_resign(my_fed); - return NULL; + break; case MSG_TYPE_NEXT_EVENT_TAG: handle_next_event_tag(my_fed); break; @@ -1028,14 +1493,47 @@ void* federate_info_thread_TCP(void* fed) { case MSG_TYPE_FAILED: handle_federate_failed(my_fed); return NULL; + /**A federate is initiating an SST session key rotation. Calls handle_key_refresh_request + * fetches the new key from the RTI's SST context, ACKs the federate, then swaps + * the key into the active slot under rti_mutex. 
+ */ + case MSG_TYPE_SST_KEY_REFRESH_REQUEST: + #ifdef COMM_TYPE_SST + handle_key_refresh_request(my_fed); + #endif + break; + + case MSG_TYPE_TRANSIENT_LAUNCH_REQUEST: + handle_transient_launch_request(my_fed); + break; default: - lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, - buffer[0]); + lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u", my_fed->enclave.id, buffer[0]); if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(receive_UNIDENTIFIED, my_fed->enclave.id, NULL); } } } + + // Nothing more to do. Close the socket and exit. + // Prevent multiple threads from closing the same socket at the same time. + LF_MUTEX_LOCK(&rti_mutex); + shutdown_net(my_fed->net, false); // from unistd.h + // Manual clean, in case of a transient federate + if (my_fed->is_transient) { + // FIXME: Aren't there transit messages anymore??? + // free_in_transit_message_q(my_fed->in_transit_message_tags); + + // Update the number of connected transient federates + rti_remote->number_of_connected_transient_federates--; + + // Reset the status of the leaving federate + reset_transient_federate(my_fed); + } + // Signal the hot swap mechanism, if needed + if (hot_swap_in_progress && hot_swap_federate->enclave.id == my_fed->enclave.id) { + hot_swap_old_resigned = true; + } + LF_MUTEX_UNLOCK(&rti_mutex); return NULL; } @@ -1043,7 +1541,7 @@ void send_reject(net_abstraction_t net_abs, unsigned char error_code) { LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); unsigned char response[2]; response[0] = MSG_TYPE_REJECT; - response[1] = error_code; + response[1] = (unsigned char)error_code; LF_MUTEX_LOCK(&rti_mutex); // NOTE: Ignore errors on this response. 
if (write_to_net(net_abs, 2, response)) { @@ -1074,9 +1572,10 @@ static int32_t receive_and_check_fed_id_message(net_abstraction_t fed_net) { } uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. + bool is_transient = false; // First byte received is the message type. - if (buffer[0] != MSG_TYPE_FED_IDS) { + if (buffer[0] != MSG_TYPE_FED_IDS && buffer[0] != MSG_TYPE_TRANSIENT_FED_IDS) { if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); } @@ -1100,10 +1599,21 @@ static int32_t receive_and_check_fed_id_message(net_abstraction_t fed_net) { } else { // Received federate ID. fed_id = extract_uint16(buffer + 1); - LF_PRINT_DEBUG("RTI received federate ID: %d.", fed_id); - - // Read the federation ID. First read the length, which is one byte. + // Read the federation ID length, which is one byte. size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 1]; + if (buffer[0] == MSG_TYPE_TRANSIENT_FED_IDS) { + unsigned char buf; + read_from_net_close_on_error(fed_net, 1, &buf); + is_transient = (buf == 1) ? true : false; + } + + if (is_transient) { + LF_PRINT_LOG("RTI received federate ID: %d, which is transient.", fed_id); + } else { + LF_PRINT_LOG("RTI received federate ID: %d, which is persistent.", fed_id); + } + + // Read the federation ID. char federation_id_received[federation_id_length + 1]; // One extra for null terminator. // Next read the actual federation ID. if (read_from_net_close_on_error(fed_net, federation_id_length, (unsigned char*)federation_id_received)) { @@ -1139,18 +1649,65 @@ static int32_t receive_and_check_fed_id_message(net_abstraction_t fed_net) { send_reject(fed_net, FEDERATE_ID_OUT_OF_RANGE); return -1; } else { + // Find out if it is a new connection or a hot swap. 
+ // Reject if: + // - duplicate of a connected persistent federate + // - or hot_swap is already in progress (Only 1 hot swap at a time!), for that + // particular federate + // - or it is a hot swap, but it is not the execution phase yet if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + // A hot swap is only legal when both the newcomer and the connected instance are + // transient; otherwise a newcomer that merely claims to be transient could take + // over the connection of a persistent federate. + if (!is_transient || !(GET_FED_INFO(fed_id))->is_transient) { + lf_print_error("RTI received duplicate federate ID: %d.", fed_id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(fed_net, FEDERATE_ID_IN_USE); + return -1; + } else if (hot_swap_in_progress || rti_remote->phase != execution_phase) { + // hot_swap_federate is only valid while a hot swap is in progress, so it must + // not be dereferenced when the rejection is caused by the phase alone. + if (hot_swap_in_progress) { + lf_print_warning("RTI rejects the connection of transient federate %d, " + "because a hot swap is already in progress for federate %d. " + "Only one hot swap operation is allowed at a time.", + fed_id, hot_swap_federate->enclave.id); + } else { + lf_print_warning("RTI rejects the connection of transient federate %d, " + "because it is already connected and a hot swap is only allowed during the execution phase.", + fed_id); + } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_REJECT, fed_id, NULL); + } + send_reject(fed_net, FEDERATE_ID_IN_USE); + return -1; } - send_reject(fed_net, FEDERATE_ID_IN_USE); - return -1; + + // Do NOT return -1 here. This is the hot-swap case: a transient federate + // that is already connected is reconnecting during execution with no other + // hot swap in progress. Fall through so the hot-swap initialization block + // below can allocate hot_swap_federate and set hot_swap_in_progress = true. } } } } - federate_info_t* fed = GET_FED_INFO(fed_id); + + federate_info_t* fed_twin = GET_FED_INFO(fed_id); + federate_info_t* fed; + // If the federate is already connected (making the request a duplicate), and that + // the federate is transient, and it is the execution phase, then mark that a hot + // swap is in progress and initialize the hot_swap_federate.
+ // Otherwise, proceed with a normal transient connection + if (fed_twin->enclave.state != NOT_CONNECTED && is_transient && fed_twin->is_transient && + rti_remote->phase == execution_phase && !hot_swap_in_progress) { + // Allocate zero-initialized memory for the new federate, so that fields that are + // not set by initialize_federate() never hold garbage, and initialize it + hot_swap_federate = (federate_info_t*)calloc(1, sizeof(federate_info_t)); + if (hot_swap_federate == NULL) { + lf_print_error_and_exit("RTI: Out of memory while starting a hot swap for federate %d.", fed_id); + } + initialize_federate(hot_swap_federate, fed_id); + + // Mark hot swap as in progress and reset the old-federate-resigned flag, + // so the RTI waits for the old instance to disconnect before promoting the new one. + hot_swap_in_progress = true; + hot_swap_old_resigned = false; + // free(fed); // Free the old memory to prevent memory leak + fed = hot_swap_federate; + lf_print("RTI: Hot Swap starting for federate %d.", fed_id); + } else { + fed = fed_twin; + fed->is_transient = is_transient; + } + // The MSG_TYPE_FED_IDS message has the right federation ID. fed->net = fed_net; @@ -1182,6 +1739,11 @@ static int32_t receive_and_check_fed_id_message(net_abstraction_t fed_net) { /** * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill * out the relevant information in the federate's struct. + * + * In case of a hot swap, check that no changes were made to the connections, compared + * to the first instance that joined. This means that the first instance to join + * __is__ the reference. + * * @return 1 on success and 0 on failure. */ static int receive_connection_information(net_abstraction_t fed_net, uint16_t fed_id) { @@ -1198,7 +1760,19 @@ static int receive_connection_information(net_abstraction_t fed_net, uint16_t fe send_reject(fed_net, UNEXPECTED_MESSAGE); return 0; } else { + // In case of a transient federate that is joining again, or a hot swap, then + // check that the connection information did not change.
federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* temp_fed = NULL; + // During a hot swap, the received structure always goes into hot_swap_federate, so + // that the registered instance (the reference) is never overwritten. + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else if (lf_tag_compare(fed->effective_start_tag, NEVER_TAG) != 0) { + temp_fed = (federate_info_t*)calloc(1, sizeof(federate_info_t)); + initialize_federate(temp_fed, fed_id); + fed = temp_fed; + } // Read the number of upstream and downstream connections fed->enclave.num_immediate_upstreams = extract_int32(&(connection_info_header[1])); fed->enclave.num_immediate_downstreams = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); @@ -1248,6 +1822,46 @@ static int receive_connection_information(net_abstraction_t fed_net, uint16_t fe free(connections_info_body); } + + // NOTE: In this design, changes in the connections are not allowed. This means that the first + // instance to join __is__ the reference. If this policy is to be changed, then it is in + // the following lines will be updated accordingly. + if (hot_swap_in_progress || temp_fed != NULL) { + if (temp_fed == NULL) { + temp_fed = hot_swap_federate; + } + // temp_fed now holds the structure that was just received. Point fed back to the + // instance registered in the RTI (the reference); otherwise fed and temp_fed alias + // the same object and the comparison below can never reject. + fed = GET_FED_INFO(fed_id); + // Now, compare the previous and the new neighborhood structure + // Start with the number of upstreams and downstreams + bool reject = false; + if ((fed->enclave.num_immediate_upstreams != temp_fed->enclave.num_immediate_upstreams) || + (fed->enclave.num_immediate_downstreams != temp_fed->enclave.num_immediate_downstreams)) { + reject = true; + } else { + // Then check all upstreams and their delays + for (int i = 0; i < fed->enclave.num_immediate_upstreams; i++) { + if ((fed->enclave.immediate_upstreams[i] != temp_fed->enclave.immediate_upstreams[i]) || + (fed->enclave.immediate_upstream_delays[i] != temp_fed->enclave.immediate_upstream_delays[i])) { + reject = true; + break; + } + } + if (!reject) { + // Finally, check all downstream federates + for (int i = 0; i < fed->enclave.num_immediate_downstreams; i++) { + if (fed->enclave.immediate_downstreams[i] !=
temp_fed->enclave.immediate_downstreams[i]) { + reject = true; + break; + } + } + } + } + if (reject) { + if (temp_fed != hot_swap_federate) { + free(temp_fed); + } + return 0; + } + } } LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); return 1; @@ -1280,7 +1894,12 @@ static int receive_udp_message_and_set_up_clock_sync(net_abstraction_t fed_net, send_reject(fed_net, UNEXPECTED_MESSAGE); return 0; } else { - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t* fed; + if (hot_swap_in_progress) { + fed = hot_swap_federate; + } else { + fed = GET_FED_INFO(fed_id); + } if (rti_remote->clock_sync_global_status >= clock_sync_init) { // If no initial clock sync, no need perform initial clock sync. uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); @@ -1406,8 +2025,8 @@ static bool authenticate_federate(net_abstraction_t fed_net) { } #endif -void lf_connect_to_federates(net_abstraction_t rti_net) { - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { +void lf_connect_to_persistent_federates(net_abstraction_t rti_net) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes - rti_remote->number_of_transient_federates; i++) { net_abstraction_t fed_net = accept_net(rti_net); if (fed_net == NULL) { lf_print_warning("RTI failed to accept the federate."); @@ -1438,13 +2057,21 @@ void lf_connect_to_federates(net_abstraction_t rti_net) { // synchronization messages. federate_info_t* fed = GET_FED_INFO(fed_id); lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + + // If the federate is transient, then do not count it. + if (fed->is_transient) { + rti_remote->number_of_connected_transient_federates++; + assert(rti_remote->number_of_connected_transient_federates <= rti_remote->number_of_transient_federates); + i--; + lf_print("RTI: Transient federate %d joined.", fed->enclave.id); + } } else { // Received message was rejected. Try again. i--; } } // All federates have connected. 
- LF_PRINT_DEBUG("All federates have connected to RTI."); + LF_PRINT_DEBUG("All persistent federates have connected to RTI."); if (rti_remote->clock_sync_global_status >= clock_sync_on) { // Create the thread that performs periodic PTP clock synchronization sessions @@ -1464,6 +2091,178 @@ void lf_connect_to_federates(net_abstraction_t rti_net) { } } +/** + * @brief A request for immediate stop to the federate + * + * @param fed: the deferate to stop + */ +void send_stop(federate_info_t* fed) { + // Reply with a stop granted to all federates + unsigned char outgoing_buffer[MSG_TYPE_STOP_LENGTH]; + outgoing_buffer[0] = MSG_TYPE_STOP; + lf_print("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(send_STOP, fed->enclave.id, NULL); + } + write_to_net_fail_on_error(fed->net, MSG_TYPE_STOP_LENGTH, outgoing_buffer, NULL, + "RTI failed to send MSG_TYPE_STOP message to federate %d.", fed->enclave.id); + + LF_PRINT_LOG("RTI sent MSG_TYPE_STOP to federate %d.", fed->enclave.id); +} + +void* lf_connect_to_transient_federates_thread(void* nothing) { + // This loop will continue to accept connections of transient federates, as soon as there is room, or enable hot swap + while (!rti_remote->all_persistent_federates_exited) { + // Continue waiting for an incoming connection requests from transients to join, or for hot swap. + // Wait for an incoming connection request. + net_abstraction_t fed_net = accept_net(rti_remote->rti_net); + if(fed_net == NULL){ + return NULL; + } + +// Wait for the first message from the federate when RTI -a option is on. 
+#ifdef __RTI_AUTH__ + if (rti_remote->authentication_enabled) { + if (!authenticate_federate(fed_net)) { + lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the network abstraction + shutdown_net(fed_net, false); + continue; + } + } +#endif + + // The first message from the federate should contain its ID and the federation ID. + // The function also detects if a hot swap request is initiated. + int32_t fed_id = receive_and_check_fed_id_message(fed_net); + + if (fed_id >= 0 && receive_connection_information(fed_net, (uint16_t)fed_id) && + receive_udp_message_and_set_up_clock_sync(fed_net, (uint16_t)fed_id)) { + LF_MUTEX_LOCK(&rti_mutex); + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap confirmed for federate %d.", fed_id); + + // Then send STOP + federate_info_t* fed_old = GET_FED_INFO(fed_id); + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + + LF_PRINT_LOG("RTI: Send MSG_TYPE_STOP to old federate %d.", fed_id); + send_stop(fed_old); + LF_MUTEX_UNLOCK(&rti_mutex); + + // Wait for the old federate to send MSG_TYPE_RESIGN + LF_PRINT_LOG("RTI: Waiting for old federate %d to send resign.", fed_id); + // hot_swap_old_resigned is written by the old federate's thread while it holds + // rti_mutex, so it is read under the same mutex here. Sleeping between the checks + // keeps the wait from spinning at full speed. + // FIXME: Replace the polling with a lf_cond_wait on a dedicated condition variable. + LF_MUTEX_LOCK(&rti_mutex); + while (!hot_swap_old_resigned) { + LF_MUTEX_UNLOCK(&rti_mutex); + lf_sleep(MSEC(1)); + LF_MUTEX_LOCK(&rti_mutex); + } + + // The latest LTC is the tag at which the old federate resigned. This is useful + // for computing the effective_start_time of the new joining federate. + // The old federate's thread resets its timing information before it signals, so + // only take over its completed tag if it is ahead of the one saved before the STOP. + if (lf_tag_compare(fed_old->enclave.completed, hot_swap_federate->enclave.completed) > 0) { + hot_swap_federate->enclave.completed = fed_old->enclave.completed; + } + LF_MUTEX_UNLOCK(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages.
+ lf_thread_create(&(hot_swap_federate->thread_id), federate_info_thread_TCP, hot_swap_federate); + + // Swap the instances while holding rti_mutex: the federate threads take that mutex + // before they look up federates in scheduling_nodes[]. + lf_mutex_lock(&rti_mutex); + + // Redirect the federate in rti_remote + rti_remote->base.scheduling_nodes[fed_id] = (scheduling_node_t*)hot_swap_federate; + + // Assign the transient federates info required to launch the transient from the old federate to the hot swap federate + hot_swap_federate->transient_launch_name = fed_old->transient_launch_name; + hot_swap_federate->transient_launch_ip = fed_old->transient_launch_ip; + hot_swap_federate->transient_launch_binary_path = fed_old->transient_launch_binary_path; + hot_swap_federate->transient_launch_sst_config_path = fed_old->transient_launch_sst_config_path; + hot_swap_federate->transient_launch_user = fed_old->transient_launch_user; + // Free the old federate memory and reset the hot swap indicators. + // The old instance is no longer reachable through scheduling_nodes[], so its tag + // queue has to be released here as well. + // FIXME: Is this enough to free the memory allocated to the federate? + pqueue_tag_free(fed_old->in_transit_message_tags); + free(fed_old); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + lf_print("RTI: Hot swap succeeded for federate %d.", fed_id); + } else { + lf_mutex_unlock(&rti_mutex); + + // Create a thread to communicate with the federate. + // This has to be done after clock synchronization is finished + // or that thread may end up attempting to handle incoming clock + // synchronization messages.
+ federate_info_t* fed = GET_FED_INFO(fed_id); + lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); + lf_print("RTI: Transient federate %d joined.", fed_id); + } + // The federate threads decrement this counter while holding rti_mutex, + // so it has to be incremented under the same mutex. + lf_mutex_lock(&rti_mutex); + rti_remote->number_of_connected_transient_federates++; + lf_mutex_unlock(&rti_mutex); + } else { + // If a hot swap was initiated, but the connection information or/and clock + // synchronization fail, then reset hot_swap_in_progress, and free the memory + // allocated for hot_swap_federate + if (hot_swap_in_progress) { + lf_print("RTI: Hot swap canceled for federate %d.", fed_id); + lf_mutex_lock(&rti_mutex); + hot_swap_in_progress = false; + lf_mutex_unlock(&rti_mutex); + + // Release the tag queue created by initialize_federate() before the struct itself, + // and clear the pointer so that it cannot be used after the free. + // FIXME: Is this enough to free the memory of a federate_info_t data structure? + pqueue_tag_free(hot_swap_federate->in_transit_message_tags); + free(hot_swap_federate); + hot_swap_federate = NULL; + } + } + } + return NULL; +} + +/** + * @brief Thread that manages the delayed grants using a priority queue. + * + * This thread is responsible for managing the priority queue of delayed grants to be issued. + * It waits until the current time matches the highest priority tag time in the queue. + * If reached, it notifies the grant immediately. If, however, the current time has not yet + * reached the highest priority tag and the queue has been updated (either by inserting or + * canceling an entry), the thread stops waiting and restarts the process again. + */ +static void* lf_delayed_grants_thread(void* nothing) { + initialize_lf_thread_id(); + // Hold the mutex when not waiting. + LF_MUTEX_LOCK(&rti_mutex); + while (!rti_remote->all_federates_exited) { + if (pqueue_delayed_grants_size(rti_remote->delayed_grants) > 0) { + // Do not pop, but rather peek. + pqueue_delayed_grant_element_t* next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + instant_t next_time = next->base.tag.time; + // Wait for expiration, or a signal to stop or terminate. + int ret = lf_clock_cond_timedwait(&updated_delayed_grants, next_time); + if (ret == LF_TIMEOUT) { + // Time reached to send the grant.
+ // However, the grant may have been canceled while we were waiting. + pqueue_delayed_grant_element_t* new_next = pqueue_delayed_grants_peek(rti_remote->delayed_grants); + if (next == new_next) { + pqueue_delayed_grants_pop(rti_remote->delayed_grants); + federate_info_t* fed = GET_FED_INFO(next->fed_id); + if (next->is_provisional) { + notify_provisional_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } else { + notify_tag_advance_grant_immediate(&(fed->enclave), next->base.tag); + } + free(next); + } + } else if (ret != 0) { + // An error occurred. + lf_print_error_and_exit("lf_delayed_grants_thread: lf_clock_cond_timedwait failed with code %d.", ret); + } + } else if (pqueue_delayed_grants_size(rti_remote->delayed_grants) == 0) { + // Wait for something to appear on the queue. + lf_cond_wait(&updated_delayed_grants); + } + } + // Free any delayed grants that are still on the queue. + pqueue_delayed_grants_free(rti_remote->delayed_grants); + LF_MUTEX_UNLOCK(&rti_mutex); + return NULL; +} + void* respond_to_erroneous_connections(void* nothing) { initialize_lf_thread_id(); while (true) { @@ -1481,7 +2280,7 @@ void* respond_to_erroneous_connections(void* nothing) { lf_print_error("RTI received an unexpected connection request. Federation is running."); unsigned char response[2]; response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; + response[1] = (unsigned char)FEDERATION_ID_DOES_NOT_MATCH; // Ignore errors on this response. 
if (write_to_net(fed_net, 2, response)) { lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); @@ -1497,6 +2296,29 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { fed->requested_stop = false; fed->clock_synchronization_enabled = true; fed->in_transit_message_tags = pqueue_tag_init(10); + fed->has_upstream_transient_federates = false; + fed->is_transient = true; + fed->effective_start_tag = NEVER_TAG; +} + +void reset_transient_federate(federate_info_t* fed) { + // Reset all the timing information from the previous run + fed->enclave.completed = NEVER_TAG; + fed->enclave.last_granted = NEVER_TAG; + fed->enclave.last_provisionally_granted = NEVER_TAG; + fed->enclave.next_event = NEVER_TAG; + // Reset of the federate-related attributes + fed->net = NULL; // No socket. + fed->clock_synchronization_enabled = true; + // FIXME: The following two lines can be improved? + pqueue_tag_free(fed->in_transit_message_tags); + fed->in_transit_message_tags = pqueue_tag_init(10); + fed->requested_stop = false; + fed->effective_start_tag = NEVER_TAG; + // Whenever a transient resigns or leaves, invalidate all federates, so that all min_delays_upstream + // get re-computed. + // FIXME: Maybe optimize it to only invalidate those affected by the transient + invalidate_min_delays(); } int start_rti_server() { @@ -1521,28 +2343,113 @@ int start_rti_server() { return 0; } +/** + * Iterate over the federates and set 'has_upstream_transient_federates'. + * Once done, check that no transient federate has an upstream transient federate, + * and compute the number of persistent federates that do have upstream transients, + * which is the maximum number of delayed grants that can be pending at the same time. + * This is useful for initializing the queue of delayed grants.
+ + * @return -1, if there is more than one level of transiency, else, the number of + * persistents that have an upstream transient + */ +static int set_has_upstream_transient_federates_parameter_and_check() { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + for (int j = 0; j < fed->enclave.num_immediate_upstreams; j++) { + federate_info_t* upstream_fed = GET_FED_INFO(fed->enclave.immediate_upstreams[j]); + if (upstream_fed->is_transient) { + fed->has_upstream_transient_federates = true; + break; + } + } + } + + // Now check that no transient has an upstream transient + // FIXME: Do we really need this? Or should it be the job of the validator? + uint16_t max_number_of_delayed_grants = 0; + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient && fed->has_upstream_transient_federates) { + return -1; + } + if (!fed->is_transient && fed->has_upstream_transient_federates) { + max_number_of_delayed_grants++; + } + } + return max_number_of_delayed_grants; +} + void wait_for_federates() { - // Wait for connections from federates and create a thread for each. - lf_connect_to_federates(rti_remote->rti_net); + // Wait for connections from persistent federates and create a thread for each. + lf_connect_to_persistent_federates(rti_remote->rti_net); +// void wait_for_federates() { +// // Wait for connections from federates and create a thread for each. 
+// lf_connect_to_federates(rti_remote->rti_net); + + // Set has_upstream_transient_federates parameter in all federates and check + // that there is no more than one level of transiency + if (rti_remote->number_of_transient_federates > 0) { + int max_number_of_pending_grants = set_has_upstream_transient_federates_parameter_and_check(); + if (max_number_of_pending_grants == -1) { + lf_print_error_and_exit("RTI: Transient federates cannot have transient upstreams!"); + } + rti_remote->delayed_grants = pqueue_delayed_grants_init(max_number_of_pending_grants); + } - // All federates have connected. - lf_print("RTI: All expected federates have connected. Starting execution."); + // All persistent federates have connected. + lf_print("RTI: All expected persistent federates have connected. Starting execution."); + if (rti_remote->number_of_transient_federates > 0) { + lf_print("RTI: Transient Federates can join and leave the federation at anytime."); + } + // The socket server will only continue to accept connections from transient + // federates. // The network abstraction server will not continue to accept connections after all the federates // have joined. // In case some other federation's federates are trying to join the wrong // federation, need to respond. Start a separate thread to do that. lf_thread_t responder_thread; - lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + lf_thread_t transient_thread; + lf_thread_t delayed_grants_thread; - // Wait for federate threads to exit. + // If the federation does not include transient federates, then respond to + // erronous connections. Otherwise, continue to accept transients joining and + // respond to duplicate joing requests. 
+ if (rti_remote->number_of_transient_federates == 0) { + lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); + } else if (rti_remote->number_of_transient_federates > 0) { + lf_thread_create(&transient_thread, lf_connect_to_transient_federates_thread, NULL); + lf_thread_create(&delayed_grants_thread, lf_delayed_grants_thread, NULL); + } + + // Wait for persistent federate threads to exit. void* thread_exit_status; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t* fed = GET_FED_INFO(i); - lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); - lf_thread_join(fed->thread_id, &thread_exit_status); - pqueue_tag_free(fed->in_transit_message_tags); - lf_print("RTI: Federate %d thread exited.", fed->enclave.id); + if (!fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Persistent federate %d thread exited.", fed->enclave.id); + } + } + + rti_remote->all_persistent_federates_exited = true; + rti_remote->phase = shutdown_phase; + lf_print("RTI: All persistent threads exited."); + + // Wait for transient federate threads to exit, if any. 
+ if (rti_remote->number_of_transient_federates > 0) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + federate_info_t* fed = GET_FED_INFO(i); + if (fed->is_transient) { + lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); + lf_thread_join(fed->thread_id, &thread_exit_status); + pqueue_tag_free(fed->in_transit_message_tags); + lf_print("RTI: Transient federate %d thread exited.", fed->enclave.id); + } + } } rti_remote->all_federates_exited = true; @@ -1566,6 +2473,7 @@ void initialize_RTI(rti_remote_t* rti) { init_shutdown_mutex(); LF_COND_INIT(&received_start_times, &rti_mutex); LF_COND_INIT(&sent_start_time, &rti_mutex); + LF_COND_INIT(&updated_delayed_grants, &rti_mutex); initialize_rti_common(&rti_remote->base); rti_remote->base.mutex = &rti_mutex; @@ -1585,6 +2493,8 @@ void initialize_RTI(rti_remote_t* rti) { rti_remote->base.tracing_enabled = false; rti_remote->base.dnet_disabled = false; rti_remote->stop_in_progress = false; + rti_remote->number_of_transient_federates = 0; + rti_remote->phase = startup_phase; } // The RTI includes clock.c, which requires the following functions that are defined @@ -1596,6 +2506,7 @@ void clock_sync_subtract_offset(instant_t* t) { (void)t; } void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { invalidate_min_delays(); for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { + // FIXME: Gives error freeing memory not allocated!!!! 
scheduling_node_t* node = scheduling_nodes[i]; if (node->immediate_upstreams != NULL) { free(node->immediate_upstreams);
@@ -1609,4 +2520,173 @@ void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number
   free(scheduling_nodes);
 }
 
+#ifdef COMM_TYPE_SST
+void handle_key_refresh_request(federate_info_t* fed) {
+  // Read the 8-byte ID of the new session key sent by the federate.
+  unsigned char key_id[8];
+  read_from_net_fail_on_error(fed->net, 8, key_id, NULL);
+
+  fetch_pending_session_key(fed->net, key_id);
+  lf_print("New session key fetched and stored in the pending key field");
+
+  // Acknowledge the request and switch to the new key while holding the RTI mutex.
+  LF_MUTEX_LOCK(&rti_mutex);
+  send_key_refresh_request(fed->net, MSG_TYPE_SST_KEY_ACK);
+  swap_to_pending_key(fed->net);
+  LF_MUTEX_UNLOCK(&rti_mutex);
+
+  lf_print("Key ID swap complete");
+}
+#endif
+
+/**
+ * @brief Check that a string can be placed unquoted in a shell command line.
+ *
+ * The string must be non-NULL and non-empty, must not start with '-' (so that it cannot be
+ * mistaken for a command-line option), and must consist only of letters, digits, and the
+ * characters "_-./~:@+," so that the shell cannot interpret any part of it as syntax.
+ *
+ * @param word The string to check (may be NULL).
+ * @return true if the string is safe to use, false otherwise.
+ */
+static bool is_safe_shell_word(const char* word) {
+  static const char allowed[] = "abcdefghijklmnopqrstuvwxyz"
+                                "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                "0123456789"
+                                "_-./~:@+,";
+  return word != NULL && word[0] != '\0' && word[0] != '-' && strspn(word, allowed) == strlen(word);
+}
+
+/**
+ * Launch the transient federate whose ID is read from the connection of the requesting federate.
+ * The launch settings are those stored on that federate by parse_transient_federate_config().
+ * The launch is refused, with an error message, if the ID is out of range or if a required
+ * setting is missing or contains characters that are not safe in a shell command line.
+ */
+void handle_transient_launch_request(federate_info_t* fed) {
+  unsigned char fed_id[sizeof(uint16_t)];
+  read_from_net_fail_on_error(fed->net, sizeof(uint16_t), fed_id, NULL);
+  uint16_t transient_fed_id = extract_uint16(fed_id);
+
+  // The ID arrives over the network, so range-check it before using it as an index.
+  if ((int32_t)transient_fed_id >= rti_remote->base.number_of_scheduling_nodes) {
+    lf_print_error("RTI received a launch request for unknown federate %d.", transient_fed_id);
+    return;
+  }
+  federate_info_t* transient_federate = GET_FED_INFO(transient_fed_id);
+
+  const char* user = transient_federate->transient_launch_user;
+  const char* ip = transient_federate->transient_launch_ip;
+  const char* binary_path = transient_federate->transient_launch_binary_path;
+  const char* sst_config_path = transient_federate->transient_launch_sst_config_path;
+  const char* federation_id = rti_remote->federation_id;
+  bool is_local = (ip == NULL || strcmp(ip, "localhost") == 0);
+
+  // Every one of these strings ends up in a command line that a shell interprets. Refuse to
+  // launch if a required one is missing (the config file may not mention this federate) or
+  // contains characters that a shell would treat as syntax.
+  if (!is_safe_shell_word(binary_path) || !is_safe_shell_word(sst_config_path) ||
+      !is_safe_shell_word(federation_id) ||
+      (!is_local && (!is_safe_shell_word(user) || !is_safe_shell_word(ip)))) {
+    lf_print_error("RTI cannot launch transient federate %d: missing or unsafe launch configuration.",
+                   transient_fed_id);
+    return;
+  }
+
+  char cmd[4096];
+  int length;
+  if (is_local) {
+    // The transient federate is on the same machine as the RTI, so run it directly.
+    length = snprintf(cmd, sizeof(cmd), "nohup %s -i %s -sst %s >federate_%d.log 2>&1 &", binary_path,
+                      federation_id, sst_config_path, transient_fed_id);
+  } else {
+    // Launch the transient federate remotely via SSH. The command changes to the
+    // LinguaFrancaRemote working directory, then starts the federate binary with
+    // nohup so it survives SSH disconnect. The -sst flag passes the SST security
+    // config, and both stdout and stderr are redirected to a per-federate log file.
+    // The trailing '&' runs the process in the background so the SSH session exits
+    // immediately after spawning it.
+    length = snprintf(cmd, sizeof(cmd),
+                      "ssh %s@%s 'mkdir -p ~/LinguaFrancaRemote/TransientFederateLog && cd ~/LinguaFrancaRemote && nohup %s -i %s -sst %s >~/LinguaFrancaRemote/TransientFederateLog/federate_%d.log 2>&1 &'",
+                      user, ip, binary_path, federation_id, sst_config_path, transient_fed_id);
+  }
+  // snprintf returns the length it needed; a command that did not fit must not be run truncated.
+  if (length < 0 || (size_t)length >= sizeof(cmd)) {
+    lf_print_error("RTI cannot launch transient federate %d: launch command is too long.", transient_fed_id);
+    return;
+  }
+
+  if (system(cmd) != 0) {
+    lf_print_error("RTI failed to launch transient federate %d.", transient_fed_id);
+  }
+}
+
+/**
+ * Read the key=value transient federate config file at file_path and store each launch
+ * setting on the federate selected by the most recent valid federateId line. Does nothing
+ * if file_path is NULL; reports an error and returns if the file cannot be opened.
+ */
+void parse_transient_federate_config(const char* file_path) {
+  if (file_path == NULL) {
+    return;
+  }
+
+  FILE* file_pointer = fopen(file_path, "r");
+  if (file_pointer == NULL) {
+    lf_print_error("Error opening the transient config file");
+    return;
+  }
+
+  char line[4096];
+  // Federate selected by the most recent valid federateId line; NULL until one has been seen.
+  federate_info_t* federate_instance = NULL;
+  while (fgets(line, sizeof(line), file_pointer) != NULL) {
+    // Strip the line terminator, including the '\r' of a CRLF-terminated file.
+    line[strcspn(line, "\r\n")] = '\0';
+    // Each setting has the form key=value; lines without '=' are ignored.
+    char* equals = strchr(line, '=');
+    if (equals == NULL) {
+      continue;
+    }
+    char* value = equals + 1;
+    *equals = '\0';
+
+    if (strcmp(line, "federateId") == 0) {
+      errno = 0;
+      char* endptr;
+      long id_long = strtol(value, &endptr, 10);
+      bool has_digits = (endptr != value);
+      while (*endptr == ' ' || *endptr == '\t') {
+        endptr++; // Tolerate trailing blanks after the number.
+      }
+
+      // Reject non-numeric values, trailing garbage, out-of-range numbers, and IDs that are
+      // not below the number of scheduling nodes, the bound used for GET_FED_INFO elsewhere.
+      if (errno != 0 || !has_digits || *endptr != '\0' || id_long < 0 || id_long > UINT16_MAX ||
+          id_long >= rti_remote->base.number_of_scheduling_nodes) {
+        lf_print_error("Invalid federateID: %s", value);
+        // Forget the previous federate so that the settings that follow are not stored on it.
+        federate_instance = NULL;
+        continue;
+      }
+
+      federate_instance = GET_FED_INFO((uint16_t)id_long);
+    } else if (federate_instance == NULL) {
+      // Settings that are not preceded by a valid federateId have no federate to attach to.
+      continue;
+    } else if (strcmp(line, "federateName") == 0) {
+      federate_instance->transient_launch_name = strdup(value);
+    } else if (strcmp(line, "federateIp") == 0) {
+      federate_instance->transient_launch_ip = strdup(value);
+    } else if (strcmp(line, "federateLaunchPath") == 0) {
+      federate_instance->transient_launch_binary_path = strdup(value);
+    } else if (strcmp(line, "federateSSTPath") == 0) {
+      federate_instance->transient_launch_sst_config_path = strdup(value);
+    } else if (strcmp(line, "federateUser") == 0) {
+      federate_instance->transient_launch_user = strdup(value);
+    }
+  }
+  fclose(file_pointer);
+}
+
 #endif // STANDALONE_RTI
diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h
index 58ac3b60a..df87ca6ec 100644
--- a/core/federated/RTI/rti_remote.h
+++ b/core/federated/RTI/rti_remote.h
@@ -62,6 +62,27 @@ typedef struct federate_info_t {
   /** @brief Record of in-transit messages to this federate that are not yet processed. This record is ordered based on
    * the time value of each message for a more efficient access. */
   pqueue_tag_t* in_transit_message_tags;
+  /** @brief Indicates whether the federate has upstream transient federates */
+  bool has_upstream_transient_federates;
+  /** @brief Indicates whether the federate is transient or persistent. */
+  bool is_transient;
+  /** @brief Records the start time of the federate, which is mainly useful for transient federates */
+  tag_t effective_start_tag;
+
+  /** These fields are only initialised to a value if the federate is transient. These fields are used by the rti to launch a
+   * transient federate on a remote machine using ssh from rti's machine.
+   */
+
+  /** @brief User name of the account on the remote machine that the RTI logs into over ssh */
+  char* transient_launch_user;
+  /** @brief IP address for the transient federate remote machine */
+  char* transient_launch_ip;
+  /** @brief Binary file path for the transient federate on the remote machine where it will be launched */
+  char* transient_launch_binary_path;
+  /** @brief SST config file path for the transient federate on the remote machine */
+  char* transient_launch_sst_config_path;
+  /** @brief Name of the transient federate */
+  char* transient_launch_name;
 } federate_info_t;
 
 /**
@@ -70,6 +91,29 @@ typedef struct federate_info_t { */ typedef enum clock_sync_stat { clock_sync_off, clock_sync_init, clock_sync_on } clock_sync_stat; +/** + * The federation life cycle phases.
+ */ +typedef enum federation_life_cycle_phase { + startup_phase, // Not all persistent federates have joined. + execution_phase, // All persistent federates have joined. + shutdown_phase // Federation is shutting down. +} federation_life_cycle_phase; + +/** + * @brief The type for an element in a delayed grants priority queue that is sorted by tag. + */ +typedef struct pqueue_delayed_grant_element_t { + pqueue_tag_element_t base; + uint16_t fed_id; // Id of the federate with delayed grant of tag (in base) + bool is_provisional; // Boolean recoding if the delayed grant is provisional +} pqueue_delayed_grant_element_t; + +/** + * @brief Type of a delayed grants queue sorted by tags. + */ +typedef pqueue_tag_t pqueue_delayed_grants_t; + /** * @brief Structure that an RTI instance uses to keep track of its own and its * corresponding federates' state. @@ -104,6 +148,16 @@ typedef struct rti_remote_t { */ volatile bool all_federates_exited; + /** + * @brief Boolean indicating that all persistent federates have exited. + * + * This gets set to true exactly once before the program waits for + * persistent federates, then exits. + * It is marked volatile because the write is not guarded by a mutex. + * The main thread makes this true. + */ + volatile bool all_persistent_federates_exited; + /** * @brief The ID of the federation that this RTI will supervise. * @@ -149,6 +203,27 @@ typedef struct rti_remote_t { /** @brief Boolean indicating that a stop request is already in progress. */ bool stop_in_progress; + + /** + * Number of transient federates + */ + int32_t number_of_transient_federates; + + /** + * Number of connected transient federates + */ + int32_t number_of_connected_transient_federates; + + /** + * Indicates the life cycle phase of the federation. + */ + federation_life_cycle_phase phase; + + /** + * Queue of the pending grants, in case transient federates are absent and + * issuing grants to their downstreams need to be delayed. 
+ */ + pqueue_delayed_grants_t* delayed_grants; } rti_remote_t; extern int lf_critical_section_enter(environment_t* env); @@ -272,7 +347,7 @@ void handle_address_query(uint16_t fed_id); * field of the _RTI.federates[federate_id] array of structs. * * The server_hostname and server_ip_addr fields are assigned - * in lf_connect_to_federates() upon accepting the socket + * in lf_connect_to_persistent_federates() upon accepting the socket * from the remote federate. * * This function assumes the caller does not hold the mutex. @@ -354,6 +429,14 @@ void* federate_info_thread_TCP(void* fed); */ void send_reject(net_abstraction_t net_abs, unsigned char error_code); +/** + * Thread to wait for incoming connection request from transient federates. + * Upon receiving the connection request, check if a hot swap should start or + * simply create a thread to communicate with that federate. + * Stops if all persistent federates exited. + */ +void* lf_connect_to_transient_federates_thread(void* nothing); + /** * @brief Wait for one incoming connection request from each federate, * and, upon receiving it, create a thread to communicate with that federate. @@ -384,6 +467,14 @@ void* respond_to_erroneous_connections(void* nothing); */ void initialize_federate(federate_info_t* fed, uint16_t id); +/** + * @brief Reset the federate. The federate has to be transient. + * @ingroup RTI + * + * @param fed A pointer to the federate + */ +void reset_transient_federate(federate_info_t* fed); + /** * @brief Start the socket server for the runtime infrastructure (RTI) and return the socket descriptor. * @ingroup RTI @@ -439,6 +530,41 @@ int process_args(int argc, const char* argv[]); * @param rti The RTI instance to initialize. */ void initialize_RTI(rti_remote_t* rti); +/** + * @brief Handle a session key refresh request from a federate. 
+ * + * Reads the new key ID sent by the federate, fetches the corresponding + * session key from the SST context, acknowledges the request with + * MSG_TYPE_SST_KEY_ACK, and swaps the new key into the active slot. + * The key swap is performed under the RTI mutex to prevent concurrent + * access during the transition. + * + * This is called when the RTI receives a MSG_TYPE_SST_KEY_REFRESH_REQUEST + * message from a federate. + * + * @param fed The federate that initiated the key refresh. + */ +void handle_key_refresh_request(federate_info_t* fed); + +/** + * @brief Handle transient federate launch request + * + * This function is called when a message type MSG_TYPE_TRANSIENT_LAUNCH_REQUEST is received by the + * RTI. The function reads the federate id that was requested to be launched. Federate information is + * retrived using the GET_FED_INFO(id) command to retrieve the transient federate fields required by the RTI + * to SSH into the federates machine and the other data required to locate and launch the federate + */ +void handle_transient_launch_request(federate_info_t* fed); + +/** + * @brief Parse the transient federate config file which contains information on all transient + * federates present in the federation required to launch them + * + * Read the config file to get information on the transient federates and store + * the name, host address, remote user where the transient federate is deployed, + * launch file path and the sst config in the federate's instance fields + */ +void parse_transient_federate_config(const char* file_path); #endif // RTI_REMOTE_H #endif // STANDALONE_RTI diff --git a/core/federated/federate.c b/core/federated/federate.c index 1b5458378..a0136d88c 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -37,8 +37,13 @@ #include // For HMAC-based authentication of federates. 
#endif +// Global variable for synchronizing read and write when session key is being refreshed +#ifdef COMM_TYPE_SST +lf_cond_t lf_rekey_completed; +#endif // Global variables defined in tag.c: extern instant_t start_time; +extern tag_t effective_start_tag; // Global variable defined in reactor_common.c: extern bool _lf_termination_executed; @@ -85,7 +90,8 @@ federate_instance_t _fed = {.number_of_inbound_p2p_connections = 0, .last_sent_LTC = {.time = NEVER, .microstep = 0u}, .last_sent_NET = {.time = NEVER, .microstep = 0u}, .last_skipped_NET = {.time = NEVER, .microstep = 0u}, - .min_delay_from_physical_action_to_federate_output = NEVER}; + .min_delay_from_physical_action_to_federate_output = NEVER, + .is_transient = false}; federation_metadata_t federation_metadata = { .federation_id = "Unidentified Federation", .rti_host = NULL, .rti_port = -1, .rti_user = NULL}; @@ -151,6 +157,8 @@ extern interval_t _lf_action_delay_table[]; extern size_t _lf_action_table_size; extern lf_action_base_t* _lf_zero_delay_cycle_action_table[]; extern size_t _lf_zero_delay_cycle_action_table_size; +extern uint16_t _lf_zero_delay_cycle_upstream_ids[]; +extern bool _lf_zero_delay_cycle_upstream_disconnected[]; extern reaction_t* network_input_reactions[]; extern size_t num_network_input_reactions; extern reaction_t* port_absent_reaction[]; @@ -176,7 +184,7 @@ static lf_action_base_t* action_for_port(int port_id) { /** * Update the last known status tag of all network input ports - * to the value of `tag`, unless that the provided `tag` is less + * to the value of `tag`, unless the provided `tag` is less * than the last_known_status_tag of the port. This is called when * a TAG signal is received from the RTI in centralized coordination. * If any update occurs, then this broadcasts on `lf_port_status_changed`. @@ -242,8 +250,8 @@ static void update_last_known_status_on_input_ports(tag_t tag, environment_t* en * * @param env The top-level environment, whose mutex is assumed to be held. 
* @param tag The tag on which the latest status of the specified network input port is known. + * @param port_id The port ID. * @param warn If true, print a warning if the tag is less than the last known status tag of the port. - * @param portID The port ID. */ static void update_last_known_status_on_input_port(environment_t* env, tag_t tag, int port_id, bool warn) { if (lf_tag_compare(tag, env->current_tag) < 0) @@ -307,13 +315,41 @@ static void mark_inputs_known_absent(int fed_id) { } /** - * Set the status of network port with id portID. + * @brief Update the last known status tag of a network input action. + * + * This function is similar to update_last_known_status_on_input_port, but + * it is called when a PTAG is granted and an upstream transient federate is not + * connected. It updates the last known status tag of the network input action + * so that it will not wait for a message or absent message from the upstream federate. + * + * This function assumes the caller holds the mutex on the top-level environment, + * and, if the tag actually increases, it broadcasts on `lf_port_status_changed`. + * + * @param env The top-level environment, whose mutex is assumed to be held. + * @param action The action associated with the network input port. + * @param tag The tag of the PTAG. 
+ */ +static void update_last_known_status_on_action(environment_t* env, lf_action_base_t* action, tag_t tag) { + if (lf_tag_compare(tag, env->current_tag) < 0) + tag = env->current_tag; + trigger_t* input_port_trigger = action->trigger; + if (lf_tag_compare(tag, input_port_trigger->last_known_status_tag) > 0) { + LF_PRINT_LOG("Updating the last known status tag of port for upstream absent transient federate from " PRINTF_TAG + " to " PRINTF_TAG ".", + input_port_trigger->last_known_status_tag.time - lf_time_start(), + input_port_trigger->last_known_status_tag.microstep, tag.time - lf_time_start(), tag.microstep); + input_port_trigger->last_known_status_tag = tag; + } +} + +/** + * Set the status of network port with id port_id. * - * @param portID The network port ID + * @param port_id The network port ID * @param status The network port status (port_status_t) */ -static void set_network_port_status(int portID, port_status_t status) { - lf_action_base_t* network_input_port_action = action_for_port(portID); +static void set_network_port_status(int port_id, port_status_t status) { + lf_action_base_t* network_input_port_action = action_for_port(port_id); network_input_port_action->trigger->status = status; } @@ -684,7 +720,7 @@ static int handle_port_absent_message(net_abstraction_t net, int fed_id) { tracepoint_federate_from_federate(receive_PORT_ABS, _lf_my_fed_id, fed_id, &intended_tag); } LF_PRINT_LOG("Handling port absent for tag " PRINTF_TAG " for port %hu of fed %d.", - intended_tag.time - lf_time_start(), intended_tag.microstep, port_id, fed_id); + intended_tag.time - lf_time_start(), intended_tag.microstep, port_id, _lf_my_fed_id); // Environment is always the one corresponding to the top-level scheduling enclave. 
environment_t* env; @@ -764,6 +800,9 @@ static void* listen_to_federates(void* _args) { net_closed = true; } break; + case MSG_TYPE_SST_KEY_REFRESH_REQUEST: + handle_key_refresh_request(net); + break; default: bad_message = true; } @@ -845,7 +884,7 @@ static int perform_hmac_authentication() { if (received[0] == MSG_TYPE_FAILED) { lf_print_error("RTI has failed."); return -1; - } else if (received[0] == MSG_TYPE_REJECT && received[1] == RTI_NOT_EXECUTED_WITH_AUTH) { + } else if (received[0] == MSG_TYPE_REJECT && received[1] == (unsigned char)RTI_NOT_EXECUTED_WITH_AUTH) { lf_print_error("RTI is not executed with HMAC option."); return -1; } else { @@ -868,7 +907,7 @@ static int perform_hmac_authentication() { lf_print_error("HMAC authentication failed."); unsigned char response[2]; response[0] = MSG_TYPE_REJECT; - response[1] = HMAC_DOES_NOT_MATCH; + response[1] = (unsigned char)HMAC_DOES_NOT_MATCH; // Ignore errors on writing back. write_to_net(_fed.net_to_RTI, 2, response); @@ -891,6 +930,44 @@ static int perform_hmac_authentication() { } #endif +/** + * @brief Handle message from the RTI that an upstream federate has connected. + * Reads the 16-bit federate ID and clears the disconnected flag of every zero-delay-cycle entry with that upstream ID. + */ +static void handle_upstream_connected_message(void) { + size_t bytes_to_read = sizeof(uint16_t); + unsigned char buffer[bytes_to_read]; + read_from_net_fail_on_error(_fed.net_to_RTI, bytes_to_read, buffer, + "Failed to read upstream connected message from RTI."); + uint16_t connected = extract_uint16(buffer); + LF_PRINT_DEBUG("Received notification that upstream federate %d has connected", connected); + // Mark the upstream as connected. + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_ids[i] == connected) { + _lf_zero_delay_cycle_upstream_disconnected[i] = false; + } + } +} + +/** + * @brief Handle message from the RTI that an upstream federate has disconnected. 
+ * Reads the 16-bit federate ID and sets the disconnected flag of every zero-delay-cycle entry with that upstream ID. + */ +static void handle_upstream_disconnected_message(void) { + size_t bytes_to_read = sizeof(uint16_t); + unsigned char buffer[bytes_to_read]; + read_from_net_fail_on_error(_fed.net_to_RTI, bytes_to_read, buffer, + "Failed to read upstream disconnected message from RTI."); + uint16_t disconnected = extract_uint16(buffer); + LF_PRINT_DEBUG("Received notification that upstream federate %d has disconnected", disconnected); + // Mark the upstream as disconnected. + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_ids[i] == disconnected) { + _lf_zero_delay_cycle_upstream_disconnected[i] = true; + } + } +} + /** * Send the specified timestamp to the RTI and wait for a response. * The specified timestamp should be current physical time of the @@ -904,30 +981,50 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // Send the timestamp marker first. send_time(MSG_TYPE_TIMESTAMP, my_physical_time); - // Read bytes from the network abstraction. We need 9 bytes. + // Read bytes from the network abstraction. We need either 9 bytes or 21, depending on the federate type // Buffer for message ID plus timestamp. - size_t buffer_length = 1 + sizeof(instant_t); + size_t buffer_length = (_fed.is_transient) ? MSG_TYPE_TIMESTAMP_TAG_LENGTH : MSG_TYPE_TIMESTAMP_LENGTH; unsigned char buffer[buffer_length]; - read_from_net_fail_on_error(_fed.net_to_RTI, buffer_length, buffer, - "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); - LF_PRINT_DEBUG("Read 9 bytes."); - - // First byte received is the message ID. - if (buffer[0] != MSG_TYPE_TIMESTAMP) { - if (buffer[0] == MSG_TYPE_FAILED) { - lf_print_error_and_exit("RTI has failed."); + while (true) { + read_from_net_fail_on_error(_fed.net_to_RTI, 1, buffer, + "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); + // First byte received is the message ID. 
+ if (buffer[0] != MSG_TYPE_TIMESTAMP) { + if (buffer[0] == MSG_TYPE_FAILED) { + lf_print_error_and_exit("RTI has failed."); + } else if (buffer[0] == MSG_TYPE_UPSTREAM_CONNECTED) { + // We need to handle this message and continue waiting for MSG_TYPE_TIMESTAMP to arrive + handle_upstream_connected_message(); + continue; + } else if (buffer[0] == MSG_TYPE_UPSTREAM_DISCONNECTED) { + // We need to handle this message and continue waiting for MSG_TYPE_TIMESTAMP to arrive + handle_upstream_disconnected_message(); + continue; + } else { + lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", + buffer[0]); + } + } else { + read_from_net_fail_on_error(_fed.net_to_RTI, buffer_length - 1, buffer + 1, + "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); + break; } - lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", - buffer[0]); } + LF_PRINT_DEBUG("Read %zu bytes.", buffer_length); + instant_t timestamp = extract_int64(&(buffer[1])); + if (_fed.is_transient) { + effective_start_tag = extract_tag(&(buffer[9])); + } else { + effective_start_tag = (tag_t){.time = timestamp, .microstep = 0u}; + } - tag_t tag = {.time = timestamp, .microstep = 0}; - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(receive_TIMESTAMP, _lf_my_fed_id, &tag); - lf_print("Starting timestamp is: " PRINTF_TIME ".", timestamp); + // Trace the event when tracing is enabled. + // Note that we report in the trace the effective_start_tag. + // This is rather a choice. To be changed, if needed, of course. 
+ tracepoint_federate_from_rti(receive_TIMESTAMP, _lf_my_fed_id, &effective_start_tag); LF_PRINT_LOG("Current physical time is: " PRINTF_TIME ".", lf_time_physical()); return timestamp; @@ -943,7 +1040,7 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { * a notification of this update, which may unblock whichever worker * thread is trying to advance time. * - * @note This function is very similar to handle_provisinal_tag_advance_grant() except that + * @note This function is very similar to handle_provisional_tag_advance_grant() except that * it sets last_TAG_was_provisional to false. */ static void handle_tag_advance_grant(void) { @@ -1190,7 +1287,8 @@ static void* update_ports_from_staa_offsets(void* args) { * * @note This function is similar to handle_tag_advance_grant() except that * it sets last_TAG_was_provisional to true and also it does not update the - * last known tag for input ports. + * last known tag for input ports unless there is an upstream federate that is + * disconnected. */ static void handle_provisional_tag_advance_grant() { // Environment is always the one corresponding to the top-level scheduling enclave. @@ -1227,6 +1325,12 @@ static void handle_provisional_tag_advance_grant() { env->current_tag.time - start_time, env->current_tag.microstep, _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + for (size_t i = 0; i < _lf_zero_delay_cycle_action_table_size; i++) { + if (_lf_zero_delay_cycle_upstream_disconnected[i]) { + update_last_known_status_on_action(env, _lf_zero_delay_cycle_action_table[i], PTAG); + } + } + // Even if we don't modify the event queue, we need to broadcast a change // because we do not need to continue to wait for a TAG. lf_cond_broadcast(&env->event_q_changed); @@ -1326,6 +1430,20 @@ static void handle_stop_granted_message() { } } +/** + * @brief Handle a MSG_TYPE_STOP message from the RTI. + * + * This function simply calls lf_stop(). 
+ */ +void handle_stop() { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(receive_STOP, _lf_my_fed_id, NULL); + + lf_print("Received from RTI a MSG_TYPE_STOP at physical time " PRINTF_TIME ".", lf_time_physical()); + + lf_stop(); +} + /** * Handle a MSG_TYPE_STOP_REQUEST message from the RTI. */ @@ -1527,6 +1645,9 @@ static void* listen_to_rti_net(void* args) { case MSG_TYPE_STOP_GRANTED: handle_stop_granted_message(); break; + case MSG_TYPE_STOP: + handle_stop(); + break; case MSG_TYPE_PORT_ABSENT: if (handle_port_absent_message(_fed.net_to_RTI, -1)) { // Failures to complete the read of absent messages from the RTI are fatal. @@ -1539,10 +1660,19 @@ static void* listen_to_rti_net(void* args) { case MSG_TYPE_FAILED: handle_rti_failed_message(); break; + case MSG_TYPE_UPSTREAM_CONNECTED: + handle_upstream_connected_message(); + break; + case MSG_TYPE_UPSTREAM_DISCONNECTED: + handle_upstream_disconnected_message(); + break; case MSG_TYPE_CLOCK_SYNC_T1: case MSG_TYPE_CLOCK_SYNC_T4: lf_print_error("Federate %d received unexpected clock sync message from RTI.", _lf_my_fed_id); break; + case MSG_TYPE_SST_KEY_ACK: + handle_rti_session_key_ack(_fed.net_to_RTI); + break; default: lf_print_error_and_exit("Received from RTI an unrecognized message type: %hhx.", buffer[0]); // Trace the event when tracing is enabled @@ -1779,15 +1909,16 @@ void lf_connect_to_federate(uint16_t remote_federate_id) { break; } // Connect was successful. - size_t buffer_length = 1 + sizeof(uint16_t) + 1; + size_t buffer_length = 1 + sizeof(uint16_t) + 1 + 1; unsigned char buffer[buffer_length]; buffer[0] = MSG_TYPE_P2P_SENDING_FED_ID; if (_lf_my_fed_id == UINT16_MAX) { lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX - 1); } encode_uint16((uint16_t)_lf_my_fed_id, (unsigned char*)&(buffer[1])); + buffer[1 + sizeof(uint16_t)] = _fed.is_transient ? 
1 : 0; unsigned char federation_id_length = (unsigned char)strnlen(federation_metadata.federation_id, 255); - buffer[sizeof(uint16_t) + 1] = federation_id_length; + buffer[sizeof(uint16_t) + 2] = federation_id_length; // Trace the event when tracing is enabled tracepoint_federate_to_federate(send_FED_ID, _lf_my_fed_id, remote_federate_id, NULL); @@ -1859,7 +1990,7 @@ void lf_connect_to_rti(const char* hostname, int port) { while (!CHECK_TIMEOUT(start_connect, CONNECT_TIMEOUT) && !_lf_termination_executed) { // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Send a MSG_TYPE_FED_IDS or MSG_TYPE_TRANSIENT_FED_IDS message and wait for a reply. // Notify the RTI of the ID of this federate and its federation. #ifdef FEDERATED_AUTHENTICATED @@ -1876,9 +2007,14 @@ void lf_connect_to_rti(const char* hostname, int port) { LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); #endif + unsigned char buffer[5]; // Send the message type first. - unsigned char buffer[4]; - buffer[0] = MSG_TYPE_FED_IDS; + if (_fed.is_transient) { + buffer[0] = MSG_TYPE_TRANSIENT_FED_IDS; + } else { + buffer[0] = MSG_TYPE_FED_IDS; + } + // Next send the federate ID. if (_lf_my_fed_id == UINT16_MAX) { lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX - 1); @@ -1892,8 +2028,14 @@ void lf_connect_to_rti(const char* hostname, int port) { // Trace the event when tracing is enabled tracepoint_federate_to_rti(send_FED_ID, _lf_my_fed_id, NULL); + size_t size = 1 + sizeof(uint16_t) + 1; + if (_fed.is_transient) { + // Next send the federate type (persistent or transient) + buffer[2 + sizeof(uint16_t)] = _fed.is_transient ? 1 : 0; + size++; + } // No need for a mutex here because no other threads are writing to this network abstraction. 
- if (write_to_net(_fed.net_to_RTI, 2 + sizeof(uint16_t), buffer)) { + if (write_to_net(_fed.net_to_RTI, size, buffer)) { continue; // Try again, possibly on a new port. } @@ -1919,7 +2061,7 @@ void lf_connect_to_rti(const char* hostname, int port) { // Read one more byte to determine the cause of rejection. unsigned char cause; read_from_net_fail_on_error(_fed.net_to_RTI, 1, &cause, "Failed to read the cause of rejection by the RTI."); - if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { + if (cause == (unsigned char)FEDERATION_ID_DOES_NOT_MATCH || cause == (unsigned char)WRONG_SERVER) { lf_print_warning("Connected to the wrong RTI. Will try again"); continue; } @@ -1931,8 +2073,12 @@ void lf_connect_to_rti(const char* hostname, int port) { } else if (response == MSG_TYPE_RESIGN) { lf_print_warning("RTI resigned. Will try again"); continue; + } else if (response == MSG_TYPE_UPSTREAM_CONNECTED) { + handle_upstream_connected_message(); + } else if (response == MSG_TYPE_UPSTREAM_DISCONNECTED) { + handle_upstream_disconnected_message(); } else { - lf_print_warning("RTI gave unexpect response %u. Will try again", response); + lf_print_warning("RTI on port %d gave unexpected response %u. Will try again", port, response); continue; } } @@ -2024,7 +2170,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { } LF_PRINT_LOG("Accepted new connection from remote federate."); - size_t header_length = 1 + sizeof(uint16_t) + 1; + size_t header_length = 1 + sizeof(uint16_t) + 1 + 1; unsigned char buffer[header_length]; int read_failed = read_from_net(net, header_length, (unsigned char*)&buffer); if (read_failed || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { @@ -2034,7 +2180,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { // Wrong message received. 
unsigned char response[2]; response[0] = MSG_TYPE_REJECT; - response[1] = WRONG_SERVER; + response[1] = (unsigned char)WRONG_SERVER; // Trace the event when tracing is enabled tracepoint_federate_to_federate(send_REJECT, _lf_my_fed_id, -3, NULL); // Ignore errors on this response. @@ -2054,7 +2200,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { if (read_failed == 0) { unsigned char response[2]; response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; + response[1] = (unsigned char)FEDERATION_ID_DOES_NOT_MATCH; // Trace the event when tracing is enabled tracepoint_federate_to_federate(send_REJECT, _lf_my_fed_id, -3, NULL); // Ignore errors on this response. @@ -2066,7 +2212,12 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { // Extract the ID of the sending federate. uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); - LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); + bool remote_fed_is_transient = buffer[1 + sizeof(uint16_t)]; + if (remote_fed_is_transient) { + LF_PRINT_DEBUG("Received sending federate ID %d, which is transient.", remote_fed_id); + } else { + LF_PRINT_DEBUG("Received sending federate ID %d, which is persistent.", remote_fed_id); + } // Trace the event when tracing is enabled tracepoint_federate_to_federate(receive_FED_ID, _lf_my_fed_id, remote_fed_id, NULL); @@ -2542,6 +2693,21 @@ int lf_send_tagged_message(environment_t* env, interval_t additional_delay, int } } LF_MUTEX_UNLOCK(&lf_outbound_net_mutex); + #ifdef COMM_TYPE_SST + if(atomic_exchange(&_fed.rekey_requested, false)){ + _lf_check_and_perform_rekey(); + } + + #ifdef FEDERATED_CENTRALIZED + // Check if this federate initiated any transient federate launch + if (atomic_exchange(&_fed.transient_launch_requested, false)){ + _lf_send_launch_request(); + } + #endif + + #endif + + return result; } @@ -2576,11 +2742,19 @@ void lf_synchronize_with_other_federates(void) { 
_lf_get_environments(&top_level_env); LF_COND_INIT(&lf_port_status_changed, &top_level_env->mutex); + #ifdef COMM_TYPE_SST + LF_COND_INIT(&lf_rekey_completed, &lf_outbound_net_mutex); + #endif + LF_PRINT_DEBUG("Synchronizing with other federates."); // Reset the start time to the coordinated start time for all federates. // Note that this does not grant execution to this federate. start_time = get_start_time_from_rti(lf_time_physical()); + + lf_print("Starting timestamp is: " PRINTF_TIME " and effective start tag is: " PRINTF_TAG ".", lf_time_start(), + effective_start_tag.time - lf_time_start(), effective_start_tag.microstep); + lf_tracing_set_start_time(start_time); // Start a thread to listen for incoming messages from the RTI. @@ -2632,13 +2806,20 @@ bool lf_update_max_level(tag_t tag, bool is_provisional) { _lf_action_delay_table[i])) <= 0)) { continue; } +#else + // For centralized coordination, if there is an upstream transient federate that is not + // connected, then we don't want to block on its action. + if (_lf_zero_delay_cycle_upstream_disconnected[i]) { + // Mark the action known up to and including the current tag. It is absent. + update_last_known_status_on_action(env, input_port_action, env->current_tag); + } #endif // FEDERATED_DECENTRALIZED - // If the current tag is greater than the last known status tag of the input port, - // and the input port is not physical, then block on that port by ensuring - // the MLAA is no greater than the level of that port. - // For centralized coordination, this is applied only to input ports coming from - // federates that are in a ZDC. For decentralized coordination, this is applied - // to all input ports. + // If the current tag is greater than the last known status tag of the input port, + // and the input port is not physical, then block on that port by ensuring + // the MLAA is no greater than the level of that port. 
+ // For centralized coordination, this is applied only to input ports coming from + // federates that are in a ZDC. For decentralized coordination, this is applied + // to all input ports. if (lf_tag_compare(env->current_tag, input_port_action->trigger->last_known_status_tag) > 0 && !input_port_action->trigger->is_physical) { max_level_allowed_to_advance = @@ -2650,6 +2831,32 @@ bool lf_update_max_level(tag_t tag, bool is_provisional) { return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); } +void lf_stop() { + environment_t* env; + int num_env = _lf_get_environments(&env); + + for (int i = 0; i < num_env; i++) { + LF_MUTEX_LOCK(&env[i].mutex); + + tag_t new_stop_tag; + new_stop_tag.time = env[i].current_tag.time; + new_stop_tag.microstep = env[i].current_tag.microstep + 1; + + lf_set_stop_tag(&env[i], new_stop_tag); + + LF_PRINT_LOG("Setting the stop tag of env %d to " PRINTF_TAG ".", i, env[i].stop_tag.time - start_time, + env[i].stop_tag.microstep); + + if (env[i].barrier.requestors) + _lf_decrement_tag_barrier_locked(&env[i]); + lf_cond_broadcast(&env[i].event_q_changed); + LF_MUTEX_UNLOCK(&env[i].mutex); + } + LF_PRINT_LOG("Federate is stopping."); +} + +const char* lf_get_federation_id() { return federation_metadata.federation_id; } + #ifdef FEDERATED_DECENTRALIZED instant_t lf_wait_until_time(tag_t tag) { instant_t result = tag.time; // Default. @@ -2675,4 +2882,126 @@ instant_t lf_wait_until_time(tag_t tag) { } #endif // FEDERATED_DECENTRALIZED +void lf_launch_transient_federate(char* port_name){ + // Search the port map for entries matching the given port name. + for(int i=0; i<_fed.port_map_size; i++){ + if (strcmp(port_name, _fed.port_to_transient_feds_mapping[i].port_name) == 0){ + // Queue each transient federate associated with this port for launch. 
+ for (int j=0; j<_fed.port_to_transient_feds_mapping[i].num_of_transients; j++){ + uint16_t fed_id = *(_fed.port_to_transient_feds_mapping[i].transient_fed_id + j); + // Guard against overflow of the pending launches buffer + if (_fed.num_transient_fed_launch_requested < NUMBER_OF_FEDERATES){ + _fed.pending_transient_launches[_fed.num_transient_fed_launch_requested] = fed_id; + _fed.num_transient_fed_launch_requested++; + } + } + } + } + // If any federates are queued, set the launch request field to true to signal the runtime + // that there are transient federate launches required + if (_fed.num_transient_fed_launch_requested > 0){ + atomic_store(&_fed.transient_launch_requested, true); + } + +} + +void _lf_send_launch_request(){ + size_t bytes_to_write = 1 + sizeof(uint16_t); + unsigned char buffer[bytes_to_write]; + + for(int i=0; i<_fed.num_transient_fed_launch_requested; i++){ + uint16_t fed_id = _fed.pending_transient_launches[i]; + buffer[0] = MSG_TYPE_TRANSIENT_LAUNCH_REQUEST; + encode_uint16(fed_id, &(buffer[1])); + LF_MUTEX_LOCK(&lf_outbound_net_mutex); + write_to_net_fail_on_error(_fed.net_to_RTI, bytes_to_write, buffer, NULL, NULL); + LF_MUTEX_UNLOCK(&lf_outbound_net_mutex); + + memset(buffer, 0, bytes_to_write); + } + + // Reset the pending_transient_launches and count of the number of transient to be launched to 0 + memset(_fed.pending_transient_launches, 0, sizeof(uint16_t)*_fed.num_transient_fed_launch_requested); + _fed.num_transient_fed_launch_requested = 0; +} + +void lf_refresh_key(void){ + LF_PRINT_DEBUG("Rekey Requested"); + atomic_store(&_fed.rekey_requested, true); +} + + +#ifdef COMM_TYPE_SST +void _lf_check_and_perform_rekey(void){ + + #ifdef FEDERATED_CENTRALIZED + lf_print("Acquiring new session key and sending it to RTI"); + get_new_session_key(_fed.net_to_RTI); + + // Hold the outbound mutex while sending the request and waiting for the RTI's ACK. 
+ // lf_rekey_completed is signaled by handle_rti_session_key_ack once the new key is + // active, at which point this thread wakes, releases the mutex, and returns. + LF_MUTEX_LOCK(&lf_outbound_net_mutex); + send_key_refresh_request(_fed.net_to_RTI, MSG_TYPE_SST_KEY_REFRESH_REQUEST); + LF_COND_WAIT(&lf_rekey_completed); + LF_MUTEX_UNLOCK(&lf_outbound_net_mutex); + #endif + + #ifdef FEDERATED_DECENTRALIZED + for (int i=0; i< _fed.number_of_outbound_p2p_connections; i++){ + + net_abstraction_t net = _fed.net_for_outbound_p2p_connections[i]; + if(net != NULL){ + get_new_session_key(net); + lf_print("DEBUG: New session key received\n"); + + LF_MUTEX_LOCK(&lf_outbound_net_mutex); + send_key_refresh_request(net, MSG_TYPE_SST_KEY_REFRESH_REQUEST); + lf_print("DEBUG: Refresh Request Sent"); + LF_MUTEX_UNLOCK(&lf_outbound_net_mutex); + + unsigned char buffer; + read_from_net_fail_on_error(net, 1, &buffer, NULL); + unsigned char key_id[SESSION_KEY_ID_SIZE]; + lf_print("DEBUG: ACK ARRIVED"); + read_from_net_fail_on_error(net, SESSION_KEY_ID_SIZE, key_id, NULL); + if(!verify_pending_key_id(net, key_id)){ + lf_print_error("Key IDs dont match"); + } + + LF_MUTEX_LOCK(&lf_outbound_net_mutex); + swap_to_pending_key(net); + LF_MUTEX_UNLOCK(&lf_outbound_net_mutex); + + } + } + #endif +} + +void handle_rti_session_key_ack(net_abstraction_t net_abs){ + unsigned char key_id[SESSION_KEY_ID_SIZE]; + read_from_net_fail_on_error(net_abs, SESSION_KEY_ID_SIZE, key_id, NULL); + if(!verify_pending_key_id(net_abs, key_id)){ + LF_PRINT_DEBUG("Key IDs dont match"); + return; + } + + swap_to_pending_key(net_abs); + LF_MUTEX_LOCK(&lf_outbound_net_mutex); + LF_COND_BROADCAST(&lf_rekey_completed); + LF_MUTEX_UNLOCK(&lf_outbound_net_mutex); + lf_print("Key ID match in ACK"); +} + +void handle_key_refresh_request(net_abstraction_t net_abs){ + unsigned char key_id[SESSION_KEY_ID_SIZE]; + read_from_net_fail_on_error(net_abs, SESSION_KEY_ID_SIZE, key_id, NULL); + fetch_pending_session_key(net_abs, 
key_id); + + LF_MUTEX_LOCK(&lf_outbound_net_mutex); + send_key_refresh_request(net_abs, MSG_TYPE_SST_KEY_ACK); + swap_to_pending_key(net_abs); + LF_MUTEX_UNLOCK(&lf_outbound_net_mutex); +} +#endif #endif // FEDERATED diff --git a/core/lf_trace.cmake b/core/lf_trace.cmake new file mode 100644 index 000000000..88362ef98 --- /dev/null +++ b/core/lf_trace.cmake @@ -0,0 +1,47 @@ +if(DEFINED LF_TRACE) + include(${LF_ROOT}/trace/api/CMakeLists.txt) + target_link_libraries(reactor-c PUBLIC lf::trace-api) + # If LF_TRACE_PLUGIN is set, treat it as a CMake package name and try to locate it via find_package(). + # If that fails (or the target isn't available), do not link anything here and assume the user supplies a + # cmake-include file to link against the trace plugin and its dependencies. + if(DEFINED LF_TRACE_PLUGIN AND NOT LF_TRACE_PLUGIN STREQUAL "") + if(DEFINED LF_TRACE_PLUGIN_PATHS AND NOT LF_TRACE_PLUGIN_PATHS STREQUAL "") + # Case A: LF_TRACE_PLUGIN is set and LF_TRACE_PLUGIN_PATHS is set, + # when the user specifies a "package" field, a "library" field, and a "path" field under "trace-plugin". + # Example: See https://github.com/lf-lang/lf-trace-xronos/blob/53e77a6b072f6b25d4fdfd53a4a3700fc199f938/tests/src/TracePluginUserPath.lf + message(STATUS "Trying to find package ${LF_TRACE_PLUGIN} in: ${LF_TRACE_PLUGIN_PATHS}") + find_package(${LF_TRACE_PLUGIN} QUIET CONFIG + NO_DEFAULT_PATH + PATHS ${LF_TRACE_PLUGIN_PATHS} + PATH_SUFFIXES lib/cmake/${LF_TRACE_PLUGIN} share/${LF_TRACE_PLUGIN}/cmake + ) + else() + # Case B: LF_TRACE_PLUGIN is set but LF_TRACE_PLUGIN_PATHS is not set, + # when the user specifies a "package" field and a "library" field under "trace-plugin". 
+ # Example: See https://github.com/lf-lang/lf-trace-xronos/blob/53e77a6b072f6b25d4fdfd53a4a3700fc199f938/tests/src/TracePluginSystemPath.lf + message(STATUS "Trying to find package ${LF_TRACE_PLUGIN} in the default system path") + find_package(${LF_TRACE_PLUGIN} QUIET CONFIG) + endif() + + if(DEFINED LF_TRACE_PLUGIN_LIBRARY AND NOT LF_TRACE_PLUGIN_LIBRARY STREQUAL "" AND TARGET "${LF_TRACE_PLUGIN_LIBRARY}") + # In case A & B, the "library" field determines what gets linked. + message(STATUS "Package ${LF_TRACE_PLUGIN} found. Linking trace plugin target: ${LF_TRACE_PLUGIN_LIBRARY}") + target_link_libraries(reactor-c PRIVATE "${LF_TRACE_PLUGIN_LIBRARY}") + else() + # Case C: LF_TRACE_PLUGIN is set but the specified trace plugin library target is not available + # (LF_TRACE_PLUGIN_LIBRARY is undefined, empty, or does not name an existing target, e.g., package not found). + # This case covers when the user either does not use "trace-plugin" or the package/library cannot be resolved, + # and instead proceeds with a custom integration via cmake-include and cmake-args. + # Example: See https://github.com/lf-lang/lf-trace-xronos/blob/53e77a6b072f6b25d4fdfd53a4a3700fc199f938/tests/src/TracePluginCustomCmake.lf + message(STATUS "Trace plugin package or library not found. Expecting user cmake-include to link the plugin.") + endif() + else() + # If LF_TRACE_PLUGIN not set, use the default trace plugin implementation. 
+ message(STATUS "Linking with default trace implementation") + include(${LF_ROOT}/trace/impl/CMakeLists.txt) + target_link_libraries(reactor-c PRIVATE lf::trace-impl) + endif() +else() + include(${LF_ROOT}/trace/api/types/CMakeLists.txt) + target_link_libraries(reactor-c PUBLIC lf::trace-api-types) +endif() \ No newline at end of file diff --git a/core/reactor_common.c b/core/reactor_common.c index c00e5989b..f72bf1ade 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include @@ -952,26 +953,51 @@ void schedule_output_reactions(environment_t* env, reaction_t* reaction, int wor } } +// Defaults for the code-generated parameter table. +// The code generator overrides these if there are user-defined parameters. +lf_cli_param_t* _lf_cli_params = NULL; +int _lf_cli_params_count = 0; + /** - * Print a usage message. - * TODO: This is not necessary for NO_CLI + * Print a usage message listing user-defined parameters (if any) and runtime options. 
*/ void usage(int argc, const char* argv[]) { - printf("\nCommand-line arguments: \n\n"); - printf(" -f, --fast [true | false]\n"); - printf(" Whether to wait for physical time to match logical time.\n\n"); +#if defined(NO_CLI) + printf("\nNo command-line arguments are supported.\n"); +#else + printf("\nUsage: %s [options]\n\n", argv[0]); + if (_lf_cli_params_count > 0) { + printf("Reactor Parameters:\n"); + for (int j = 0; j < _lf_cli_params_count; j++) { + lf_cli_param_t* p = &_lf_cli_params[j]; + if (p->type == CLI_TIME) { + printf(" --%s \n", p->name); + } else if (p->type == CLI_BOOL) { + printf(" --%s \n", p->name); + } else { + printf(" --%s \n", p->name); + } + printf(" %s\n\n", p->description); + } + } + printf("Runtime Options:\n"); + printf(" -f, --fast \n"); + printf(" Whether to wait for physical time to match logical time.\n\n"); printf(" -o, --timeout \n"); - printf(" Stop after the specified amount of logical time, where units are one of\n"); - printf(" nsec, usec, msec, sec, minute, hour, day, week, or the plurals of those.\n\n"); - printf(" -k, --keepalive\n"); - printf(" Whether continue execution even when there are no events to process.\n\n"); + printf(" Stop after the specified amount of logical time, where units are one of\n"); + printf(" nsec, usec, msec, sec, minute, hour, day, week, or the plurals of those.\n\n"); + printf(" -k, --keepalive \n"); + printf(" Whether to continue execution even when there are no events to process.\n\n"); printf(" -w, --workers \n"); - printf(" Executed in threads if possible (optional feature).\n\n"); - printf(" -i, --id \n"); - printf(" The ID of the federation that this reactor will join.\n\n"); + printf(" Execute in threads if possible (optional feature).\n\n"); + printf(" -h, --help\n"); + printf(" Display this help message.\n\n"); +#endif #ifdef FEDERATED + printf(" -i, --id \n"); + printf(" The ID of the federation that this reactor will join.\n\n"); printf(" -r, --rti \n"); - printf(" The address of 
the RTI, which can be in the form of user@host:port or ip:port.\n\n"); + printf(" The address of the RTI, which can be in the form of user@host:port or ip:port.\n\n"); printf(" -l\n"); printf(" Send stdout to individual log files for each federate.\n\n"); #ifdef COMM_TYPE_SST @@ -981,9 +1007,9 @@ void usage(int argc, const char* argv[]) { #ifdef COMM_TYPE_TLS printf(" -tls, --tls \n"); printf(" Paths to the TLS certificate and private key to use.\n\n"); + printf(" Send stdout to individual log files for each federate.\n\n"); #endif #endif - printf("Command given:\n"); for (int i = 0; i < argc; i++) { printf("%s ", argv[i]); @@ -991,6 +1017,96 @@ void usage(int argc, const char* argv[]) { printf("\n\n"); } +/** + * Process user-defined main reactor parameters from the command line. + * Returns 0 on success, 1 for --help (exit 0), 2 for error (exit 1). + */ +int process_user_args(int argc, const char* argv[], int* newargc, const char** newargv) { + *newargc = 0; + newargv[(*newargc)++] = argv[0]; + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { + usage(argc, argv); + return 1; + } + bool matched = false; + for (int j = 0; j < _lf_cli_params_count; j++) { + lf_cli_param_t* p = &_lf_cli_params[j]; + char option[256]; + snprintf(option, sizeof(option), "--%s", p->name); + if (strcmp(argv[i], option) == 0) { + matched = true; + if (p->is_width) { + fprintf(stderr, "Error: Command-line changes to multiport and bank widths" + " are not supported.\n" + "Change the width in the source code and recompile instead.\n"); + return 2; + } + if (p->type == CLI_TIME) { + if (i + 2 >= argc) { + fprintf(stderr, "Error: --%s needs a time value and units (e.g., --%s 500 msec).\n", p->name, p->name); + return 2; + } + const char* time_str = argv[++i]; + const char* unit_str = argv[++i]; + if (lf_time_parse(time_str, unit_str, (interval_t*)p->value) != 0) { + fprintf(stderr, "Error: invalid time value '%s %s' for --%s.\n", time_str, 
unit_str, p->name); + return 2; + } + *p->given = true; + } else { + if (i + 1 >= argc) { + fprintf(stderr, "Error: --%s needs a value.\n", p->name); + return 2; + } + const char* val_str = argv[++i]; + char* end; + switch (p->type) { + case CLI_INT: + *((int*)p->value) = atoi(val_str); + break; + case CLI_DOUBLE: + *((double*)p->value) = strtod(val_str, &end); + if (*end != '\0') { + fprintf(stderr, "Error: invalid double value '%s' for --%s.\n", val_str, p->name); + return 2; + } + break; + case CLI_FLOAT: + *((float*)p->value) = strtof(val_str, &end); + if (*end != '\0') { + fprintf(stderr, "Error: invalid float value '%s' for --%s.\n", val_str, p->name); + return 2; + } + break; + case CLI_BOOL: + if (strcmp(val_str, "true") == 0 || strcmp(val_str, "1") == 0) { + *((bool*)p->value) = true; + } else if (strcmp(val_str, "false") == 0 || strcmp(val_str, "0") == 0) { + *((bool*)p->value) = false; + } else { + fprintf(stderr, "Error: invalid bool value '%s' for --%s (expected true or false).\n", val_str, p->name); + return 2; + } + break; + case CLI_STRING: + *((const char**)p->value) = val_str; + break; + default: + break; + } + *p->given = true; + } + break; + } + } + if (!matched) { + newargv[(*newargc)++] = argv[i]; + } + } + return 0; +} + // Some options given in the target directive are provided here as // default command-line options. int default_argc = 0; @@ -1000,7 +1116,6 @@ const char** default_argv = NULL; * Process the command-line arguments. If the command line arguments are not * understood, then print a usage message and return 0. Otherwise, return 1. * @return 1 if the arguments processed successfully, 0 otherwise. 
- * TODO: Not necessary for NO_CLI */ int process_args(int argc, const char* argv[]) { int i = 1; @@ -1029,39 +1144,10 @@ int process_args(int argc, const char* argv[]) { } const char* time_spec = argv[i++]; const char* units = argv[i++]; - -#if defined(PLATFORM_ARDUINO) - duration = atol(time_spec); -#else - duration = atoll(time_spec); -#endif - - // A parse error returns 0LL, so check to see whether that is what is meant. - if (duration == 0LL && strncmp(time_spec, "0", 1) != 0) { - // Parse error. - lf_print_error("Invalid time value: %s", time_spec); - usage(argc, argv); - return 0; - } - if (strncmp(units, "sec", 3) == 0) { - duration = SEC(duration); - } else if (strncmp(units, "msec", 4) == 0) { - duration = MSEC(duration); - } else if (strncmp(units, "usec", 4) == 0) { - duration = USEC(duration); - } else if (strncmp(units, "nsec", 4) == 0) { - duration = NSEC(duration); - } else if (strncmp(units, "min", 3) == 0) { - duration = MINUTE(duration); - } else if (strncmp(units, "hour", 4) == 0) { - duration = HOUR(duration); - } else if (strncmp(units, "day", 3) == 0) { - duration = DAY(duration); - } else if (strncmp(units, "week", 4) == 0) { - duration = WEEK(duration); - } else { - // Invalid units. - lf_print_error("Invalid time units: %s", units); + int parse_result = lf_time_parse(time_spec, units, &duration); + if (parse_result != 0) { + lf_print_error(parse_result == -1 ? "Invalid time value: %s" : "Invalid time units: %s", + parse_result == -1 ? 
time_spec : units); usage(argc, argv); return 0; } @@ -1079,6 +1165,9 @@ int process_args(int argc, const char* argv[]) { } else { lf_print_error("Invalid value for --keepalive: %s", keep_spec); } + } else if (strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0) { + usage(argc, argv); + return 0; } else if (strcmp(arg, "-w") == 0 || strcmp(arg, "--workers") == 0) { if (argc < i + 1) { lf_print_error("--workers needs an integer argument.s"); @@ -1169,9 +1258,15 @@ int process_args(int argc, const char* argv[]) { else if (strcmp(arg, "--ros-args") == 0) { // FIXME: Ignore ROS arguments for now } else { +#ifdef FEDERATED + // In federated programs, arguments intended for other federates + // may be forwarded here. Skip them silently. + lf_print("Ignoring unrecognized command-line argument: %s. Assuming it is intended for another federate.", arg); +#else lf_print_error("Unrecognized command-line argument: %s", arg); usage(argc, argv); return 0; +#endif } } return 1; diff --git a/core/tag.c b/core/tag.c index e3f623421..003d4f20a 100644 --- a/core/tag.c +++ b/core/tag.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "tag.h" @@ -31,6 +32,12 @@ typedef enum _lf_time_type { LF_LOGICAL, LF_PHYSICAL, LF_ELAPSED_LOGICAL, LF_ELA // Global variables declared in tag.h: instant_t start_time = NEVER; +/** + * Only useful for transient federates. It records the effective start tag, to + * be used at startup. Elapsed logical time calculations will use start_time. 
+ */ +tag_t effective_start_tag = {.time = 0LL, .microstep = 0}; + //////////////// Functions declared in tag.h tag_t lf_tag(void* env) { @@ -188,6 +195,8 @@ instant_t lf_time_physical_elapsed(void) { return lf_time_physical() - start_tim instant_t lf_time_start(void) { return start_time; } +tag_t lf_tag_start_effective(void) { return effective_start_tag; } + size_t lf_readable_time(char* buffer, instant_t time) { if (time <= (instant_t)0) { snprintf(buffer, 2, "0"); @@ -313,3 +322,37 @@ size_t lf_comma_separated_time(char* buffer, instant_t time) { } return result; } + +/* Parse an integer time value and a unit name into *result: 0 on success, -1 for a bad value, -2 for bad units. */ +int lf_time_parse(const char* time_str, const char* units_str, interval_t* result) { + char* end; + // Parse into 64 bits so that large values are not truncated to int. Arduino builds keep using strtol. +#if defined(PLATFORM_ARDUINO) + long long value = strtol(time_str, &end, 10); +#else + long long value = strtoll(time_str, &end, 10); +#endif + if (end == time_str || *end != '\0') { + return -1; + } + if (strncmp(units_str, "nsec", 4) == 0 || strcmp(units_str, "ns") == 0) { + *result = NSEC(value); + } else if (strncmp(units_str, "usec", 4) == 0 || strcmp(units_str, "us") == 0) { + *result = USEC(value); + } else if (strncmp(units_str, "msec", 4) == 0 || strcmp(units_str, "ms") == 0) { + *result = MSEC(value); + } else if (strncmp(units_str, "second", 6) == 0 || strncmp(units_str, "sec", 3) == 0 || strcmp(units_str, "s") == 0) { + *result = SEC(value); + } else if (strncmp(units_str, "minute", 6) == 0 || strncmp(units_str, "min", 3) == 0) { + *result = MINUTE(value); + } else if (strncmp(units_str, "hour", 4) == 0) { + *result = HOUR(value); + } else if (strncmp(units_str, "day", 3) == 0) { + *result = DAY(value); + } else if (strncmp(units_str, "week", 4) == 0) { + *result = WEEK(value); + } else { + return -2; + } + return 0; +} diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index bc82dec9e..2362ccbbd 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -33,6 +33,7 @@ // Global variables defined in tag.c and shared across environments: extern instant_t start_time; +extern tag_t effective_start_tag; /** * The maximum amount of time a worker
thread should stall @@ -510,12 +511,12 @@ static void _lf_initialize_start_tag(environment_t* env) { // statuses to unknown lf_reset_status_fields_on_input_port_triggers(); - // Get a start_time from the RTI + // Get a start_time and effective_start_tag from the RTI lf_synchronize_with_other_federates(); // Resets start_time in federated execution according to the RTI. } // The start time will likely have changed. Adjust the current tag and stop tag. - env->current_tag = (tag_t){.time = start_time, .microstep = 0u}; + env->current_tag = effective_start_tag; if (duration >= 0LL) { // A duration has been specified. Recalculate the stop time. env->stop_tag = ((tag_t){.time = start_time + duration, .microstep = 0}); @@ -534,25 +535,25 @@ static void _lf_initialize_start_tag(environment_t* env) { // To use uniform code below, we define it here as a local variable. instant_t lf_fed_STA_offset = 0; #endif - LF_PRINT_LOG("Waiting for start time " PRINTF_TIME ".", start_time); - - // Wait until the start time. This is required for federates because the startup procedure - // in lf_synchronize_with_other_federates() can decide on a new start_time that is - // larger than the current physical time. - // This wait_until() is deliberately called after most precursor operations - // for tag (0,0) are performed (e.g., injecting startup reactions, etc.). - // This has two benefits: First, the startup overheads will reduce - // the required waiting time. Second, this call releases the mutex lock and allows - // other threads (specifically, federate threads that handle incoming p2p messages - // from other federates) to hold the lock and possibly raise a tag barrier. - while (!wait_until(start_time, &env->event_q_changed)) { + LF_PRINT_LOG("Waiting for start time " PRINTF_TIME ".", effective_start_tag.time); + + // Wait until the effective start time. 
This is required for federates because the startup procedure + // in lf_synchronize_with_other_federates() can decide on a new start_time, or the effective start time if it is a + // transient federate, that is larger than the current physical time. + // This wait_until() is deliberately called after most precursor operations for tag (0,0), or effective_start_tag, + // are performed (e.g., injecting startup reactions, etc.). This has two benefits: First, the startup overheads will + // reduce the required waiting time. Second, this call releases the mutex lock and allows other threads (specifically, + // federate threads that handle incoming p2p messages from other federates) to hold the lock and possibly raise a tag + // barrier. + while (!wait_until(effective_start_tag.time, &env->event_q_changed)) { }; - LF_PRINT_DEBUG("Done waiting for start time + STA offset " PRINTF_TIME ".", start_time + lf_fed_STA_offset); + LF_PRINT_DEBUG("Done waiting for effective start time + STA offset " PRINTF_TIME ".", + effective_start_tag.time + lf_fed_STA_offset); LF_PRINT_DEBUG("Physical time is ahead of current time by " PRINTF_TIME ". This should be close to the STA offset.", - lf_time_physical() - start_time); + lf_time_physical() - effective_start_tag.time); - // Restore the current tag to match the start time. - env->current_tag = (tag_t){.time = start_time, .microstep = 0u}; + // Restore the current tag to match the effective start time. + env->current_tag = (tag_t){.time = effective_start_tag.time, .microstep = effective_start_tag.microstep}; // If the stop_tag is (0,0), also insert the shutdown // reactions. This can only happen if the timeout time @@ -569,7 +570,7 @@ static void _lf_initialize_start_tag(environment_t* env) { // from exceeding the timestamp of the message. It will remove that barrier // once the complete message has been read. Here, we wait for that barrier // to be removed, if appropriate before proceeding to executing tag (0,0).
- _lf_wait_on_tag_barrier(env, (tag_t){.time = start_time, .microstep = 0}); + _lf_wait_on_tag_barrier(env, effective_start_tag); // In addition, if the earliest event on the event queue has a tag greater // than (0,0), then wait until the time of that tag. This prevents the runtime @@ -1047,6 +1048,7 @@ int lf_reactor_c_main(int argc, const char* argv[]) { // Initialize the clock through the platform API. No reading of physical time before this. _lf_initialize_clock(); start_time = lf_time_physical(); + effective_start_tag = (tag_t){.time = start_time, .microstep = 0}; #ifndef FEDERATED lf_tracing_set_start_time(start_time); #endif diff --git a/core/utils/pqueue_tag.c b/core/utils/pqueue_tag.c index 5daa04e4a..2ea8e9a8b 100644 --- a/core/utils/pqueue_tag.c +++ b/core/utils/pqueue_tag.c @@ -157,3 +157,14 @@ void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t) { } void pqueue_tag_dump(pqueue_tag_t* q) { pqueue_dump((pqueue_t*)q, pqueue_tag_print_element); } + +tag_t pqueue_tag_max_tag(pqueue_tag_t* q) { + tag_t result = NEVER_TAG; + for (size_t i = 1; i < q->size; i++) { + pqueue_tag_element_t* element = (pqueue_tag_element_t*)(q->d[i]); + if (lf_tag_compare(element->tag, result) > 0) { + result = element->tag; + } + } + return result; +} diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index e516e2a74..dd6a83245 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -19,6 +19,7 @@ #define FEDERATE_H #include +#include #include "tag.h" #include "lf_types.h" @@ -33,6 +34,16 @@ ////////////////////////////////////////////////////////////////////////////////// // Data types +/** + * Maps a port name to the set of transient federates that should be launched + * when that port becomes active. 
+ */ +typedef struct port_transient_map_t { + char* port_name; + uint16_t* transient_fed_id; + int num_of_transients; +} port_transient_map_t; + /** * @brief Structure that a federate instance uses to keep track of its own state. * @ingroup Federated @@ -192,6 +203,30 @@ typedef struct federate_instance_t { */ instant_t min_delay_from_physical_action_to_federate_output; + /** + * Indicator of whether this federate is transient. + * The default value of false may be overridden in _lf_initialize_trigger_objects. + */ + bool is_transient; + + /** Indicator that this federate needs to refresh its session key/keys for its connections*/ + _Atomic bool rekey_requested; + + /** Indicator that a transient federate launch needs to be requested to the rti */ + _Atomic bool transient_launch_requested; + + /** An array of transient federates that needs to be launched */ + uint16_t pending_transient_launches[NUMBER_OF_FEDERATES]; + + /** Total number of transient federates launch requested */ + int num_transient_fed_launch_requested; + + /** Provides output port name -> list of transient fed ids mapping. It is generated at compile time. */ + port_transient_map_t port_to_transient_feds_mapping[NUMBER_OF_FEDERATES]; + + /** Count of port -> transient fed mappings present for this federation*/ + int port_map_size; + #ifdef FEDERATED_DECENTRALIZED /** * Thread responsible for setting ports to absent by an STAA offset if they @@ -522,6 +557,11 @@ int lf_send_tagged_message(environment_t* env, interval_t additional_delay, int */ void lf_set_federation_id(const char* fid); +/** + * @brief Return the federation id. + */ +const char* lf_get_federation_id(); + #ifdef FEDERATED_DECENTRALIZED /** * @brief Spawn a thread to iterate through STAA structs. 
@@ -563,6 +603,78 @@ void lf_stall_advance_level_federation_locked(size_t level); */ void lf_synchronize_with_other_federates(void); +/** + * @brief Request the RTI to launch a transient federate. + * + * This function sets the transient_launch_requested flag to true, which signals + * that the federate needs to send a request to the RTI to launch one or more + * transient federates. + * + * @param port_name The port name is looked up in the port to transient + * federates mapping to determine the transient federates that are connected to this + * port. + */ +void lf_launch_transient_federate(char* port_name); + +/** + * @brief Sends a transient federate launch request to the RTI. + * + * This function is called when transient_launch_requested is checked and found to be + * set to true. The function traverses the list of pending transient federates + * that need to be launched and, for each, sends a request to the RTI with the message + * type MSG_TYPE_TRANSIENT_LAUNCH_REQUEST and the federate id. + */ +void _lf_send_launch_request(void); + +/** + * @brief Request a session key refresh for all SST connections. + * + * Sets the rekey_requested flag, signaling that a key rotation should + * be performed. The actual handshake is deferred and handled asynchronously + * by _lf_check_and_perform_rekey() at a safe point outside of reaction + * execution. Safe to call from within a reaction. + * + * Only has effect when COMM_TYPE_SST is configured with centralized coordination. + */ +void lf_refresh_key(void); + +/** + * @brief Check if a key refresh has been requested and perform the handshake if so. + * + * If rekey_requested is set, fetches a new session key from the SST auth + * server, sends a MSG_TYPE_SST_KEY_REFRESH_REQUEST to the RTI containing + * the new key ID, and blocks until the RTI acknowledges via + * handle_rti_session_key_ack(). Resets rekey_requested upon completion.
+ * + * This function is called internally at a safe point in the federate's + * execution loop and should not be called directly by user code. + */ +void _lf_check_and_perform_rekey(void); + +/** + * @brief Handle a session key acknowledgment from the RTI. + * + * Called when the RTI responds with MSG_TYPE_SST_KEY_ACK during a key + * rotation handshake. Reads the key ID from the message, verifies it + * matches the pending key, swaps the pending key into active use, resets + * sequence counters, and signals lf_rekey_completed to unblock + * _lf_check_and_perform_rekey(). + * + * @param net_abs The network abstraction for the RTI connection. + */ +void handle_rti_session_key_ack(net_abstraction_t net_abs); + +/** + * @brief Handle a session key refresh request from a federate. + * + * Called when a federate sends a key refresh request with MSG_TYPE_SST_KEY_REFRESH_REQUEST + * to its outbound federates. Reads the key ID from the message, fetches the corresponding key + * from the auth server and stores it in the pending key field. Responds to the federate that initiated the request with + * MSG_TYPE_SST_KEY_ACK and then swaps its current key with the key stored in the pending key field. + * + * @param net_abs The network abstraction for the RTI connection. + */ +void handle_key_refresh_request(net_abstraction_t net_abs); /** * @brief Update the max level allowed to advance (MLAA). * @ingroup Federated diff --git a/include/core/reactor_common.h b/include/core/reactor_common.h index 61dc1d563..a12acee79 100644 --- a/include/core/reactor_common.h +++ b/include/core/reactor_common.h @@ -29,6 +29,41 @@ #include "modes.h" #include "port.h" +////////////////////// CLI Parameter Table ////////////////////// + +/** + * @brief Type tag for a user-defined CLI parameter. + * @ingroup Internal + */ +typedef enum { + CLI_TIME, ///< interval_t, parsed as value + units (e.g., "500 msec"). + CLI_INT, ///< int, parsed with atoi. + CLI_DOUBLE, ///< double, parsed with strtod.
+ CLI_FLOAT, ///< float, parsed with strtof. + CLI_BOOL, ///< bool, parsed as "true"/"false" or "1"/"0". + CLI_STRING ///< const char*, set directly from the argument string. +} lf_cli_type_t; + +/** + * @brief Descriptor for a user-defined main reactor parameter overridable from the command line. + * @ingroup Internal + * + * The code generator populates an array of these structs so that the runtime + * can parse user parameters in a table-driven fashion, without generating + * large if-else chains. + */ +typedef struct { + const char* name; ///< Parameter name (e.g., "period"). + lf_cli_type_t type; ///< The type of the parameter value. + void* value; ///< Pointer to the storage variable. + bool* given; ///< Pointer to a bool that is set to true when the arg is provided. + const char* description; ///< Description for the help message (e.g., "time value (default: 1 sec)"). + bool is_width; ///< True if this parameter is used for multiport/bank widths (not overridable). +} lf_cli_param_t; + +extern lf_cli_param_t* _lf_cli_params; +extern int _lf_cli_params_count; + ////////////////////// Constants & Macros ////////////////////// /** @@ -363,6 +398,31 @@ void schedule_output_reactions(environment_t* env, reaction_t* reaction, int wor */ int process_args(int argc, const char* argv[]); +/** + * @brief Process user-defined main reactor parameters from the command line. + * @ingroup Internal + * + * Parses user-defined parameters from argv using the table in _lf_cli_params. + * Recognized parameters are consumed; the remaining arguments are copied into + * newargv/newargc so they can be forwarded to lf_reactor_c_main(). + * + * @param argc The number of command-line arguments. + * @param argv The command-line arguments. + * @param newargc Output: number of remaining (unrecognized) arguments. + * @param newargv Output: array of remaining arguments (must be pre-allocated to at least argc). + * @return 0 on success, non-zero on error (the program should exit). 
+ */ +int process_user_args(int argc, const char* argv[], int* newargc, const char** newargv); + +/** + * @brief Print a usage message listing both user-defined parameters and runtime options. + * @ingroup Internal + * + * @param argc The number of command-line arguments. + * @param argv The command-line arguments. + */ +void usage(int argc, const char* argv[]); + /** * @brief Initialize global variables and start tracing before calling the * @ref _lf_initialize_trigger_objects() function. diff --git a/include/core/utils/pqueue_tag.h b/include/core/utils/pqueue_tag.h index 7a349053d..eb0663937 100644 --- a/include/core/utils/pqueue_tag.h +++ b/include/core/utils/pqueue_tag.h @@ -250,4 +250,11 @@ void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t); */ void pqueue_tag_dump(pqueue_tag_t* q); +/** + * @brief Return the maximum tag in the queue or NEVER_TAG if the queue is empty. + * + * @param q The queue. + */ +tag_t pqueue_tag_max_tag(pqueue_tag_t* q); + #endif // PQUEUE_TAG_H diff --git a/include/core/utils/util.h b/include/core/utils/util.h index 95f660e59..49afe8b5f 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -188,4 +188,14 @@ void lf_vprint_error_and_exit(const char* format, va_list args) ATTRIBUTE_FORMAT */ #define LF_CRITICAL_SECTION_EXIT(env) LF_ASSERT(!lf_critical_section_exit(env), "Could not exit critical section") +/** + * @brief Stop the execution of a federate. + * Every enclave within the federate will stop at one microstep later than its + * current tag. Unlike lf_request_stop(), this process does not require any + * involvement from the RTI, nor does it necessitate any consensus. + * + * This function is particularly useful for testing transient federates. 
+ */ +void lf_stop(void); + #endif /* UTIL_H */ diff --git a/lingua-franca-ref.txt b/lingua-franca-ref.txt index b756b406f..52199a147 100644 --- a/lingua-franca-ref.txt +++ b/lingua-franca-ref.txt @@ -1 +1 @@ -networkdriver +transient-fed diff --git a/network/api/lf_sst_support.h b/network/api/lf_sst_support.h index f4ed2709f..a35884b42 100644 --- a/network/api/lf_sst_support.h +++ b/network/api/lf_sst_support.h @@ -11,6 +11,7 @@ typedef struct sst_priv_t { unsigned char buffer[MAX_SECURE_COMM_MSG_LENGTH]; size_t buf_filled; size_t buf_off; + session_key_t pending_key; } sst_priv_t; typedef struct sst_connection_params_t { diff --git a/network/api/net_abstraction.h b/network/api/net_abstraction.h index 14a02ea62..26e24a0aa 100644 --- a/network/api/net_abstraction.h +++ b/network/api/net_abstraction.h @@ -309,4 +309,65 @@ void set_server_port(net_abstraction_t net_abs, int32_t port); */ void set_server_hostname(net_abstraction_t net_abs, const char* hostname); +/** + * @brief Fetches a fresh session key from the SST auth server and stores it as pending_key. + * @ingroup Network + * + * Called by _lf_check_and_perform_rekey() on the federate before sending a key refresh + * request. The key is stored by value so it survives the free of the returned key list. + * + * @param net_abs The network abstraction whose pending_key will be updated. + */ +void get_new_session_key(net_abstraction_t net_abs); + +/** + * @brief Sends a key refresh message containing the pending key ID to the peer. + * @ingroup Network + * + * Used in both directions: federate sends MSG_TYPE_SST_KEY_REFRESH_REQUEST to initiate + * rotation; RTI sends MSG_TYPE_SST_KEY_ACK to confirm. The format is identical: 1 byte + * msg_type + 8 bytes key ID. + * + * @param net_abs The network abstraction to send from. + * @param msg_type MSG_TYPE_SST_KEY_REFRESH_REQUEST or MSG_TYPE_SST_KEY_ACK.
+ */ +void send_key_refresh_request(net_abstraction_t net_abs, unsigned char msg_type); + +/** + * @brief Looks up a session key by ID in the SST context and stores it as pending_key. + * @ingroup Network + * + * Called on the RTI in handle_key_refresh_request() after receiving a refresh request. + * The RTI uses the key ID sent by the federate to locate the same key locally, so both + * sides can swap to it after the ACK. + * + * @param net_abs The network abstraction whose pending_key will be set. + * @param key_id The 8-byte key ID received from the peer. + */ +void fetch_pending_session_key(net_abstraction_t net_abs, unsigned char* key_id); + +/** + * @brief Returns true if key_id matches the pending key's ID. + * @ingroup Network + * + * Called by handle_rti_session_key_ack() on the federate to verify the RTI's ACK refers + * to the key that was requested. A mismatch means the ACK is ignored and no swap occurs. + * + * @param net_abs The network abstraction holding pending_key. + * @param key_id The 8-byte key ID from the peer's message. + */ +bool verify_pending_key_id(net_abstraction_t net_abs, unsigned char* key_id); + +/** + * @brief Activates pending_key as the current session key and resets sequence counters. + * @ingroup Network + * + * Called after both sides confirm the key ID: on the federate in handle_rti_session_key_ack() + * and on the RTI in handle_key_refresh_request(). Resetting sequence counters to zero ensures + * the new key starts from a clean state. + * + * @param net_abs The network abstraction to update. 
+ */ +void swap_to_pending_key(net_abstraction_t net_abs); + #endif /* NET_ABSTRACTION_H */ diff --git a/network/api/net_common.h b/network/api/net_common.h index 8f4855fcc..447c28ca3 100644 --- a/network/api/net_common.h +++ b/network/api/net_common.h @@ -23,10 +23,10 @@ * When it has successfully opened a TCP connection, the first message it sends * to the RTI is a @ref MSG_TYPE_FED_IDS message, which contains the ID of this federate * within the federation, contained in the global variable _lf_my_fed_id - * in the federate code - * (which is initialized by the code generator) and the unique ID of - * the federation, a GUID that is created at run time by the generated script - * that launches the federation. + * in the federate code (which is initialized by the code generator), + * the type of this federate (persistent (0) or transient (1)), + * and the unique ID of the federation, a GUID that is created at run time by the + * generated script that launches the federation. * If you launch the federates and the RTI manually, rather than using the script, * then the federation ID is a string that is optionally given to the federate * on the command line when it is launched. The federate will connect @@ -261,10 +261,9 @@ * * Two bytes (ushort) giving the federate ID. * * One byte (uchar) giving the length N of the federation ID. * * N bytes containing the federation ID. - * Each federate needs to have a unique ID between 0 and - * NUMBER_OF_FEDERATES-1. - * Each federate, when starting up, should send this message - * to the RTI. This is its first message to the RTI. + * Each federate needs to have a unique ID between 0 and NUMBER_OF_FEDERATES-1. + * Each federate, when starting up, should send either this message, or MSG_TYPE_TRANSIENT_FED_IDS + * to the RTI, as its first message to the RTI. * The RTI will respond with either MSG_TYPE_REJECT, MSG_TYPE_ACK, or MSG_TYPE_UDP_PORT. 
* If the federate is a C target LF program, the generated federate * code does this by calling lf_synchronize_with_other_federates(), @@ -272,6 +271,23 @@ */ #define MSG_TYPE_FED_IDS 1 +/** Byte identifying a message from a transient federate to an RTI containing + * the federate ID and the federation ID. The message contains, in this order: + * * One byte equal to MSG_TYPE_TRANSIENT_FED_IDS. + * * Two bytes (ushort) giving the federate ID. + * * One byte (uchar) giving the length N of the federation ID. + * * One byte giving the type of the federate (1 if transient, 0 if persistent) + * * N bytes containing the federation ID. + * Each federate needs to have a unique ID between 0 and NUMBER_OF_FEDERATES-1. + * Each federate, when starting up, should send either this message, or MSG_TYPE_FED_IDS + * to the RTI, as its first message to the RTI. + * The RTI will respond with either MSG_TYPE_REJECT, MSG_TYPE_ACK, or MSG_TYPE_UDP_PORT. + * If the federate is a C target LF program, the generated federate + * code does this by calling lf_synchronize_with_other_federates(), + * passing to it its federate ID. + */ +#define MSG_TYPE_TRANSIENT_FED_IDS 103 + /////////// Messages used for authenticated federation. /////////////// /** * @brief Byte identifying a message from a federate to an RTI containing @@ -336,8 +352,9 @@ * @ingroup Network * * Each federate sends its starting physical time as a message of this - * type, and the RTI broadcasts to all the federates the starting logical + * type, and the RTI broadcasts to all persistent federates the starting * time as a message of this type. + * In case of a joining federate, the RTI will also send the effective start tag. */ #define MSG_TYPE_TIMESTAMP 2 @@ -345,7 +362,13 @@ * @brief The length of a timestamp message. 
* @ingroup Network */ -#define MSG_TYPE_TIMESTAMP_LENGTH (1 + sizeof(int64_t)) +#define MSG_TYPE_TIMESTAMP_LENGTH (1 + sizeof(instant_t)) + +/** + * @brief The length of a timestamp message with an effective start tag. + * @ingroup Network + */ +#define MSG_TYPE_TIMESTAMP_TAG_LENGTH (1 + sizeof(instant_t) + sizeof(tag_t)) /** * @brief Byte identifying a message to forward to another federate. @@ -628,6 +651,33 @@ */ #define MSG_TYPE_P2P_TAGGED_MESSAGE 18 +/** + * @brief Byte identifying a key refresh request. + * @ingroup Network + * + * Used to request that the peer should switch to a new session key. + */ +#define MSG_TYPE_SST_KEY_REFRESH_REQUEST 110 + +/** + * @brief Byte identifying a key refresh acknowledgement. + * @ingroup Network + * + * Sent in response to MSG_TYPE_SST_KEY_REFRESH_REQUEST to confirm the peer is ready + * to use the new session key. + */ +#define MSG_TYPE_SST_KEY_ACK 111 + +/** + * @brief Byte identifying a transient federate launch request. + * @ingroup Network + * + * This is sent to the RTI to request that it launch a certain transient + * federate. This message is accompanied by the ID of the transient + * federate. + */ +#define MSG_TYPE_TRANSIENT_LAUNCH_REQUEST 112 + //////////////////////////////////////////////// /** * @brief Physical clock synchronization messages according to PTP. @@ -737,55 +787,49 @@ #define MSG_TYPE_DOWNSTREAM_NEXT_EVENT_TAG 26 ///////////////////////////////////////////// -//// Rejection codes +//// Transient federate support /** - * @brief Code sent with a @ref MSG_TYPE_REJECT message indicating that the - * federation ID does not match. - * @ingroup Network + * A message that informs a downstream federate that a federate upstream of it + * is connected. The next 2 bytes are the federate ID of the upstream federate.
*/ -#define FEDERATION_ID_DOES_NOT_MATCH 1 +#define MSG_TYPE_UPSTREAM_CONNECTED 27 +#define MSG_TYPE_UPSTREAM_CONNECTED_LENGTH (1 + sizeof(uint16_t)) /** - * @brief Code sent with a @ref MSG_TYPE_REJECT message indicating that the - * federate ID is already in use. - * @ingroup Network + * A message that informs a downstream federate that a federate upstream of it + * is no longer connected. The next 2 bytes are the federate ID of the upstream federate. */ -#define FEDERATE_ID_IN_USE 2 +#define MSG_TYPE_UPSTREAM_DISCONNECTED 28 +#define MSG_TYPE_UPSTREAM_DISCONNECTED_LENGTH (1 + sizeof(uint16_t)) /** - * @brief Code sent with a @ref MSG_TYPE_REJECT message indicating that the - * federate ID is out of range. - * @ingroup Network - */ -#define FEDERATE_ID_OUT_OF_RANGE 3 - -/** - * @brief Code sent with a @ref MSG_TYPE_REJECT message indicating that the - * incoming message is not expected. - * @ingroup Network - */ -#define UNEXPECTED_MESSAGE 4 - -/** - * @brief Code sent with a @ref MSG_TYPE_REJECT message indicating that the - * connected to the wrong server. - * @ingroup Network + * Byte sent by the RTI ordering the federate to stop. Upon receiving the message, + * the federate will call lf_stop(), which will make it resign at its current_tag + * plus 1 microstep. + * The next 8 bytes will be the time at which the federates will stop. + * The next 4 bytes will be the microstep at which the federates will stop. NOTE(review): MSG_TYPE_STOP_LENGTH below is 1, which leaves no room for this 12-byte payload - confirm which is intended. */ -#define WRONG_SERVER 5 +#define MSG_TYPE_STOP 29 +#define MSG_TYPE_STOP_LENGTH 1 -/** - * @brief Code sent with a @ref MSG_TYPE_REJECT message indicating that the - * HMAC authentication failed. - * @ingroup Network - */ -#define HMAC_DOES_NOT_MATCH 6 +///////////////////////////////////////////// +//// Rejection codes /** * @brief Code sent with a @ref MSG_TYPE_REJECT message indicating that the - * RTI was not executed using the -a or --auth option. + * peer is rejected; each enumerator of rejection_code_t names one reason.
* @ingroup Network */ -#define RTI_NOT_EXECUTED_WITH_AUTH 7 +typedef enum { + FEDERATION_ID_DOES_NOT_MATCH = 1, + FEDERATE_ID_IN_USE = 2, + FEDERATE_ID_OUT_OF_RANGE = 3, + UNEXPECTED_MESSAGE = 4, + WRONG_SERVER = 5, + HMAC_DOES_NOT_MATCH = 6, + RTI_NOT_EXECUTED_WITH_AUTH = 7, + JOINING_TOO_LATE = 8 +} rejection_code_t; #endif /* NET_COMMON_H */ diff --git a/network/impl/src/lf_sst_support.c b/network/impl/src/lf_sst_support.c index 1c1da3b1e..722b1530a 100644 --- a/network/impl/src/lf_sst_support.c +++ b/network/impl/src/lf_sst_support.c @@ -80,7 +80,8 @@ net_abstraction_t accept_net(net_abstraction_t server_chan) { if (get_peer_address(client_priv->socket_priv) != 0) { lf_print_error("Failed to save peer address."); } - + + client_priv->sst_ctx = serv_priv->sst_ctx; // TODO: Do we need to copy sst_ctx form server_chan to fed_chan? session_key_list_t* s_key_list = init_empty_session_key_list(); SST_session_ctx_t* session_ctx = @@ -124,7 +125,7 @@ net_abstraction_t connect_to_net(net_params_t* params) { } if (sst_params->target == 1) { //Override target group to federates. - snprintf(priv->sst_ctx->config.purpose[ctx->config.purpose_index], sizeof(ctx->config.purpose[ctx->config.purpose_index]), "{\"group\":\"Federates\"}"); + snprintf(priv->sst_ctx->purpose_for_requesting_key, sizeof(ctx->purpose_for_requesting_key), "{\"group\":\"Federates\"}"); } session_key_list_t* s_key_list = get_session_key(priv->sst_ctx, NULL); SST_session_ctx_t* session_ctx = @@ -317,3 +318,79 @@ void set_server_hostname(net_abstraction_t net_abs, const char* hostname) { sst_priv_t* priv = (sst_priv_t*)net_abs; memcpy(priv->socket_priv->server_hostname, hostname, INET_ADDRSTRLEN); } + +/** + * Called by _lf_check_and_perform_rekey() on the federate side before initiating a key + * rotation. Contacts the SST auth server to obtain a fresh session key and stores it in + * pending_key by value, so it remains valid after new_list is freed. 
+ */ +void get_new_session_key(net_abstraction_t net_abs){ + LF_ASSERT_NON_NULL(net_abs); + sst_priv_t* priv = (sst_priv_t *)net_abs; + session_key_list_t* new_list = get_session_key(priv->sst_ctx, NULL); + priv->pending_key = new_list->s_key[0]; + free_session_key_list_t(new_list); + LF_PRINT_DEBUG("Session key acquired on the federate side"); +} + +/** + * Sends a key refresh message to the peer: 1 byte msg_type followed by the pending + * key ID (SESSION_KEY_ID_SIZE bytes). Used in both directions of the handshake: + * Federate -> RTI: called with MSG_TYPE_SST_KEY_REFRESH_REQUEST to start a key rotation. + * RTI -> Federate: called with MSG_TYPE_SST_KEY_ACK to confirm the new key is accepted. + */ + +void send_key_refresh_request(net_abstraction_t net_abs, unsigned char msg_type){ + LF_ASSERT_NON_NULL(net_abs); + sst_priv_t* priv = (sst_priv_t *)net_abs; + + unsigned char buffer[1+SESSION_KEY_ID_SIZE]; + buffer[0] = msg_type; + memcpy(&buffer[1], priv->pending_key.key_id, SESSION_KEY_ID_SIZE); + + int result = write_to_net_close_on_error(net_abs, sizeof(buffer), buffer); + LF_PRINT_DEBUG("Result: %d", result); +} + +/** + * Called on the RTI side in handle_key_refresh_request() when a MSG_TYPE_SST_KEY_REFRESH_REQUEST + * arrives. The federate has already fetched the new key from the auth server and sent its ID; + * this function retrieves the matching key from the RTI's SST context and stores it as + * pending_key. + */ +void fetch_pending_session_key(net_abstraction_t net_abs, unsigned char* key_id){ + LF_ASSERT_NON_NULL(net_abs); + sst_priv_t* priv = (sst_priv_t *)net_abs; + session_key_list_t* existing = init_empty_session_key_list(); + session_key_t* new_key = get_session_key_by_ID(key_id, priv->sst_ctx, existing); + if(new_key != NULL){ + priv->pending_key = *new_key; + } + + free_session_key_list_t(existing); + +} + +/** + * Called by handle_rti_session_key_ack() on the federate side to confirm the key ID in the + * RTI's ACK refers to the same key that was requested.
Returns false if they don't match. + */ +bool verify_pending_key_id(net_abstraction_t net_abs, unsigned char* key_id){ + LF_ASSERT_NON_NULL(net_abs); + sst_priv_t* priv = (sst_priv_t *)net_abs; + return memcmp(priv->pending_key.key_id, key_id, SESSION_KEY_ID_SIZE) == 0; +} + +/** + * Replaces the active session key with pending_key and resets both sequence counters to zero + * so encryption restarts cleanly with the new key. Called on the federate side in + * handle_rti_session_key_ack() and on the RTI side in handle_key_refresh_request(), after + * both sides have confirmed the key ID matches. + */ +void swap_to_pending_key(net_abstraction_t net_abs){ + LF_ASSERT_NON_NULL(net_abs); + sst_priv_t* priv = (sst_priv_t *)net_abs; + priv->session_ctx->s_key = priv->pending_key; + priv->session_ctx->received_seq_num = 0; + priv->session_ctx->sent_seq_num = 0; +} diff --git a/tag/api/tag.h b/tag/api/tag.h index 6638d8103..98104a942 100644 --- a/tag/api/tag.h +++ b/tag/api/tag.h @@ -306,6 +306,14 @@ instant_t lf_time_physical_elapsed(void); */ instant_t lf_time_start(void); +/** + * Return the tag at which the execution effectively started. + * Most of the time, this will default to {.time = start_time, .microstep = 0}. + * When the reactor is a transient federate, however, the value will be different. + * @return A tag. + */ +tag_t lf_tag_start_effective(void); + /** * @brief For user-friendly reporting of time values, the buffer length required. * @ingroup API @@ -359,4 +367,20 @@ size_t lf_readable_time(char* buffer, instant_t time); */ size_t lf_comma_separated_time(char* buffer, instant_t time); +/** + * @brief Parse a time value and units from strings, producing a time in nanoseconds. + * @ingroup API + * + * Recognized unit strings (case-sensitive, prefix match): + * "nsec" or "ns", "usec" or "us", "msec" or "ms", + * "sec" or "s" or "second", "min" or "minute", + * "hour", "day", "week".
+ * + * @param time_str A string representing a numeric time value (e.g., "500"). + * @param units_str A string representing the time units (e.g., "msec"). + * @param result Pointer to store the resulting time in nanoseconds. + * @return 0 on success, -1 if the time value is invalid, -2 if the units are invalid. + */ +int lf_time_parse(const char* time_str, const char* units_str, interval_t* result); + #endif // TAG_H diff --git a/test/general/utils/pqueue_test.c b/test/general/utils/pqueue_test.c index 665c4e13f..18b3009a8 100644 --- a/test/general/utils/pqueue_test.c +++ b/test/general/utils/pqueue_test.c @@ -23,6 +23,8 @@ static void insert_on_queue(pqueue_tag_t* q) { assert(!pqueue_tag_insert_tag(q, t2)); assert(!pqueue_tag_insert_tag(q, t3)); + assert(lf_tag_compare(pqueue_tag_max_tag(q), t1) == 0); + assert(!pqueue_tag_insert_if_no_match(q, t4)); assert(pqueue_tag_insert_if_no_match(q, t1)); assert(pqueue_tag_insert_if_no_match(q, t4)); diff --git a/trace/api/types/trace_types.h b/trace/api/types/trace_types.h index dbb672b67..16c89ece9 100644 --- a/trace/api/types/trace_types.h +++ b/trace/api/types/trace_types.h @@ -76,6 +76,8 @@ typedef enum { receive_ADR_QR, receive_DNET, receive_UNIDENTIFIED, + send_STOP, + receive_STOP, NUM_EVENT_TYPES } trace_event_t; @@ -142,6 +144,8 @@ static const char* trace_event_names[] = { "Receiving ADR_QR", "Receiving DNET", "Receiving UNIDENTIFIED", + "Sending STOP", + "Receiving STOP", }; static inline void _suppress_unused_variable_warning_for_static_variable() { (void)trace_event_names; } diff --git a/trace/impl/CMakeLists.txt b/trace/impl/CMakeLists.txt index f4a6b8b55..408aad43a 100644 --- a/trace/impl/CMakeLists.txt +++ b/trace/impl/CMakeLists.txt @@ -1,6 +1,41 @@ -set(LF_ROOT ${CMAKE_CURRENT_LIST_DIR}/../..) 
+## This CMakeLists is used in two ways: +## 1) Included from the reactor-c build (e.g., from `core/CMakeLists.txt`) +## 2) Built standalone by running CMake in `trace/impl/` +## +## When built standalone, we need to create the dependency targets that the +## implementation links against (they are lightweight INTERFACE libraries). + +set(_LF_TRACE_IMPL_STANDALONE OFF) +# Standalone means this directory is the top of the source tree; compare the expanded paths as quoted strings. +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") + cmake_minimum_required(VERSION 3.13) + project(lf_trace_impl LANGUAGES C) + set(CMAKE_C_STANDARD 11) + set(CMAKE_C_STANDARD_REQUIRED ON) + set(_LF_TRACE_IMPL_STANDALONE ON) +endif() + +get_filename_component(LF_ROOT "${CMAKE_CURRENT_LIST_DIR}/../.." ABSOLUTE) include(${LF_ROOT}/core/lf_utils.cmake) +# Ensure dependency targets exist when building standalone. +# When this file is included from the reactor-c build, that build is responsible +# for including these modules (and including them twice causes duplicate targets). +if(_LF_TRACE_IMPL_STANDALONE) + if(NOT TARGET lf::trace-api) + include(${LF_ROOT}/trace/api/CMakeLists.txt) + endif() + if(NOT TARGET lf::platform-api) + include(${LF_ROOT}/platform/api/CMakeLists.txt) + endif() + if(NOT TARGET lf::logging-api) + include(${LF_ROOT}/logging/api/CMakeLists.txt) + endif() + if(NOT TARGET lf::version-api) + include(${LF_ROOT}/version/api/CMakeLists.txt) + endif() +endif() + add_library(lf-trace-impl STATIC) add_library(lf::trace-impl ALIAS lf-trace-impl) target_link_libraries(lf-trace-impl PRIVATE lf::trace-api) diff --git a/trace/impl/build.sh b/trace/impl/build.sh index d96cc5b34..a98dd29d1 100755 --- a/trace/impl/build.sh +++ b/trace/impl/build.sh @@ -1,4 +1,12 @@ #!/bin/bash -cmake -S . -B build -DLOG_LEVEL=4 -cmake --build build +set -euo pipefail + +# `trace/impl` is a standalone CMake project (and can also be included from the +# reactor-c top-level build). This script builds the standalone project and +# produces `lib/lf-trace-impl.a`.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BUILD_DIR="${SCRIPT_DIR}/build" + +cmake -S "${SCRIPT_DIR}" -B "${BUILD_DIR}" -DLOG_LEVEL=4 +cmake --build "${BUILD_DIR}" --target lf-trace-impl diff --git a/trace/impl/src/trace_impl.c b/trace/impl/src/trace_impl.c index 72aac44d8..b0eb1498d 100644 --- a/trace/impl/src/trace_impl.c +++ b/trace/impl/src/trace_impl.c @@ -260,10 +260,37 @@ void lf_tracing_global_init(char* process_name, char* process_names, int fedid, } process_id = fedid; char filename[100]; + + // When tracing transient federates, a new trace file is created for each execution. For this, the function + // checks for file existence. If the file exists, the function appends a number to the file name and checks + // again. + int iter = 0; + bool file_exists = false; + bool new_file = false; if (strcmp(process_name, "rti") == 0) { snprintf(filename, sizeof(filename), "%s.lft", process_name); } else { - snprintf(filename, sizeof(filename), "%s_%d.lft", process_name, process_id); + FILE* file; + do { + if (iter == 0) { + snprintf(filename, sizeof(filename), "%s_%d.lft", process_name, process_id); + } else { + snprintf(filename, sizeof(filename), "%s_%d_%d.lft", process_name, process_id, iter); + } + file = fopen(filename, "r"); + if (file) { + file_exists = true; + new_file = true; + fclose(file); + iter++; + } else { + file_exists = false; + } + } while (file_exists); + } + if (new_file) { + lf_print_warning("No overwriting! The default file name already exists.
A new trace file named %s is created.", + filename); } trace_new(filename); start_trace(&trace, max_num_local_threads); diff --git a/util/tracing/visualization/fedsd.py b/util/tracing/visualization/fedsd.py index a680d27c4..8107d45bf 100644 --- a/util/tracing/visualization/fedsd.py +++ b/util/tracing/visualization/fedsd.py @@ -29,6 +29,7 @@ .DNET { stroke: purple; fill: purple} \ .TIMESTAMP { stroke: grey; fill: grey } \ .FED_ID {stroke: #80DD99; fill: #80DD99 } \ + .STOP {stroke: #d0b7eb; fill: #d0b7eb} \ .ADV {stroke-linecap="round" ; stroke: "red" ; fill: "red"} \ text { \ font-size: smaller; \ @@ -86,7 +87,9 @@ "Receiving ADR_QR": "ADR_QR", "Receiving DNET": "DNET", "Receiving UNIDENTIFIED": "UNIDENTIFIED", - "Scheduler advancing time ends": "AdvLT" + "Scheduler advancing time ends": "AdvLT", + "Sending STOP": "STOP", + "Receiving STOP": "STOP" } prune_event_name.setdefault(" ", "UNIDENTIFIED") @@ -113,7 +116,7 @@ # Events matching at the sender and receiver ends depend on whether they are tagged # (the elapsed logical time and microstep have to be the same) or not. 
# Set of tagged events (messages) -non_tagged_messages = {'FED_ID', 'ACK', 'RESIGN', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG'} +non_tagged_messages = {'FED_ID', 'ACK', 'RESIGN', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG', 'STOP'} ################################################################################ @@ -212,7 +215,6 @@ def svg_string_draw_label(x1, y1, x2, y2, label) : else: rotation = 0 str_line = '\t'+label+'\n' - #print('rot = '+str(rotation)+' x1='+str(x1)+' y1='+str(y1)+' x2='+str(x2)+' y2='+str(y2)) return str_line @@ -507,11 +509,17 @@ def get_and_convert_lft_files(rti_lft_file, federates_lft_files, start_time, end if (not fed_df.empty): # Get the federate id number fed_id = fed_df.iloc[-1]['self_id'] - # Add to the list of sequence diagram actors and add the name - actors.append(fed_id) - actors_names[fed_id] = Path(fed_trace).stem - # Derive the x coordinate of the actor - x_coor[fed_id] = (padding * 2) + (spacing * (len(actors) - 1)) + + ### Check that the federate id has not been entered yet. + ### This is particularly useful for transient actors, when + ### they leave and join several times + if (fed_id not in actors): + # Add to the list of sequence diagram actors and add the name + actors.append(fed_id) + actors_names[fed_id] = Path(fed_trace).stem + # Derive the x coordinate of the actor + x_coor[fed_id] = (padding * 2) + (spacing * (len(actors)-1)) + fed_df['x1'] = x_coor[fed_id] trace_df = pd.concat([trace_df, fed_df]) fed_df = fed_df[0:0] @@ -675,7 +683,7 @@ def get_and_convert_lft_files(rti_lft_file, federates_lft_files, start_time, end # FIXME: Using microseconds is hardwired here.
physical_time = f'{int(row["physical_time"]/1000):,}' - if (row['event'] in {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG'}): + if (row['event'] in {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_QR', 'ADR_AD', 'MSG', 'P2P_MSG', 'STOP'}): label = row['event'] else: label = row['event'] + '(' + f'{int(row["logical_time"]):,}' + ', ' + str(row['microstep']) + ')'