diff --git a/.github/workflows/github-actions-test-simple.yml b/.github/workflows/github-actions-test-simple.yml index f7692b09..1931f709 100644 --- a/.github/workflows/github-actions-test-simple.yml +++ b/.github/workflows/github-actions-test-simple.yml @@ -35,6 +35,7 @@ jobs: cd tests pytest test_feature_metrics.py pytest test_package.py + pytest test_drop_redundant_columns.py - name: Upload test results uses: actions/upload-artifact@v4 diff --git a/.gitignore b/.gitignore index aeadf1ab..a80f7b04 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,9 @@ share/python-wheels/ *.egg MANIFEST +# Claude Code +.claude/ + # IntelliJ .idea/ @@ -54,6 +57,10 @@ src/features/lexicons/certainty.txt examples/vector_data/* examples/output/* node_modules/ +*.csv +# but always track test fixture CSVs so they aren't silently dropped +!tests/data/cleaned_data/*.csv +*.log # testing /output diff --git a/docs/build/html/.buildinfo b/docs/build/html/.buildinfo index 5ac8e945..906d30ca 100644 --- a/docs/build/html/.buildinfo +++ b/docs/build/html/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: d7678f479036f3220c73480ec4f2c467 +config: 9a01a2cd3d4384710101b4a99edd7683 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/build/html/_sources/examples.rst.txt b/docs/build/html/_sources/examples.rst.txt index bab74995..14a7d8e6 100644 --- a/docs/build/html/_sources/examples.rst.txt +++ b/docs/build/html/_sources/examples.rst.txt @@ -94,7 +94,8 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de grouping_keys = ["batch_num", "round_num"], # NOTE: This example demonstrates grouping. Use conversation_id_col if you have a single conversation identifier. vector_directory = "./vector_data/", output_file_base = "jury_output", - turns = True # NOTE: This defaults to False. 
Decide whether you want to combine successive 'utterances' by the same person as a 'turn.' + turns = True, # NOTE: This defaults to False. Decide whether you want to combine successive 'utterances' by the same person as a 'turn.' + drop_redundant_columns = True # NOTE: This defaults to False. When True, highly correlated and sparse feature columns are dropped from the output. ) jury_feature_builder.featurize() @@ -241,18 +242,12 @@ Custom Features * You can chose to add any of these features depending on your preference. -Analyzing First Percentage (%) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Analyzing First Percentage (%) [Deprecated] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* The **analyze_first_pct** parameter allows you to "cut off" and separately analyze the first X% of a conversation, in case you wish to separately study different sections of a conversation as it progresses. For example, you may be interested in knowing how the attributes of the first 50% of a conversation differ from the attributes of the entire conversation. Then you can sepcify the following: +.. warning:: - .. code-block:: python - - analyze_first_pct: [0.5, 1.0] - - * This will first analyze the first 50% of each conversation, and then analyze the full conversation. - - * By default, we will simply analyze 100% of each conversation. + **Deprecated as of v.0.1.8.** The **analyze_first_pct** parameter (and its underlying ``get_first_pct_of_chat`` method) has been removed. To analyze only a portion of a conversation, subset your input dataframe before passing it to the FeatureBuilder. Named Entity Recognition ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -350,6 +345,42 @@ Important Notes and Caveats - **[NOTE 2]** Be careful when choosing the "sum" aggregation method, as it is not always appropriate to use the "sum" as an aggregation function. 
While it is a sensible choice for utterance-level attributes that are *countable* (for example, the total number of words, or other lexical wordcounts), it is a less sensible choice for others (for example, it does not make sense to sum sentiment scores for each utterance in a conversation). Consequently, using the "sum" feature will come with an associated warning. - **[NOTE 3]** In addition to aggregating from the utterance (chat) level to the conversation level, we also aggregate from the speaker (user) level to the conversation level, using the same methods specified in ``convo_methods`` to do so. +.. _reducing_redundant_features: + +Reducing Redundant Features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* **New in v.0.1.8.** The FeatureBuilder generates a large number of features, and many of them are highly correlated with one another or are sparsely populated (containing mostly missing or zero values). The following parameters allow the FeatureBuilder to automatically detect groups of redundant features and retain only one representative per group, as well as drop columns that are dominated by missing (NA) or zero values. + +* The **drop_redundant_columns** parameter is the main "switch" controlling this behavior. It defaults to **False**, meaning that no columns are dropped; instead, the FeatureBuilder simply *logs* which columns it identified as redundant. When set to **True**, the redundant columns are actually removed from the chat-, user-, and conversation-level outputs. + + * Note that only features generated by the FeatureBuilder are considered. Your original input columns (metadata, outcome variables, and other non-numeric columns) are always preserved untouched. + +* The remaining parameters control *how* redundancy is detected: + + * **corr_thresh** (default ``0.9``): the minimum absolute Spearman correlation at which two numeric features are treated as redundant. 
Features whose correlations meet this threshold are clustered into groups, and one representative (the column with the most valid data and highest variance) is kept from each group. + + * **min_group_size** (default ``2``): the minimum number of correlated columns required to form a redundancy group. + + * **min_na_ratio** (default ``0.3``): numeric columns whose fraction of missing (NA) values exceeds this threshold are flagged (and dropped, if **drop_redundant_columns** is **True**). + + * **min_zero_ratio** (default ``0.9``): numeric columns whose fraction of zero values exceeds this threshold are flagged (and dropped, if **drop_redundant_columns** is **True**). + + * **treat_zero_as_na** (default ``True``): if **True**, zeros are treated as missing values when computing redundancy metrics and selecting the representative column for each group. + +* **Example: dropping correlated features.** To actually drop redundant columns (rather than just logging them), set **drop_redundant_columns = True**. Recall that this parameter **defaults to False**, so you must opt in. The example below keeps only one representative from each group of features correlated at an absolute Spearman correlation of 0.9 or higher: + + .. code-block:: python + + jury_feature_builder = FeatureBuilder( + input_df = juries_df, + grouping_keys = ["batch_num", "round_num"], + output_file_base = "jury_output", + drop_redundant_columns = True, # Defaults to False; set to True to drop redundant columns rather than only logging them. + corr_thresh = 0.9 # Cluster features correlated at >= 0.9 (absolute Spearman) and keep one representative per group. 
+ ) + jury_feature_builder.featurize() + Cumulative Grouping ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/build/html/_static/pygments.css b/docs/build/html/_static/pygments.css index 84ab3030..6f8b210a 100644 --- a/docs/build/html/_static/pygments.css +++ b/docs/build/html/_static/pygments.css @@ -6,9 +6,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: .highlight .hll { background-color: #ffffcc } .highlight { background: #f8f8f8; } .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ -.highlight .err { border: 1px solid #FF0000 } /* Error */ +.highlight .err { border: 1px solid #F00 } /* Error */ .highlight .k { color: #008000; font-weight: bold } /* Keyword */ -.highlight .o { color: #666666 } /* Operator */ +.highlight .o { color: #666 } /* Operator */ .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ .highlight .cp { color: #9C6500 } /* Comment.Preproc */ @@ -25,34 +25,34 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ .highlight .gs { font-weight: bold } /* Generic.Strong */ .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ -.highlight .gt { color: #0044DD } /* Generic.Traceback */ +.highlight .gt { color: #04D } /* Generic.Traceback */ .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ .highlight .kp { color: #008000 } /* Keyword.Pseudo */ .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ .highlight .kt { color: #B00040 } /* Keyword.Type */ -.highlight .m { color: #666666 } /* Literal.Number */ +.highlight .m { color: #666 } /* Literal.Number */ .highlight .s { color: #BA2121 
} /* Literal.String */ .highlight .na { color: #687822 } /* Name.Attribute */ .highlight .nb { color: #008000 } /* Name.Builtin */ -.highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */ -.highlight .no { color: #880000 } /* Name.Constant */ -.highlight .nd { color: #AA22FF } /* Name.Decorator */ +.highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +.highlight .no { color: #800 } /* Name.Constant */ +.highlight .nd { color: #A2F } /* Name.Decorator */ .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ -.highlight .nf { color: #0000FF } /* Name.Function */ +.highlight .nf { color: #00F } /* Name.Function */ .highlight .nl { color: #767600 } /* Name.Label */ -.highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ +.highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ .highlight .nv { color: #19177C } /* Name.Variable */ -.highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ -.highlight .w { color: #bbbbbb } /* Text.Whitespace */ -.highlight .mb { color: #666666 } /* Literal.Number.Bin */ -.highlight .mf { color: #666666 } /* Literal.Number.Float */ -.highlight .mh { color: #666666 } /* Literal.Number.Hex */ -.highlight .mi { color: #666666 } /* Literal.Number.Integer */ -.highlight .mo { color: #666666 } /* Literal.Number.Oct */ +.highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #BBB } /* Text.Whitespace */ +.highlight .mb { color: #666 } /* Literal.Number.Bin */ +.highlight .mf { color: #666 } /* Literal.Number.Float */ +.highlight .mh { color: #666 } /* Literal.Number.Hex */ +.highlight .mi { color: #666 } /* Literal.Number.Integer */ +.highlight .mo { color: #666 } /* Literal.Number.Oct */ .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ .highlight .sb { color: 
#BA2121 } /* Literal.String.Backtick */ .highlight .sc { color: #BA2121 } /* Literal.String.Char */ @@ -67,9 +67,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ .highlight .ss { color: #19177C } /* Literal.String.Symbol */ .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ -.highlight .fm { color: #0000FF } /* Name.Function.Magic */ +.highlight .fm { color: #00F } /* Name.Function.Magic */ .highlight .vc { color: #19177C } /* Name.Variable.Class */ .highlight .vg { color: #19177C } /* Name.Variable.Global */ .highlight .vi { color: #19177C } /* Name.Variable.Instance */ .highlight .vm { color: #19177C } /* Name.Variable.Magic */ -.highlight .il { color: #666666 } /* Literal.Number.Integer.Long */ \ No newline at end of file +.highlight .il { color: #666 } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/docs/build/html/_static/searchtools.js b/docs/build/html/_static/searchtools.js index b08d58c9..eaed9095 100644 --- a/docs/build/html/_static/searchtools.js +++ b/docs/build/html/_static/searchtools.js @@ -328,14 +328,13 @@ const Search = { for (const [title, foundTitles] of Object.entries(allTitles)) { if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { for (const [file, id] of foundTitles) { - const score = Math.round(Scorer.title * queryLower.length / title.length); - const boost = titles[file] === title ? 1 : 0; // add a boost for document titles + let score = Math.round(100 * queryLower.length / title.length) normalResults.push([ docNames[file], titles[file] !== title ? `${titles[file]} > ${title}` : title, id !== null ? 
"#" + id : "", null, - score + boost, + score, filenames[file], ]); } diff --git a/docs/build/html/examples.html b/docs/build/html/examples.html index e1936257..b19da7ab 100644 --- a/docs/build/html/examples.html +++ b/docs/build/html/examples.html @@ -5,7 +5,7 @@ Worked Example — Team Communication Toolkit 0.1.1 documentation - + @@ -134,7 +134,7 @@

Import Recommendations: Virtual Environment and Pip

Importing the Package

After you import the package and install dependencies, you can then use our tool in your Python script as follows:

-
from team_comm_tools import FeatureBuilder
+
from team_comm_tools import FeatureBuilder
 

Now you have access to the feature_builder module. This is the main class that you’ll need to interact with the Team Communication Toolkit.

@@ -148,7 +148,7 @@

Walkthrough: Running the FeatureBuilder on Your DataConfiguring the FeatureBuilder

The FeatureBuilder accepts any Pandas DataFrame as the input, so you can read in data in whatever format you like. For the purposes of this walkthrough, we’ll be using some jury deliberation data from Hu et al. (2021).

We first import Pandas and read in the dataframe:

-
@@ -327,21 +328,12 @@
Custom Features -
Analyzing First Percentage (%)
-
    -
  • The analyze_first_pct parameter allows you to “cut off” and separately analyze the first X% of a conversation, in case you wish to separately study different sections of a conversation as it progresses. For example, you may be interested in knowing how the attributes of the first 50% of a conversation differ from the attributes of the entire conversation. Then you can sepcify the following:

    -
    -
    analyze_first_pct: [0.5, 1.0]
    -
    +
    +
    Analyzing First Percentage (%) [Deprecated]
    +
    +

    Warning

    +

    Deprecated and removed as of v.0.1.8. The analyze_first_pct parameter (and its underlying get_first_pct_of_chat method) is no longer available. To analyze only a portion of a conversation, subset your input dataframe before passing it to the FeatureBuilder.

    -
      -
    • This will first analyze the first 50% of each conversation, and then analyze the full conversation.

    • -
    • By default, we will simply analyze 100% of each conversation.

    • -
    -
    -
  • -
Named Entity Recognition
@@ -373,8 +365,8 @@
Named Entity Recognitionuser_columns: A list specifying which utterance-level attributes to aggregate at the speaker/user level.

The table below summarizes the different types of aggregation, and the ways in which they can be customized:

- - +
Aggregation Overview
+@@ -445,6 +437,44 @@
Important Notes and Caveats +
Reducing Redundant Features
+
    +
  • New in v.0.1.8. The FeatureBuilder generates a large number of features, and many of them are highly correlated with one another or are sparsely populated (containing mostly missing or zero values). The following parameters allow the FeatureBuilder to automatically detect groups of redundant features and retain only one representative per group, as well as drop columns that are dominated by missing (NA) or zero values.

  • +
  • The drop_redundant_columns parameter is the main “switch” controlling this behavior. It defaults to False, meaning that no columns are dropped; instead, the FeatureBuilder simply logs which columns it identified as redundant. When set to True, the redundant columns are actually removed from the chat-, user-, and conversation-level outputs.

    +
    +
      +
    • Note that only features generated by the FeatureBuilder are considered. Your original input columns (metadata, outcome variables, and other non-numeric columns) are always preserved untouched.

    • +
    +
    +
  • +
  • The remaining parameters control how redundancy is detected:

    +
    +
      +
    • corr_thresh (default 0.9): the minimum absolute Spearman correlation at which two numeric features are treated as redundant. Features whose correlations meet this threshold are clustered into groups, and one representative (the column with the most valid data and highest variance) is kept from each group.

    • +
    • min_group_size (default 2): the minimum number of correlated columns required to form a redundancy group.

    • +
    • min_na_ratio (default 0.3): numeric columns whose fraction of missing (NA) values exceeds this threshold are flagged (and dropped, if drop_redundant_columns is True).

    • +
    • min_zero_ratio (default 0.9): numeric columns whose fraction of zero values exceeds this threshold are flagged (and dropped, if drop_redundant_columns is True).

    • +
    • treat_zero_as_na (default True): if True, zeros are treated as missing values when computing redundancy metrics and selecting the representative column for each group.

    • +
    +
    +
  • +
  • Example: dropping correlated features. To actually drop redundant columns (rather than just logging them), set drop_redundant_columns = True. Recall that this parameter defaults to False, so you must opt in. The example below keeps only one representative from each group of features correlated at an absolute Spearman correlation of 0.9 or higher:

    +
    +
    jury_feature_builder = FeatureBuilder(
    +        input_df = juries_df,
    +        grouping_keys = ["batch_num", "round_num"],
    +        output_file_base = "jury_output",
    +        drop_redundant_columns = True,  # Defaults to False; set to True to drop redundant columns rather than only logging them.
    +        corr_thresh = 0.9               # Cluster features correlated at >= 0.9 (absolute Spearman) and keep one representative per group.
    +)
    +jury_feature_builder.featurize()
    +
    +
    +
    +
  • +
+
Cumulative Grouping
    diff --git a/docs/build/html/feature_builder.html b/docs/build/html/feature_builder.html index d6f474d0..b1e6b830 100644 --- a/docs/build/html/feature_builder.html +++ b/docs/build/html/feature_builder.html @@ -5,7 +5,7 @@ feature_builder module — Team Communication Toolkit 0.1.1 documentation - + @@ -56,8 +56,10 @@
  • FeatureBuilder.chat_level_features()
  • FeatureBuilder.conv_level_features()
  • FeatureBuilder.featurize()
  • -
  • FeatureBuilder.get_first_pct_of_chat()
  • +
  • FeatureBuilder.generate_summary_stats()
  • +
  • FeatureBuilder.keep_one_column_per_group()
  • FeatureBuilder.load_custem_liwc_dict()
  • +
  • FeatureBuilder.log_column_groups()
  • FeatureBuilder.merge_conv_data_with_original()
  • FeatureBuilder.preprocess_chat_data()
  • FeatureBuilder.save_features()
  • @@ -99,7 +101,7 @@

    feature_builder module

    -class feature_builder.FeatureBuilder(input_df: DataFrame, vector_directory: str = './vector_data/', output_file_base: str = 'output', output_file_path_chat_level: str = None, output_file_path_user_level: str = None, output_file_path_conv_level: str = None, custom_features: list = [], analyze_first_pct: list = [1.0], turns: bool = False, conversation_id_col: str = 'conversation_num', speaker_id_col: str = 'speaker_nickname', message_col: str = 'message', timestamp_col: str | tuple[str, str] = 'timestamp', timestamp_unit='ms', grouping_keys: list = [], cumulative_grouping=False, within_task=False, ner_training_df: DataFrame = None, ner_cutoff: int = 0.9, regenerate_vectors: bool = False, compute_vectors_from_preprocessed: bool = False, custom_liwc_dictionary_path: str = '', convo_aggregation=True, convo_methods: list = ['mean', 'max', 'min', 'stdev'], convo_columns: list = None, user_aggregation=True, user_methods: list = ['mean', 'max', 'min', 'stdev'], user_columns: list = None, use_gpu: bool = False)
    +class feature_builder.FeatureBuilder(input_df: DataFrame, vector_directory: str = './vector_data/', output_file_base: str = 'output', output_file_path_chat_level: str = None, output_file_path_user_level: str = None, output_file_path_conv_level: str = None, custom_features: list = [], turns: bool = False, conversation_id_col: str = 'conversation_num', speaker_id_col: str = 'speaker_nickname', message_col: str = 'message', timestamp_col: str | tuple[str, str] = 'timestamp', timestamp_unit='ms', grouping_keys: list = [], cumulative_grouping=False, within_task=False, ner_training_df: DataFrame = None, ner_cutoff: float = 0.9, regenerate_vectors: bool = False, compute_vectors_from_preprocessed: bool = False, custom_liwc_dictionary_path: str = '', convo_aggregation=True, convo_methods: list = ['mean', 'max', 'min', 'stdev'], convo_columns: list = None, user_aggregation=True, user_methods: list = ['mean', 'max', 'min', 'stdev'], user_columns: list = None, use_gpu: bool = False, corr_thresh: float = 0.9, min_na_ratio: float = 0.3, min_zero_ratio: float = 0.9, min_group_size: int = 2, treat_zero_as_na: bool = True, drop_redundant_columns: bool = False)

    Bases: object

    The FeatureBuilder is the main engine that reads in the user’s inputs and specifications and generates conversational features. The FeatureBuilder separately calls the classes @@ -121,8 +123,6 @@

  • custom_features (list, optional) – A list of additional features outside of the default features that should be calculated. Defaults to an empty list (i.e., no additional features beyond the defaults will be computed).

  • -
  • analyze_first_pct (list(float), optional) – Analyze the first X% of the data. This parameter is useful because the -earlier stages of the conversation may be more predictive than the later stages. Defaults to [1.0].

  • turns (bool, optional) – If true, collapses multiple “chats”/messages by the same speaker in a row into a single “turn.” Defaults to False.

  • conversation_id_col (str, optional) – A string representing the column name that should be selected as @@ -131,7 +131,7 @@ Defaults to “speaker_nickname”.

  • message_col (str, optional) – A string representing the column name that should be selected as the message. Defaults to “message”.

  • -
  • timestamp_col (str, optional) – A string representing the column name that should be selected as the message. +

  • timestamp_col (str | tuple[str, str], optional) – A timestamp column name, or a tuple of (start_timestamp_col, end_timestamp_col). Defaults to “timestamp”.

  • timestamp_unit (str, optional) – A string representing the unit of the timestamp (if the timestamp is numeric). Defaults to ‘ms’ (milliseconds). Other options (D, s, ms, us, ns) can be found on the Pandas @@ -141,14 +141,14 @@ “conversational identifier.”

  • cumulative_grouping (bool, optional) – If true, uses a cumulative way of grouping chats (looking not just within a single ID, but also at what happened before). NOTE: This parameter and the following one -(within_grouping) were created in the context of a multi-stage Empirica game (see: +(within_task) were created in the context of a multi-stage Empirica game (see: https://github.com/Watts-Lab/multi-task-empirica). Assumes exactly 3 nested columns at different levels: a High, Mid, and Low level; that are temporally nested. Defaults to False.

  • within_task (bool, optional) – If true, groups cumulatively such that only prior chats of the same “task” (Mid-level identifier) are considered. Defaults to False.

  • ner_training_df (pd.DataFrame, optional) – A pandas DataFrame of training data for named entity recognition features. Defaults to None and will not generate named entity features if it does not exist.

  • -
  • ner_cutoff (int) – The cutoff value for the confidence of prediction for each named entity. +

  • ner_cutoff (float) – The cutoff value for the confidence of prediction for each named entity. Defaults to 0.9.

  • regenerate_vectors (bool, optional) – If true, regenerates vector data even if it already exists. Defaults to False.

  • compute_vectors_from_preprocessed (bool, optional) – If true, computes vectors using preprocessed text (with @@ -165,6 +165,18 @@

  • user_columns (list, optional) – Specifies which columns (at the utterance/chat level) to aggregate for the speaker/user level. Defaults to all numeric columns.

  • use_gpu (bool, optional) – Specifies whether to use GPU for vert/bert model. Defaults to False.

  • +
  • corr_thresh (float, optional) – Minimum absolute Spearman correlation used to treat two numeric +columns as redundant during summary reduction. Defaults to 0.9.

  • +
  • min_na_ratio (float, optional) – Threshold for dropping numeric columns with high missing-value +ratio during summary reduction. Defaults to 0.3.

  • +
  • min_zero_ratio (float, optional) – Threshold for dropping numeric columns with high zero ratio +during summary reduction. Defaults to 0.9.

  • +
  • min_group_size (int, optional) – Minimum connected-component size to treat a correlated set +of columns as a redundancy group. Defaults to 2.

  • +
  • treat_zero_as_na (bool, optional) – If true, zeros are treated as missing values when computing +redundancy metrics and selecting representative columns. Defaults to True.

  • +
  • drop_redundant_columns (bool, optional) – If true, chat/user/conversation outputs are reduced to +representative numeric columns based on summary statistics. Defaults to False.

Returns:
@@ -227,20 +239,43 @@
-
-get_first_pct_of_chat(percentage) None
-

Truncate each conversation to the first X% of rows.

-

This function groups the chat data by conversation_num and retains only -the first X% of rows for each conversation.

+
+generate_summary_stats(df) DataFrame
+

Log and optionally reduce redundant numeric feature columns.

+

The method identifies numeric columns with high missingness and zero rates, +discovers highly correlated feature groups, and retains one representative +per group. Non-numeric columns are preserved and reattached before return.

Parameters:
-

percentage (float) – Percentage of rows to retain in each conversation

+

df (pd.DataFrame) – Input dataframe to summarize and optionally reduce.

Returns:
-

None

+

Dataframe with non-numeric columns plus filtered numeric columns.

Return type:
-

None

+

pd.DataFrame

+
+
+
+ +
+
+keep_one_column_per_group(df, groups)
+

Select one representative column from each correlated group.

+

Non-grouped columns are preserved, and grouped columns are reduced to the +best-scoring representative based on valid-count and variance.

+
+
Parameters:
+
    +
  • df (pd.DataFrame) – Original dataframe.

  • +
  • groups (list[list[str]]) – Groups of similar columns.

  • +
+
+
Returns:
+

Final list of columns to keep.

+
+
Return type:
+

list[str]

@@ -249,10 +284,11 @@
load_custem_liwc_dict(custom_liwc_dictionary_path: str) dict

Load the custom LIWC dictionary from the provided path.

-

This function reads the custom LIWC dictionary from the provided path and returns the dictionary.

+

This function reads the custom LIWC dictionary from the provided path and returns +the parsed dictionary. If the path is empty/invalid, returns an empty dict.

Parameters:
-

custom_liwc_dictionary_path (str) – Path to the custom LIWC dictionary file

+

custom_liwc_dictionary_path (str) – Path to the custom LIWC dictionary file.

Returns:

Custom LIWC dictionary

@@ -263,6 +299,28 @@
+
+
+log_column_groups(groups, max_groups, max_cols_per_group)
+

Log correlated feature groups to standard and detailed loggers.

+
+
Parameters:
+
    +
  • groups (list[list[str]]) – Correlated column groups.

  • +
  • max_groups (int) – Maximum number of groups to print to the standard logger.

  • +
  • max_cols_per_group (int) – Maximum number of columns shown per group in +the standard logger.

  • +
+
+
Returns:
+

None

+
+
Return type:
+

None

+
+
+
+
merge_conv_data_with_original() None
@@ -287,19 +345,12 @@

Call all preprocessing modules needed to clean the chat text.

This function groups the chat data as specified, verifies column presence, creates original and lowercased columns, preprocesses text, and optionally processes chat turns.

-
Parameters:
-
    -
  • turns (bool, optional) – Whether to preprocess naive turns, defaults to False

  • -
  • col – Columns to preprocess, including conversation_id, speaker_id and message, defaults to None

  • -
  • within_task (bool, optional) – Whether to group within tasks, defaults to False

  • -
+
Returns:
+

None

-
Returns:
+
Return type:

None

-
Return type:
-

None

-
@@ -359,7 +410,7 @@ either as datetime or as numeric values suitable for time difference calculations.

Parameters:
-

timestamp_col (str) – The name of the column to verify

+

timestamp_col (str) – The name of the column to verify.

Returns:

None

diff --git a/docs/build/html/features/basic_features.html b/docs/build/html/features/basic_features.html index fb076f47..a3185660 100644 --- a/docs/build/html/features/basic_features.html +++ b/docs/build/html/features/basic_features.html @@ -5,7 +5,7 @@ basic_features module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/burstiness.html b/docs/build/html/features/burstiness.html index 621048c4..e8c6eacb 100644 --- a/docs/build/html/features/burstiness.html +++ b/docs/build/html/features/burstiness.html @@ -5,7 +5,7 @@ burstiness module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/certainty.html b/docs/build/html/features/certainty.html index 0c5db974..cafed709 100644 --- a/docs/build/html/features/certainty.html +++ b/docs/build/html/features/certainty.html @@ -5,7 +5,7 @@ certainty module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/discursive_diversity.html b/docs/build/html/features/discursive_diversity.html index e1b6a9d0..565ecfd0 100644 --- a/docs/build/html/features/discursive_diversity.html +++ b/docs/build/html/features/discursive_diversity.html @@ -5,7 +5,7 @@ discursive_diversity module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/fflow.html b/docs/build/html/features/fflow.html index b34292b1..6fd530b8 100644 --- a/docs/build/html/features/fflow.html +++ b/docs/build/html/features/fflow.html @@ -5,7 +5,7 @@ fflow module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/get_all_DD_features.html b/docs/build/html/features/get_all_DD_features.html index 98feae1c..6383c54c 100644 --- a/docs/build/html/features/get_all_DD_features.html +++ b/docs/build/html/features/get_all_DD_features.html @@ -5,7 +5,7 @@ get_all_DD_features module — Team Communication Toolkit 0.1.1 documentation - + diff --git 
a/docs/build/html/features/get_user_network.html b/docs/build/html/features/get_user_network.html index 2f6ec589..fc9d2328 100644 --- a/docs/build/html/features/get_user_network.html +++ b/docs/build/html/features/get_user_network.html @@ -5,7 +5,7 @@ get_user_network module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/hedge.html b/docs/build/html/features/hedge.html index dfdadbc8..292577b9 100644 --- a/docs/build/html/features/hedge.html +++ b/docs/build/html/features/hedge.html @@ -5,7 +5,7 @@ hedge module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/index.html b/docs/build/html/features/index.html index 62668b2b..cfdb2df2 100644 --- a/docs/build/html/features/index.html +++ b/docs/build/html/features/index.html @@ -5,7 +5,7 @@ Features: Technical Documentation — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/info_exchange_zscore.html b/docs/build/html/features/info_exchange_zscore.html index 74b9b3fb..e84769c5 100644 --- a/docs/build/html/features/info_exchange_zscore.html +++ b/docs/build/html/features/info_exchange_zscore.html @@ -5,7 +5,7 @@ info_exchange_zscore module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/information_diversity.html b/docs/build/html/features/information_diversity.html index f6a9d846..667f5cb6 100644 --- a/docs/build/html/features/information_diversity.html +++ b/docs/build/html/features/information_diversity.html @@ -5,7 +5,7 @@ information_diversity module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/lexical_features_v2.html b/docs/build/html/features/lexical_features_v2.html index b5a32c98..39e2ae12 100644 --- a/docs/build/html/features/lexical_features_v2.html +++ b/docs/build/html/features/lexical_features_v2.html @@ -5,7 +5,7 @@ lexical_features_v2 module — Team Communication Toolkit 0.1.1 documentation - + 
diff --git a/docs/build/html/features/named_entity_recognition_features.html b/docs/build/html/features/named_entity_recognition_features.html index 9362689f..7446a3f3 100644 --- a/docs/build/html/features/named_entity_recognition_features.html +++ b/docs/build/html/features/named_entity_recognition_features.html @@ -5,7 +5,7 @@ named_entity_recognition_features module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/other_lexical_features.html b/docs/build/html/features/other_lexical_features.html index 7e8d54e6..75f6b189 100644 --- a/docs/build/html/features/other_lexical_features.html +++ b/docs/build/html/features/other_lexical_features.html @@ -5,7 +5,7 @@ other_lexical_features module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/politeness_features.html b/docs/build/html/features/politeness_features.html index b2993712..e90ffe38 100644 --- a/docs/build/html/features/politeness_features.html +++ b/docs/build/html/features/politeness_features.html @@ -5,7 +5,7 @@ politeness_features module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/politeness_v2.html b/docs/build/html/features/politeness_v2.html index a4f6a697..d7281bc3 100644 --- a/docs/build/html/features/politeness_v2.html +++ b/docs/build/html/features/politeness_v2.html @@ -5,7 +5,7 @@ politeness_v2 module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/politeness_v2_helper.html b/docs/build/html/features/politeness_v2_helper.html index 15384d08..7167c13e 100644 --- a/docs/build/html/features/politeness_v2_helper.html +++ b/docs/build/html/features/politeness_v2_helper.html @@ -5,7 +5,7 @@ politeness_v2_helper module — Team Communication Toolkit 0.1.1 documentation - + @@ -134,16 +134,16 @@
features.politeness_v2_helper.Question(doc)
-

Counts the number of sentences containing question words and question marks.

+

Counts the number of sentences containing question words and question marks. +Reference: https://github.com/bbevis/politenessPy/blob/main/strategy_extractor.py +:param doc: The spaCy Doc object containing the text to be analyzed. +:type doc: spacy.tokens.Doc

-
Parameters:
-

doc (spacy.tokens.Doc) – The spaCy Doc object containing the text to be analyzed.

-
-
Returns:
-

A tuple containing the counts of Yes/No questions and WH-questions.

+
Returns:
+

A tuple containing the counts of Yes/No questions and WH-questions.

-
Return type:
-

tuple

+
Return type:
+

tuple

diff --git a/docs/build/html/features/question_num.html b/docs/build/html/features/question_num.html index eeecb2c5..c3333139 100644 --- a/docs/build/html/features/question_num.html +++ b/docs/build/html/features/question_num.html @@ -5,7 +5,7 @@ question_num module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/readability.html b/docs/build/html/features/readability.html index 64677857..f127aa5b 100644 --- a/docs/build/html/features/readability.html +++ b/docs/build/html/features/readability.html @@ -5,7 +5,7 @@ readability module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/reddit_tags.html b/docs/build/html/features/reddit_tags.html index c40a3c51..b946042e 100644 --- a/docs/build/html/features/reddit_tags.html +++ b/docs/build/html/features/reddit_tags.html @@ -5,7 +5,7 @@ reddit_tags module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/temporal_features.html b/docs/build/html/features/temporal_features.html index 0e073f57..a9b30791 100644 --- a/docs/build/html/features/temporal_features.html +++ b/docs/build/html/features/temporal_features.html @@ -5,7 +5,7 @@ temporal_features module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/textblob_sentiment_analysis.html b/docs/build/html/features/textblob_sentiment_analysis.html index 64f5b70f..b243301c 100644 --- a/docs/build/html/features/textblob_sentiment_analysis.html +++ b/docs/build/html/features/textblob_sentiment_analysis.html @@ -5,7 +5,7 @@ textblob_sentiment_analysis module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/turn_taking_features.html b/docs/build/html/features/turn_taking_features.html index 05f45fc1..0017dbbf 100644 --- a/docs/build/html/features/turn_taking_features.html +++ b/docs/build/html/features/turn_taking_features.html @@ -5,7 +5,7 @@ turn_taking_features module — 
Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/variance_in_DD.html b/docs/build/html/features/variance_in_DD.html index e4c14026..7269c81d 100644 --- a/docs/build/html/features/variance_in_DD.html +++ b/docs/build/html/features/variance_in_DD.html @@ -5,7 +5,7 @@ variance_in_DD module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/within_person_discursive_range.html b/docs/build/html/features/within_person_discursive_range.html index b4f48d8c..ca68f3a5 100644 --- a/docs/build/html/features/within_person_discursive_range.html +++ b/docs/build/html/features/within_person_discursive_range.html @@ -5,7 +5,7 @@ within_person_discursive_range module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features/word_mimicry.html b/docs/build/html/features/word_mimicry.html index ff752fbd..821870e1 100644 --- a/docs/build/html/features/word_mimicry.html +++ b/docs/build/html/features/word_mimicry.html @@ -5,7 +5,7 @@ word_mimicry module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/TEMPLATE.html b/docs/build/html/features_conceptual/TEMPLATE.html index 502b4bf2..9224c03c 100644 --- a/docs/build/html/features_conceptual/TEMPLATE.html +++ b/docs/build/html/features_conceptual/TEMPLATE.html @@ -5,7 +5,7 @@ FEATURE NAME — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/content_word_accommodation.html b/docs/build/html/features_conceptual/content_word_accommodation.html index f65076bb..406d5648 100644 --- a/docs/build/html/features_conceptual/content_word_accommodation.html +++ b/docs/build/html/features_conceptual/content_word_accommodation.html @@ -5,7 +5,7 @@ Content Word Accommodation — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/function_word_accommodation.html 
b/docs/build/html/features_conceptual/function_word_accommodation.html index eea9ee2e..7a15fd23 100644 --- a/docs/build/html/features_conceptual/function_word_accommodation.html +++ b/docs/build/html/features_conceptual/function_word_accommodation.html @@ -5,7 +5,7 @@ Function Word Accommodation — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/index.html b/docs/build/html/features_conceptual/index.html index abbd3065..56877e4c 100644 --- a/docs/build/html/features_conceptual/index.html +++ b/docs/build/html/features_conceptual/index.html @@ -5,7 +5,7 @@ Features: Conceptual Documentation — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/mimicry_bert.html b/docs/build/html/features_conceptual/mimicry_bert.html index 3c336275..d700313b 100644 --- a/docs/build/html/features_conceptual/mimicry_bert.html +++ b/docs/build/html/features_conceptual/mimicry_bert.html @@ -5,7 +5,7 @@ Mimicry (BERT) — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/moving_mimicry.html b/docs/build/html/features_conceptual/moving_mimicry.html index a0a73784..6bfa9d79 100644 --- a/docs/build/html/features_conceptual/moving_mimicry.html +++ b/docs/build/html/features_conceptual/moving_mimicry.html @@ -5,7 +5,7 @@ Moving Mimicry — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/named_entity_recognition.html b/docs/build/html/features_conceptual/named_entity_recognition.html index 699023d1..10fdc796 100644 --- a/docs/build/html/features_conceptual/named_entity_recognition.html +++ b/docs/build/html/features_conceptual/named_entity_recognition.html @@ -5,7 +5,7 @@ Named Entity Recognition — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/positivity_bert.html b/docs/build/html/features_conceptual/positivity_bert.html index 9b11f4bd..9cf6455d 
100644 --- a/docs/build/html/features_conceptual/positivity_bert.html +++ b/docs/build/html/features_conceptual/positivity_bert.html @@ -5,7 +5,7 @@ Sentiment (RoBERTa) — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/turn_taking_index.html b/docs/build/html/features_conceptual/turn_taking_index.html index a248019b..7ca0f119 100644 --- a/docs/build/html/features_conceptual/turn_taking_index.html +++ b/docs/build/html/features_conceptual/turn_taking_index.html @@ -5,7 +5,7 @@ Turn Taking Index — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/features_conceptual/word_ttr.html b/docs/build/html/features_conceptual/word_ttr.html index 2dd1f03d..91047dd1 100644 --- a/docs/build/html/features_conceptual/word_ttr.html +++ b/docs/build/html/features_conceptual/word_ttr.html @@ -5,7 +5,7 @@ Word Type-Token Ratio — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html index 4c705366..11c6fcfd 100644 --- a/docs/build/html/genindex.html +++ b/docs/build/html/genindex.html @@ -4,7 +4,7 @@ Index — Team Communication Toolkit 0.1.1 documentation - + @@ -86,6 +86,7 @@

Index

| F | G | I + | K | L | M | N @@ -462,6 +463,8 @@

G

  • generate_certainty_pkl() (in module utils.check_embeddings)
  • generate_lexicon_pkl() (in module utils.check_embeddings) +
  • +
  • generate_summary_stats() (feature_builder.FeatureBuilder method)
  • generate_vect() (in module utils.check_embeddings)
  • @@ -490,8 +493,6 @@

    G

  • get_dep_pairs_noneg() (in module features.politeness_v2_helper)
  • get_discursive_diversity_features() (utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator method) -
  • -
  • get_first_pct_of_chat() (feature_builder.FeatureBuilder method)
  • get_first_person_words() (in module utils.preload_word_lists)
  • @@ -628,6 +629,14 @@

    I

    Aggregation Overview
    +

    K

    + + +
    +

    L

    - +
    @@ -820,11 +831,13 @@

    S

  • sentence_split() (in module features.politeness_v2_helper)
  • - - +
    • set_self_conv_data() (feature_builder.FeatureBuilder method) +
    • +
    • setup_logger() (in module utils.preprocess)
    • sort_words() (in module utils.check_embeddings)
    • diff --git a/docs/build/html/index.html b/docs/build/html/index.html index de04f500..07143a50 100644 --- a/docs/build/html/index.html +++ b/docs/build/html/index.html @@ -5,7 +5,7 @@ The Team Communication Toolkit — Team Communication Toolkit 0.1.1 documentation - + @@ -102,7 +102,7 @@

      Import Recommendations: Virtual Environment and PipWe strongly recommend using a virtual environment in Python to run the package. We have several specific dependency requirements. One important one is that we are currently only compatible with numpy < 2.0.0 because numpy 2.0.0 and above made significant changes that are not compatible with other dependencies of our package. As those dependencies are updated, we will support later versions of numpy.

      We also strongly recommend that your version of pip is up-to-date (>=24.0). There have been reports in which users have had trouble downloading dependencies (specifically, the Spacy package) with older versions of pip. If you get an error with downloading en_core_web_sm, we recommend updating pip.

      After you import the package and install dependencies, you can then use our tool in your Python script as follows:

      -
      from team_comm_tools import FeatureBuilder
      +
      from team_comm_tools import FeatureBuilder
       

      Note: PyPI treats hyphens and underscores equally, so “pip install team_comm_tools” and “pip install team-comm-tools” are equivalent. However, Python does NOT treat them equally, and you should use underscores when you import the package, like this: from team_comm_tools import FeatureBuilder.

      diff --git a/docs/build/html/intro.html b/docs/build/html/intro.html index 59e7ccf0..7346f6da 100644 --- a/docs/build/html/intro.html +++ b/docs/build/html/intro.html @@ -5,7 +5,7 @@ Introduction — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/objects.inv b/docs/build/html/objects.inv index cd350844..235e9a3f 100644 Binary files a/docs/build/html/objects.inv and b/docs/build/html/objects.inv differ diff --git a/docs/build/html/py-modindex.html b/docs/build/html/py-modindex.html index fc63afc0..f21e03c9 100644 --- a/docs/build/html/py-modindex.html +++ b/docs/build/html/py-modindex.html @@ -4,7 +4,7 @@ Python Module Index — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/search.html b/docs/build/html/search.html index 55b7894e..3c8da28b 100644 --- a/docs/build/html/search.html +++ b/docs/build/html/search.html @@ -4,7 +4,7 @@ Search — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js index f5ab94f6..6a846766 100644 --- a/docs/build/html/searchindex.js +++ b/docs/build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"A Light-Touch, One-Function Package": [[0, "a-light-touch-one-function-package"]], "Additional FeatureBuilder Considerations": [[1, "additional-featurebuilder-considerations"]], "Advanced Configuration Columns": [[1, "advanced-configuration-columns"]], "Aggregation Overview": [[1, "id2"]], "Analyzing First Percentage (%)": [[1, "analyzing-first-percentage"]], "Base Conversation-Level Features": [[11, "base-conversation-level-features"]], "Basic Input Columns": [[1, "basic-input-columns"]], "Certainty": [[30, null]], "Citation": [[29, "citation"], [30, "citation"], [31, "citation"], [32, "citation"], [33, "citation"], [34, "citation"], [35, "citation"], [36, "citation"], [37, "citation"], [38, "citation"], [40, "citation"], [41, "citation"], [42, "citation"], [43, "citation"], [44, 
"citation"], [45, "citation"], [46, "citation"], [47, "citation"], [48, "citation"], [49, "citation"], [50, "citation"], [51, "citation"], [52, "citation"], [53, "citation"], [54, "citation"], [55, "citation"], [56, "citation"], [57, "citation"], [58, "citation"], [59, "citation"], [60, "citation"]], "Configuring the FeatureBuilder": [[1, "configuring-the-featurebuilder"]], "Content Word Accommodation": [[31, null]], "Contents:": [[61, null]], "Conversation Parameters": [[1, "conversation-parameters"]], "Conversation-Level Aggregates": [[11, "conversation-level-aggregates"]], "Conversation-Level Features": [[11, "conversation-level-features"], [39, "conversation-level-features"]], "Conversational Repair": [[32, null]], "Cumulative Grouping": [[1, "cumulative-grouping"]], "Custom Aggregation": [[1, "custom-aggregation"]], "Custom Features": [[1, "custom-features"]], "Customizable Parameters": [[0, "customizable-parameters"]], "Dale-Chall Score": [[33, null]], "Declaring a FeatureBuilder": [[61, "declaring-a-featurebuilder"]], "Demo / Sample Code": [[0, "demo-sample-code"], [1, "demo-sample-code"]], "Discursive Diversity": [[34, null]], "Example Usage of Custom Aggregation Parameters": [[1, "example-usage-of-custom-aggregation-parameters"]], "Example:": [[41, "example"]], "FEATURE NAME": [[29, null]], "Feature Column Names": [[1, "feature-column-names"], [61, "feature-column-names"]], "Feature Documentation": [[62, "feature-documentation"]], "Feature Information": [[1, "feature-information"], [61, "feature-information"]], "Features: Conceptual Documentation": [[39, null]], "Features: Technical Documentation": [[11, null]], "Forward Flow": [[35, null]], "Function Word Accommodation": [[36, null]], "Generating Features: Utterance-, Speaker-, and Conversation-Level": [[62, "generating-features-utterance-speaker-and-conversation-level"]], "Generating Vectors using GPU": [[1, "generating-vectors-using-gpu"]], "Getting Started": [[1, "getting-started"], [61, 
"getting-started"], [62, "getting-started"]], "Gini Coefficient": [[37, null]], "Hedge": [[38, null]], "High*Level Intuition": [[54, "high-level-intuition"]], "High-Level Intuition": [[29, "high-level-intuition"], [30, "high-level-intuition"], [31, "high-level-intuition"], [32, "high-level-intuition"], [33, "high-level-intuition"], [34, "high-level-intuition"], [35, "high-level-intuition"], [36, "high-level-intuition"], [37, "high-level-intuition"], [38, "high-level-intuition"], [40, "high-level-intuition"], [41, "high-level-intuition"], [42, "high-level-intuition"], [43, "high-level-intuition"], [44, "high-level-intuition"], [45, "high-level-intuition"], [46, "high-level-intuition"], [47, "high-level-intuition"], [48, "high-level-intuition"], [49, "high-level-intuition"], [50, "high-level-intuition"], [51, "high-level-intuition"], [52, "high-level-intuition"], [53, "high-level-intuition"], [55, "high-level-intuition"], [56, "high-level-intuition"], [57, "high-level-intuition"], [58, "high-level-intuition"], [59, "high-level-intuition"], [60, "high-level-intuition"]], "Implementation": [[32, "implementation"], [42, "implementation"], [52, "implementation"], [54, "implementation"]], "Implementation Basics": [[29, "implementation-basics"], [30, "implementation-basics"], [31, "implementation-basics"], [33, "implementation-basics"], [34, "implementation-basics"], [35, "implementation-basics"], [36, "implementation-basics"], [37, "implementation-basics"], [38, "implementation-basics"], [40, "implementation-basics"], [41, "implementation-basics"], [43, "implementation-basics"], [44, "implementation-basics"], [45, "implementation-basics"], [46, "implementation-basics"], [47, "implementation-basics"], [48, "implementation-basics"], [49, "implementation-basics"], [50, "implementation-basics"], [51, "implementation-basics"], [53, "implementation-basics"], [55, "implementation-basics"], [56, "implementation-basics"], [57, "implementation-basics"], [58, 
"implementation-basics"], [59, "implementation-basics"], [60, "implementation-basics"]], "Implementation Notes/Caveats": [[29, "implementation-notes-caveats"], [30, "implementation-notes-caveats"], [31, "implementation-notes-caveats"], [33, "implementation-notes-caveats"], [34, "implementation-notes-caveats"], [35, "implementation-notes-caveats"], [36, "implementation-notes-caveats"], [38, "implementation-notes-caveats"], [40, "implementation-notes-caveats"], [41, "implementation-notes-caveats"], [43, "implementation-notes-caveats"], [44, "implementation-notes-caveats"], [45, "implementation-notes-caveats"], [46, "implementation-notes-caveats"], [47, "implementation-notes-caveats"], [48, "implementation-notes-caveats"], [49, "implementation-notes-caveats"], [50, "implementation-notes-caveats"], [51, "implementation-notes-caveats"], [53, "implementation-notes-caveats"], [55, "implementation-notes-caveats"], [56, "implementation-notes-caveats"], [57, "implementation-notes-caveats"], [58, "implementation-notes-caveats"], [59, "implementation-notes-caveats"]], "Import Recommendations: Virtual Environment and Pip": [[1, "import-recommendations-virtual-environment-and-pip"], [61, "import-recommendations-virtual-environment-and-pip"]], "Important Notes and Caveats": [[1, "important-notes-and-caveats"]], "Importing the Package": [[1, "importing-the-package"]], "Indices and Tables": [[61, "indices-and-tables"]], "Information Diversity": [[40, null]], "Information Exchange": [[41, null]], "Input File": [[34, "id2"]], "Inspecting Generated Features": [[1, "inspecting-generated-features"], [61, "inspecting-generated-features"]], "Interpretation:": [[41, "interpretation"]], "Interpreting the Feature": [[29, "interpreting-the-feature"], [30, "interpreting-the-feature"], [31, "interpreting-the-feature"], [32, "interpreting-the-feature"], [33, "interpreting-the-feature"], [34, "interpreting-the-feature"], [35, "interpreting-the-feature"], [36, "interpreting-the-feature"], [37, 
"interpreting-the-feature"], [38, "interpreting-the-feature"], [40, "interpreting-the-feature"], [41, "interpreting-the-feature"], [42, "interpreting-the-feature"], [43, "interpreting-the-feature"], [44, "interpreting-the-feature"], [45, "interpreting-the-feature"], [46, "interpreting-the-feature"], [47, "interpreting-the-feature"], [48, "interpreting-the-feature"], [49, "interpreting-the-feature"], [50, "interpreting-the-feature"], [51, "interpreting-the-feature"], [52, "interpreting-the-feature"], [53, "interpreting-the-feature"], [54, "interpreting-the-feature"], [55, "interpreting-the-feature"], [56, "interpreting-the-feature"], [57, "interpreting-the-feature"], [58, "interpreting-the-feature"], [59, "interpreting-the-feature"], [60, "interpreting-the-feature"]], "Introduction": [[62, null]], "Key Assumptions and Parameters": [[0, "key-assumptions-and-parameters"]], "Linguistic Inquiry and Word Count (LIWC) and Other Lexicons": [[42, null]], "Message Length": [[43, null]], "Message Quantity": [[44, null]], "Mimicry (BERT)": [[45, null]], "Motivation": [[62, "motivation"]], "Moving Mimicry": [[46, null]], "Named Entity Recognition": [[1, "named-entity-recognition"], [47, null]], "Named Entity Training Examples": [[47, "id2"]], "New in v.1.0.5: \u201cBring Your Own LIWC\u201d Custom Lexicon": [[42, "new-in-v-1-0-5-bring-your-own-liwc-custom-lexicon"]], "Online Discussion Tags": [[48, null]], "Other Utilities": [[69, "other-utilities"]], "Ouput File": [[34, "id3"]], "Our Team": [[62, "our-team"]], "Output File": [[30, "id2"], [35, "id2"], [45, "id2"], [46, "id2"], [47, "id3"], [51, "id1"]], "Output File Naming Details": [[1, "output-file-naming-details"]], "Package Assumptions": [[0, "package-assumptions"]], "Politeness Strategies": [[50, null]], "Politeness/Receptiveness Markers": [[49, null]], "Positivity Z-Score": [[52, null]], "Proportion of First Person Pronouns": [[53, null]], "Question (Naive)": [[54, null]], "Regenerating Vector Cache": [[1, 
"regenerating-vector-cache"]], "Related Features": [[29, "related-features"], [30, "related-features"], [31, "related-features"], [32, "related-features"], [33, "related-features"], [34, "related-features"], [35, "related-features"], [36, "related-features"], [37, "related-features"], [38, "related-features"], [40, "related-features"], [41, "related-features"], [42, "related-features"], [43, "related-features"], [44, "related-features"], [45, "related-features"], [46, "related-features"], [47, "related-features"], [48, "related-features"], [49, "related-features"], [50, "related-features"], [51, "related-features"], [52, "related-features"], [53, "related-features"], [54, "related-features"], [55, "related-features"], [56, "related-features"], [57, "related-features"], [58, "related-features"], [59, "related-features"], [60, "related-features"]], "Sentiment (RoBERTa)": [[51, null]], "Speaker Turn Counts": [[59, "id2"]], "Speaker- (User) Level Features": [[11, "speaker-user-level-features"]], "Table of Contents": [[61, "table-of-contents"]], "Team Burstiness": [[55, null]], "Textblob Polarity": [[56, null]], "Textblob Subjectivity": [[57, null]], "The Basics (Get Started Here!)": [[0, null]], "The FeatureBuilder": [[62, "the-featurebuilder"]], "The Team Communication Toolkit": [[61, null]], "Time Difference": [[58, null]], "Troubleshooting": [[1, "troubleshooting"], [61, "troubleshooting"]], "Turn Taking Index": [[59, null]], "Turns": [[1, "turns"]], "Using the Package": [[61, "using-the-package"]], "Utilities": [[69, null]], "Utterance- (Chat) Level Features": [[11, "utterance-chat-level-features"], [39, "utterance-chat-level-features"]], "Vector Directory": [[1, "vector-directory"]], "Walkthrough: Running the FeatureBuilder on Your Data": [[1, "walkthrough-running-the-featurebuilder-on-your-data"]], "Word Type-Token Ratio": [[60, null]], "Worked Example": [[1, null]], "assign_chunk_nums module": [[63, null]], "basic_features module": [[3, null]], "burstiness 
module": [[4, null]], "calculate_chat_level_features module": [[64, null]], "calculate_conversation_level_features module": [[65, null]], "calculate_user_level_features module": [[66, null]], "certainty module": [[5, null]], "check_embeddings module": [[67, null]], "discursive_diversity module": [[6, null]], "feature_builder module": [[2, null]], "fflow module": [[7, null]], "get_all_DD_features module": [[8, null]], "get_user_network module": [[9, null]], "gini_coefficient module": [[68, null]], "hedge module": [[10, null]], "info_exchange_zscore module": [[12, null]], "information_diversity module": [[13, null]], "lexical_features_v2 module": [[14, null]], "named_entity_recognition_features module": [[15, null]], "other_lexical_features module": [[16, null]], "politeness_features module": [[17, null]], "politeness_v2 module": [[18, null]], "politeness_v2_helper module": [[19, null]], "preload_word_lists module": [[70, null]], "preprocess module": [[71, null]], "question_num module": [[20, null]], "readability module": [[21, null]], "reddit_tags module": [[22, null]], "summarize_features module": [[72, null]], "temporal_features module": [[23, null]], "textblob_sentiment_analysis module": [[24, null]], "turn_taking_features module": [[25, null]], "variance_in_DD module": [[26, null]], "within_person_discursive_range module": [[27, null]], "word_mimicry module": [[28, null]], "z-scores:": [[41, "z-scores"]], "zscore_chats_and_conversation module": [[73, null]], "\u201cDriver\u201d Classes: Utterance-, Conversation-, and Speaker-Level Features": [[69, "driver-classes-utterance-conversation-and-speaker-level-features"]]}, "docnames": ["basics", "examples", "feature_builder", "features/basic_features", "features/burstiness", "features/certainty", "features/discursive_diversity", "features/fflow", "features/get_all_DD_features", "features/get_user_network", "features/hedge", "features/index", "features/info_exchange_zscore", "features/information_diversity", 
"features/lexical_features_v2", "features/named_entity_recognition_features", "features/other_lexical_features", "features/politeness_features", "features/politeness_v2", "features/politeness_v2_helper", "features/question_num", "features/readability", "features/reddit_tags", "features/temporal_features", "features/textblob_sentiment_analysis", "features/turn_taking_features", "features/variance_in_DD", "features/within_person_discursive_range", "features/word_mimicry", "features_conceptual/TEMPLATE", "features_conceptual/certainty", "features_conceptual/content_word_accommodation", "features_conceptual/conversational_repair", "features_conceptual/dale_chall_score", "features_conceptual/discursive_diversity", "features_conceptual/forward_flow", "features_conceptual/function_word_accommodation", "features_conceptual/gini_coefficient", "features_conceptual/hedge", "features_conceptual/index", "features_conceptual/information_diversity", "features_conceptual/information_exchange", "features_conceptual/liwc", "features_conceptual/message_length", "features_conceptual/message_quantity", "features_conceptual/mimicry_bert", "features_conceptual/moving_mimicry", "features_conceptual/named_entity_recognition", "features_conceptual/online_discussions_tags", "features_conceptual/politeness_receptiveness_markers", "features_conceptual/politeness_strategies", "features_conceptual/positivity_bert", "features_conceptual/positivity_z_score", "features_conceptual/proportion_of_first_person_pronouns", "features_conceptual/questions", "features_conceptual/team_burstiness", "features_conceptual/textblob_polarity", "features_conceptual/textblob_subjectivity", "features_conceptual/time_difference", "features_conceptual/turn_taking_index", "features_conceptual/word_ttr", "index", "intro", "utils/assign_chunk_nums", "utils/calculate_chat_level_features", "utils/calculate_conversation_level_features", "utils/calculate_user_level_features", "utils/check_embeddings", 
"utils/gini_coefficient", "utils/index", "utils/preload_word_lists", "utils/preprocess", "utils/summarize_features", "utils/zscore_chats_and_conversation"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["basics.rst", "examples.rst", "feature_builder.rst", "features/basic_features.rst", "features/burstiness.rst", "features/certainty.rst", "features/discursive_diversity.rst", "features/fflow.rst", "features/get_all_DD_features.rst", "features/get_user_network.rst", "features/hedge.rst", "features/index.rst", "features/info_exchange_zscore.rst", "features/information_diversity.rst", "features/lexical_features_v2.rst", "features/named_entity_recognition_features.rst", "features/other_lexical_features.rst", "features/politeness_features.rst", "features/politeness_v2.rst", "features/politeness_v2_helper.rst", "features/question_num.rst", "features/readability.rst", "features/reddit_tags.rst", "features/temporal_features.rst", "features/textblob_sentiment_analysis.rst", "features/turn_taking_features.rst", "features/variance_in_DD.rst", "features/within_person_discursive_range.rst", "features/word_mimicry.rst", "features_conceptual/TEMPLATE.rst", "features_conceptual/certainty.rst", "features_conceptual/content_word_accommodation.rst", "features_conceptual/conversational_repair.rst", "features_conceptual/dale_chall_score.rst", "features_conceptual/discursive_diversity.rst", "features_conceptual/forward_flow.rst", "features_conceptual/function_word_accommodation.rst", "features_conceptual/gini_coefficient.rst", "features_conceptual/hedge.rst", "features_conceptual/index.rst", "features_conceptual/information_diversity.rst", "features_conceptual/information_exchange.rst", "features_conceptual/liwc.rst", 
"features_conceptual/message_length.rst", "features_conceptual/message_quantity.rst", "features_conceptual/mimicry_bert.rst", "features_conceptual/moving_mimicry.rst", "features_conceptual/named_entity_recognition.rst", "features_conceptual/online_discussions_tags.rst", "features_conceptual/politeness_receptiveness_markers.rst", "features_conceptual/politeness_strategies.rst", "features_conceptual/positivity_bert.rst", "features_conceptual/positivity_z_score.rst", "features_conceptual/proportion_of_first_person_pronouns.rst", "features_conceptual/questions.rst", "features_conceptual/team_burstiness.rst", "features_conceptual/textblob_polarity.rst", "features_conceptual/textblob_subjectivity.rst", "features_conceptual/time_difference.rst", "features_conceptual/turn_taking_index.rst", "features_conceptual/word_ttr.rst", "index.rst", "intro.rst", "utils/assign_chunk_nums.rst", "utils/calculate_chat_level_features.rst", "utils/calculate_conversation_level_features.rst", "utils/calculate_user_level_features.rst", "utils/check_embeddings.rst", "utils/gini_coefficient.rst", "utils/index.rst", "utils/preload_word_lists.rst", "utils/preprocess.rst", "utils/summarize_features.rst", "utils/zscore_chats_and_conversation.rst"], "indexentries": {"adverb_limiter() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.adverb_limiter", false]], "assign_chunk_nums() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.assign_chunk_nums", false]], "bare_command() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.bare_command", false]], "built_spacy_ner() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.built_spacy_ner", false]], "burstiness() (in module features.burstiness)": [[4, "features.burstiness.burstiness", false]], "calculate_chat_level_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, 
"utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_chat_level_features", false]], "calculate_conversation_level_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_conversation_level_features", false]], "calculate_hedge_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_hedge_features", false]], "calculate_id_score() (in module features.information_diversity)": [[13, "features.information_diversity.calculate_ID_score", false]], "calculate_info_diversity() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_info_diversity", false]], "calculate_named_entities() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.calculate_named_entities", false]], "calculate_num_question_naive() (in module features.question_num)": [[20, "features.question_num.calculate_num_question_naive", false]], "calculate_politeness_sentiment() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_politeness_sentiment", false]], "calculate_politeness_v2() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_politeness_v2", false]], "calculate_team_burstiness() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_team_burstiness", false]], "calculate_textblob_sentiment() 
(utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_textblob_sentiment", false]], "calculate_user_level_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.calculate_user_level_features", false]], "calculate_vector_word_mimicry() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_vector_word_mimicry", false]], "calculate_word_mimicry() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_word_mimicry", false]], "chat_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.chat_level_features", false]], "chatlevelfeaturescalculator (class in utils.calculate_chat_level_features)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator", false]], "check_embeddings() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.check_embeddings", false]], "classify_ntri() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.classify_NTRI", false]], "classify_text_dalechall() (in module features.readability)": [[21, "features.readability.classify_text_dalechall", false]], "clean_text() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.clean_text", false]], "commit_data() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.commit_data", false]], "compress() (in module utils.preprocess)": [[71, "utils.preprocess.compress", false]], "compute_frequency() (in module features.word_mimicry)": [[28, "features.word_mimicry.compute_frequency", false]], "compute_frequency_per_conv() (in module features.word_mimicry)": 
[[28, "features.word_mimicry.compute_frequency_per_conv", false]], "computetf() (in module features.word_mimicry)": [[28, "features.word_mimicry.computeTF", false]], "concat_bert_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.concat_bert_features", false]], "conjection_seperator() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.conjection_seperator", false]], "content_mimicry_score() (in module features.word_mimicry)": [[28, "features.word_mimicry.Content_mimicry_score", false]], "content_mimicry_score_per_conv() (in module features.word_mimicry)": [[28, "features.word_mimicry.Content_mimicry_score_per_conv", false]], "conv_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.conv_level_features", false]], "conv_to_float_arr() (in module features.get_all_dd_features)": [[8, "features.get_all_DD_features.conv_to_float_arr", false]], "conversationlevelfeaturescalculator (class in utils.calculate_conversation_level_features)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator", false]], "count_all_caps() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_all_caps", false]], "count_bullet_points() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_bullet_points", false]], "count_characters() (in module features.basic_features)": [[3, "features.basic_features.count_characters", false]], "count_difficult_words() (in module features.readability)": [[21, "features.readability.count_difficult_words", false]], "count_ellipses() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_ellipses", false]], "count_emojis() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_emojis", false]], "count_emphasis() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_emphasis", false]], 
"count_line_breaks() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_line_breaks", false]], "count_links() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_links", false]], "count_matches() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.count_matches", false]], "count_messages() (in module features.basic_features)": [[3, "features.basic_features.count_messages", false]], "count_numbering() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_numbering", false]], "count_parentheses() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_parentheses", false]], "count_quotes() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_quotes", false]], "count_responding_to_someone() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_responding_to_someone", false]], "count_spacy_matches() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.count_spacy_matches", false]], "count_syllables() (in module features.readability)": [[21, "features.readability.count_syllables", false]], "count_turn_taking_index() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.count_turn_taking_index", false]], "count_turns() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.count_turns", false]], "count_user_references() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_user_references", false]], "count_words() (in module features.basic_features)": [[3, "features.basic_features.count_words", false]], "create_chunks() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.create_chunks", false]], "create_chunks_messages() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.create_chunks_messages", false]], "create_cumulative_rows() (in module utils.preprocess)": [[71, "utils.preprocess.create_cumulative_rows", false]], 
"dale_chall_helper() (in module features.readability)": [[21, "features.readability.dale_chall_helper", false]], "feat_counts() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.feat_counts", false]], "feature_builder": [[2, "module-feature_builder", false]], "featurebuilder (class in feature_builder)": [[2, "feature_builder.FeatureBuilder", false]], "features.basic_features": [[3, "module-features.basic_features", false]], "features.burstiness": [[4, "module-features.burstiness", false]], "features.certainty": [[5, "module-features.certainty", false]], "features.discursive_diversity": [[6, "module-features.discursive_diversity", false]], "features.fflow": [[7, "module-features.fflow", false]], "features.get_all_dd_features": [[8, "module-features.get_all_DD_features", false]], "features.get_user_network": [[9, "module-features.get_user_network", false]], "features.hedge": [[10, "module-features.hedge", false]], "features.info_exchange_zscore": [[12, "module-features.info_exchange_zscore", false]], "features.information_diversity": [[13, "module-features.information_diversity", false]], "features.lexical_features_v2": [[14, "module-features.lexical_features_v2", false]], "features.named_entity_recognition_features": [[15, "module-features.named_entity_recognition_features", false]], "features.other_lexical_features": [[16, "module-features.other_lexical_features", false]], "features.politeness_features": [[17, "module-features.politeness_features", false]], "features.politeness_v2": [[18, "module-features.politeness_v2", false]], "features.politeness_v2_helper": [[19, "module-features.politeness_v2_helper", false]], "features.question_num": [[20, "module-features.question_num", false]], "features.readability": [[21, "module-features.readability", false]], "features.reddit_tags": [[22, "module-features.reddit_tags", false]], "features.temporal_features": [[23, "module-features.temporal_features", false]], 
"features.textblob_sentiment_analysis": [[24, "module-features.textblob_sentiment_analysis", false]], "features.turn_taking_features": [[25, "module-features.turn_taking_features", false]], "features.variance_in_dd": [[26, "module-features.variance_in_DD", false]], "features.within_person_discursive_range": [[27, "module-features.within_person_discursive_range", false]], "features.word_mimicry": [[28, "module-features.word_mimicry", false]], "featurize() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.featurize", false]], "fix_abbreviations() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.fix_abbreviations", false]], "function_mimicry_score() (in module features.word_mimicry)": [[28, "features.word_mimicry.function_mimicry_score", false]], "generate_bert() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_bert", false]], "generate_certainty_pkl() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_certainty_pkl", false]], "generate_lexicon_pkl() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_lexicon_pkl", false]], "generate_vect() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_vect", false]], "get_centroids() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_centroids", false]], "get_certainty() (in module features.certainty)": [[5, "features.certainty.get_certainty", false]], "get_certainty_score() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_certainty_score", false]], "get_content_words_in_message() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_content_words_in_message", false]], "get_conversation_level_aggregates() 
(utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_conversation_level_aggregates", false]], "get_cosine_similarity() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_cosine_similarity", false]], "get_dale_chall_easy_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_dale_chall_easy_words", false]], "get_dale_chall_score_and_classfication() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_dale_chall_score_and_classfication", false]], "get_dd() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_DD", false]], "get_dd_features() (in module features.get_all_dd_features)": [[8, "features.get_all_DD_features.get_DD_features", false]], "get_dep_pairs() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.get_dep_pairs", false]], "get_dep_pairs_noneg() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.get_dep_pairs_noneg", false]], "get_discursive_diversity_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_discursive_diversity_features", false]], "get_first_pct_of_chat() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.get_first_pct_of_chat", false]], "get_first_person_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_first_person_words", false]], "get_forward_flow() (in module features.fflow)": [[7, "features.fflow.get_forward_flow", false]], "get_forward_flow() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, 
"utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_forward_flow", false]], "get_function_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_function_words", false]], "get_function_words_in_message() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_function_words_in_message", false]], "get_gini() (in module utils.gini_coefficient)": [[68, "utils.gini_coefficient.get_gini", false]], "get_gini_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_gini_features", false]], "get_info_diversity() (in module features.information_diversity)": [[13, "features.information_diversity.get_info_diversity", false]], "get_info_exchange_wordcount() (in module features.info_exchange_zscore)": [[12, "features.info_exchange_zscore.get_info_exchange_wordcount", false]], "get_liwc_count() (in module features.lexical_features_v2)": [[14, "features.lexical_features_v2.get_liwc_count", false]], "get_max() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_max", false]], "get_mean() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_mean", false]], "get_median() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_median", false]], "get_mimicry_bert() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_mimicry_bert", false]], "get_min() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_min", false]], "get_moving_mimicry() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_moving_mimicry", false]], "get_named_entity() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_named_entity", false]], "get_nan_vector() (in module features.within_person_discursive_range)": 
[[27, "features.within_person_discursive_range.get_nan_vector", false]], "get_nan_vector() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.get_nan_vector", false]], "get_polarity_score() (in module features.textblob_sentiment_analysis)": [[24, "features.textblob_sentiment_analysis.get_polarity_score", false]], "get_politeness_strategies() (in module features.politeness_features)": [[17, "features.politeness_features.get_politeness_strategies", false]], "get_politeness_v2() (in module features.politeness_v2)": [[18, "features.politeness_v2.get_politeness_v2", false]], "get_proportion_first_pronouns() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.get_proportion_first_pronouns", false]], "get_question_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_question_words", false]], "get_reddit_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_reddit_features", false]], "get_sentiment() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.get_sentiment", false]], "get_stdev() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_stdev", false]], "get_subjectivity_score() (in module features.textblob_sentiment_analysis)": [[24, "features.textblob_sentiment_analysis.get_subjectivity_score", false]], "get_sum() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_sum", false]], "get_team_burstiness() (in module features.burstiness)": [[4, "features.burstiness.get_team_burstiness", false]], "get_temporal_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_temporal_features", false]], "get_time_diff() (in module features.temporal_features)": [[23, "features.temporal_features.get_time_diff", false]], 
"get_time_diff_startend() (in module features.temporal_features)": [[23, "features.temporal_features.get_time_diff_startend", false]], "get_turn() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.get_turn", false]], "get_turn_id() (in module utils.preprocess)": [[71, "utils.preprocess.get_turn_id", false]], "get_turn_taking_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_turn_taking_features", false]], "get_unique_pairwise_combos() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_unique_pairwise_combos", false]], "get_user_level_aggregates() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_user_level_aggregates", false]], "get_user_level_summary_statistics_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_level_summary_statistics_features", false]], "get_user_level_summed_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_level_summed_features", false]], "get_user_max_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_max_dataframe", false]], "get_user_mean_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_mean_dataframe", false]], "get_user_median_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_median_dataframe", false]], "get_user_min_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_min_dataframe", false]], "get_user_network() (in 
module features.get_user_network)": [[9, "features.get_user_network.get_user_network", false]], "get_user_network() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_network", false]], "get_user_stdev_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_stdev_dataframe", false]], "get_user_sum_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_sum_dataframe", false]], "get_variance_in_dd() (in module features.variance_in_dd)": [[26, "features.variance_in_DD.get_variance_in_DD", false]], "get_within_person_disc_range() (in module features.within_person_discursive_range)": [[27, "features.within_person_discursive_range.get_within_person_disc_range", false]], "get_word_ttr() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.get_word_TTR", false]], "get_zscore_across_all_chats() (in module utils.zscore_chats_and_conversation)": [[73, "utils.zscore_chats_and_conversation.get_zscore_across_all_chats", false]], "get_zscore_across_all_conversations() (in module utils.zscore_chats_and_conversation)": [[73, "utils.zscore_chats_and_conversation.get_zscore_across_all_conversations", false]], "gini_coefficient() (in module utils.gini_coefficient)": [[68, "utils.gini_coefficient.gini_coefficient", false]], "info_diversity() (in module features.information_diversity)": [[13, "features.information_diversity.info_diversity", false]], "info_exchange() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.info_exchange", false]], "is_hedged_sentence_1() (in module features.hedge)": [[10, "features.hedge.is_hedged_sentence_1", false]], "is_valid_term() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.is_valid_term", false]], "lexical_features() 
(utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.lexical_features", false]], "liwc_features() (in module features.lexical_features_v2)": [[14, "features.lexical_features_v2.liwc_features", false]], "load_custem_liwc_dict() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.load_custem_liwc_dict", false]], "load_liwc_dict() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.load_liwc_dict", false]], "load_saved_data() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_saved_data", false]], "load_to_dict() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_to_dict", false]], "load_to_lists() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_to_lists", false]], "merge_conv_data_with_original() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.merge_conv_data_with_original", false]], "mimic_words() (in module features.word_mimicry)": [[28, "features.word_mimicry.mimic_words", false]], "module": [[2, "module-feature_builder", false], [3, "module-features.basic_features", false], [4, "module-features.burstiness", false], [5, "module-features.certainty", false], [6, "module-features.discursive_diversity", false], [7, "module-features.fflow", false], [8, "module-features.get_all_DD_features", false], [9, "module-features.get_user_network", false], [10, "module-features.hedge", false], [12, "module-features.info_exchange_zscore", false], [13, "module-features.information_diversity", false], [14, "module-features.lexical_features_v2", false], [15, "module-features.named_entity_recognition_features", false], [16, "module-features.other_lexical_features", false], [17, "module-features.politeness_features", false], [18, "module-features.politeness_v2", false], [19, "module-features.politeness_v2_helper", 
false], [20, "module-features.question_num", false], [21, "module-features.readability", false], [22, "module-features.reddit_tags", false], [23, "module-features.temporal_features", false], [24, "module-features.textblob_sentiment_analysis", false], [25, "module-features.turn_taking_features", false], [26, "module-features.variance_in_DD", false], [27, "module-features.within_person_discursive_range", false], [28, "module-features.word_mimicry", false], [63, "module-utils.assign_chunk_nums", false], [64, "module-utils.calculate_chat_level_features", false], [65, "module-utils.calculate_conversation_level_features", false], [66, "module-utils.calculate_user_level_features", false], [67, "module-utils.check_embeddings", false], [68, "module-utils.gini_coefficient", false], [70, "module-utils.preload_word_lists", false], [71, "module-utils.preprocess", false], [72, "module-utils.summarize_features", false], [73, "module-utils.zscore_chats_and_conversation", false]], "named_entities() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.named_entities", false]], "num_named_entity() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.num_named_entity", false]], "other_lexical_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.other_lexical_features", false]], "phrase_split() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.phrase_split", false]], "positivity_zscore() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.positivity_zscore", false]], "prep_simple() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.prep_simple", false]], "prep_whole() (in module features.politeness_v2_helper)": [[19, 
"features.politeness_v2_helper.prep_whole", false]], "preprocess_chat_data() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.preprocess_chat_data", false]], "preprocess_conversation_columns() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_conversation_columns", false]], "preprocess_naive_turns() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_naive_turns", false]], "preprocess_text() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_text", false]], "preprocess_text_lowercase_but_retain_punctuation() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_text_lowercase_but_retain_punctuation", false]], "preprocessing() (in module features.information_diversity)": [[13, "features.information_diversity.preprocessing", false]], "punctuation_seperator() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.punctuation_seperator", false]], "question() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.Question", false]], "read_in_lexicons() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.read_in_lexicons", false]], "reduce_chunks() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.reduce_chunks", false]], "remove_active_user() (in module features.get_user_network)": [[9, "features.get_user_network.remove_active_user", false]], "remove_unhashable_cols() (in module utils.preprocess)": [[71, "utils.preprocess.remove_unhashable_cols", false]], "save_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.save_features", false]], "sentence_pad() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentence_pad", false]], "sentence_split() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentence_split", false]], "sentenciser() (in module features.politeness_v2_helper)": [[19, 
"features.politeness_v2_helper.sentenciser", false]], "set_self_conv_data() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.set_self_conv_data", false]], "sort_words() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.sort_words", false]], "str_to_vec() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.str_to_vec", false]], "text_based_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.text_based_features", false]], "token_count() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.token_count", false]], "train_spacy_ner() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.train_spacy_ner", false]], "user_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.user_level_features", false]], "userlevelfeaturescalculator (class in utils.calculate_user_level_features)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator", false]], "utils.assign_chunk_nums": [[63, "module-utils.assign_chunk_nums", false]], "utils.calculate_chat_level_features": [[64, "module-utils.calculate_chat_level_features", false]], "utils.calculate_conversation_level_features": [[65, "module-utils.calculate_conversation_level_features", false]], "utils.calculate_user_level_features": [[66, "module-utils.calculate_user_level_features", false]], "utils.check_embeddings": [[67, "module-utils.check_embeddings", false]], "utils.gini_coefficient": [[68, "module-utils.gini_coefficient", false]], "utils.preload_word_lists": [[70, "module-utils.preload_word_lists", false]], "utils.preprocess": [[71, "module-utils.preprocess", false]], "utils.summarize_features": [[72, "module-utils.summarize_features", false]], "utils.zscore_chats_and_conversation": [[73, 
"module-utils.zscore_chats_and_conversation", false]], "verify_timestamp_format() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.verify_timestamp_format", false]], "word_start() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.word_start", false]]}, "objects": {"": [[2, 0, 0, "-", "feature_builder"]], "feature_builder": [[2, 1, 1, "", "FeatureBuilder"]], "feature_builder.FeatureBuilder": [[2, 2, 1, "", "chat_level_features"], [2, 2, 1, "", "conv_level_features"], [2, 2, 1, "", "featurize"], [2, 2, 1, "", "get_first_pct_of_chat"], [2, 2, 1, "", "load_custem_liwc_dict"], [2, 2, 1, "", "merge_conv_data_with_original"], [2, 2, 1, "", "preprocess_chat_data"], [2, 2, 1, "", "save_features"], [2, 2, 1, "", "set_self_conv_data"], [2, 2, 1, "", "user_level_features"], [2, 2, 1, "", "verify_timestamp_format"]], "features": [[3, 0, 0, "-", "basic_features"], [4, 0, 0, "-", "burstiness"], [5, 0, 0, "-", "certainty"], [6, 0, 0, "-", "discursive_diversity"], [7, 0, 0, "-", "fflow"], [8, 0, 0, "-", "get_all_DD_features"], [9, 0, 0, "-", "get_user_network"], [10, 0, 0, "-", "hedge"], [12, 0, 0, "-", "info_exchange_zscore"], [13, 0, 0, "-", "information_diversity"], [14, 0, 0, "-", "lexical_features_v2"], [15, 0, 0, "-", "named_entity_recognition_features"], [16, 0, 0, "-", "other_lexical_features"], [17, 0, 0, "-", "politeness_features"], [18, 0, 0, "-", "politeness_v2"], [19, 0, 0, "-", "politeness_v2_helper"], [20, 0, 0, "-", "question_num"], [21, 0, 0, "-", "readability"], [22, 0, 0, "-", "reddit_tags"], [23, 0, 0, "-", "temporal_features"], [24, 0, 0, "-", "textblob_sentiment_analysis"], [25, 0, 0, "-", "turn_taking_features"], [26, 0, 0, "-", "variance_in_DD"], [27, 0, 0, "-", "within_person_discursive_range"], [28, 0, 0, "-", "word_mimicry"]], "features.basic_features": [[3, 3, 1, "", "count_characters"], [3, 3, 1, "", "count_messages"], [3, 3, 1, "", "count_words"]], "features.burstiness": [[4, 3, 1, "", 
"burstiness"], [4, 3, 1, "", "get_team_burstiness"]], "features.certainty": [[5, 3, 1, "", "get_certainty"]], "features.discursive_diversity": [[6, 3, 1, "", "get_DD"], [6, 3, 1, "", "get_cosine_similarity"], [6, 3, 1, "", "get_unique_pairwise_combos"]], "features.fflow": [[7, 3, 1, "", "get_forward_flow"]], "features.get_all_DD_features": [[8, 3, 1, "", "conv_to_float_arr"], [8, 3, 1, "", "get_DD_features"]], "features.get_user_network": [[9, 3, 1, "", "get_user_network"], [9, 3, 1, "", "remove_active_user"]], "features.hedge": [[10, 3, 1, "", "is_hedged_sentence_1"]], "features.info_exchange_zscore": [[12, 3, 1, "", "get_info_exchange_wordcount"]], "features.information_diversity": [[13, 3, 1, "", "calculate_ID_score"], [13, 3, 1, "", "get_info_diversity"], [13, 3, 1, "", "info_diversity"], [13, 3, 1, "", "preprocessing"]], "features.lexical_features_v2": [[14, 3, 1, "", "get_liwc_count"], [14, 3, 1, "", "liwc_features"]], "features.named_entity_recognition_features": [[15, 3, 1, "", "built_spacy_ner"], [15, 3, 1, "", "calculate_named_entities"], [15, 3, 1, "", "named_entities"], [15, 3, 1, "", "num_named_entity"], [15, 3, 1, "", "train_spacy_ner"]], "features.other_lexical_features": [[16, 3, 1, "", "classify_NTRI"], [16, 3, 1, "", "get_proportion_first_pronouns"], [16, 3, 1, "", "get_word_TTR"]], "features.politeness_features": [[17, 3, 1, "", "get_politeness_strategies"]], "features.politeness_v2": [[18, 3, 1, "", "get_politeness_v2"]], "features.politeness_v2_helper": [[19, 3, 1, "", "Question"], [19, 3, 1, "", "adverb_limiter"], [19, 3, 1, "", "bare_command"], [19, 3, 1, "", "clean_text"], [19, 3, 1, "", "commit_data"], [19, 3, 1, "", "conjection_seperator"], [19, 3, 1, "", "count_matches"], [19, 3, 1, "", "count_spacy_matches"], [19, 3, 1, "", "feat_counts"], [19, 3, 1, "", "get_dep_pairs"], [19, 3, 1, "", "get_dep_pairs_noneg"], [19, 3, 1, "", "load_saved_data"], [19, 3, 1, "", "load_to_dict"], [19, 3, 1, "", "load_to_lists"], [19, 3, 1, "", 
"phrase_split"], [19, 3, 1, "", "prep_simple"], [19, 3, 1, "", "prep_whole"], [19, 3, 1, "", "punctuation_seperator"], [19, 3, 1, "", "sentence_pad"], [19, 3, 1, "", "sentence_split"], [19, 3, 1, "", "sentenciser"], [19, 3, 1, "", "token_count"], [19, 3, 1, "", "word_start"]], "features.question_num": [[20, 3, 1, "", "calculate_num_question_naive"]], "features.readability": [[21, 3, 1, "", "classify_text_dalechall"], [21, 3, 1, "", "count_difficult_words"], [21, 3, 1, "", "count_syllables"], [21, 3, 1, "", "dale_chall_helper"]], "features.reddit_tags": [[22, 3, 1, "", "count_all_caps"], [22, 3, 1, "", "count_bullet_points"], [22, 3, 1, "", "count_ellipses"], [22, 3, 1, "", "count_emojis"], [22, 3, 1, "", "count_emphasis"], [22, 3, 1, "", "count_line_breaks"], [22, 3, 1, "", "count_links"], [22, 3, 1, "", "count_numbering"], [22, 3, 1, "", "count_parentheses"], [22, 3, 1, "", "count_quotes"], [22, 3, 1, "", "count_responding_to_someone"], [22, 3, 1, "", "count_user_references"]], "features.temporal_features": [[23, 3, 1, "", "get_time_diff"], [23, 3, 1, "", "get_time_diff_startend"]], "features.textblob_sentiment_analysis": [[24, 3, 1, "", "get_polarity_score"], [24, 3, 1, "", "get_subjectivity_score"]], "features.turn_taking_features": [[25, 3, 1, "", "count_turn_taking_index"], [25, 3, 1, "", "count_turns"], [25, 3, 1, "", "get_turn"]], "features.variance_in_DD": [[26, 3, 1, "", "get_variance_in_DD"]], "features.within_person_discursive_range": [[27, 3, 1, "", "get_nan_vector"], [27, 3, 1, "", "get_within_person_disc_range"]], "features.word_mimicry": [[28, 3, 1, "", "Content_mimicry_score"], [28, 3, 1, "", "Content_mimicry_score_per_conv"], [28, 3, 1, "", "computeTF"], [28, 3, 1, "", "compute_frequency"], [28, 3, 1, "", "compute_frequency_per_conv"], [28, 3, 1, "", "function_mimicry_score"], [28, 3, 1, "", "get_content_words_in_message"], [28, 3, 1, "", "get_function_words_in_message"], [28, 3, 1, "", "get_mimicry_bert"], [28, 3, 1, "", "get_moving_mimicry"], 
[28, 3, 1, "", "mimic_words"]], "utils": [[63, 0, 0, "-", "assign_chunk_nums"], [64, 0, 0, "-", "calculate_chat_level_features"], [65, 0, 0, "-", "calculate_conversation_level_features"], [66, 0, 0, "-", "calculate_user_level_features"], [67, 0, 0, "-", "check_embeddings"], [68, 0, 0, "-", "gini_coefficient"], [70, 0, 0, "-", "preload_word_lists"], [71, 0, 0, "-", "preprocess"], [72, 0, 0, "-", "summarize_features"], [73, 0, 0, "-", "zscore_chats_and_conversation"]], "utils.assign_chunk_nums": [[63, 3, 1, "", "assign_chunk_nums"], [63, 3, 1, "", "create_chunks"], [63, 3, 1, "", "create_chunks_messages"], [63, 3, 1, "", "reduce_chunks"]], "utils.calculate_chat_level_features": [[64, 1, 1, "", "ChatLevelFeaturesCalculator"]], "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator": [[64, 2, 1, "", "calculate_chat_level_features"], [64, 2, 1, "", "calculate_hedge_features"], [64, 2, 1, "", "calculate_politeness_sentiment"], [64, 2, 1, "", "calculate_politeness_v2"], [64, 2, 1, "", "calculate_textblob_sentiment"], [64, 2, 1, "", "calculate_vector_word_mimicry"], [64, 2, 1, "", "calculate_word_mimicry"], [64, 2, 1, "", "concat_bert_features"], [64, 2, 1, "", "get_certainty_score"], [64, 2, 1, "", "get_dale_chall_score_and_classfication"], [64, 2, 1, "", "get_forward_flow"], [64, 2, 1, "", "get_named_entity"], [64, 2, 1, "", "get_reddit_features"], [64, 2, 1, "", "get_temporal_features"], [64, 2, 1, "", "info_exchange"], [64, 2, 1, "", "lexical_features"], [64, 2, 1, "", "other_lexical_features"], [64, 2, 1, "", "positivity_zscore"], [64, 2, 1, "", "text_based_features"]], "utils.calculate_conversation_level_features": [[65, 1, 1, "", "ConversationLevelFeaturesCalculator"]], "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator": [[65, 2, 1, "", "calculate_conversation_level_features"], [65, 2, 1, "", "calculate_info_diversity"], [65, 2, 1, "", "calculate_team_burstiness"], [65, 2, 1, "", "get_conversation_level_aggregates"], [65, 2, 
1, "", "get_discursive_diversity_features"], [65, 2, 1, "", "get_gini_features"], [65, 2, 1, "", "get_turn_taking_features"], [65, 2, 1, "", "get_user_level_aggregates"]], "utils.calculate_user_level_features": [[66, 1, 1, "", "UserLevelFeaturesCalculator"]], "utils.calculate_user_level_features.UserLevelFeaturesCalculator": [[66, 2, 1, "", "calculate_user_level_features"], [66, 2, 1, "", "get_centroids"], [66, 2, 1, "", "get_user_level_summary_statistics_features"], [66, 2, 1, "", "get_user_level_summed_features"], [66, 2, 1, "", "get_user_network"]], "utils.check_embeddings": [[67, 3, 1, "", "check_embeddings"], [67, 3, 1, "", "fix_abbreviations"], [67, 3, 1, "", "generate_bert"], [67, 3, 1, "", "generate_certainty_pkl"], [67, 3, 1, "", "generate_lexicon_pkl"], [67, 3, 1, "", "generate_vect"], [67, 3, 1, "", "get_nan_vector"], [67, 3, 1, "", "get_sentiment"], [67, 3, 1, "", "is_valid_term"], [67, 3, 1, "", "load_liwc_dict"], [67, 3, 1, "", "read_in_lexicons"], [67, 3, 1, "", "sort_words"], [67, 3, 1, "", "str_to_vec"]], "utils.gini_coefficient": [[68, 3, 1, "", "get_gini"], [68, 3, 1, "", "gini_coefficient"]], "utils.preload_word_lists": [[70, 3, 1, "", "get_dale_chall_easy_words"], [70, 3, 1, "", "get_first_person_words"], [70, 3, 1, "", "get_function_words"], [70, 3, 1, "", "get_question_words"]], "utils.preprocess": [[71, 3, 1, "", "compress"], [71, 3, 1, "", "create_cumulative_rows"], [71, 3, 1, "", "get_turn_id"], [71, 3, 1, "", "preprocess_conversation_columns"], [71, 3, 1, "", "preprocess_naive_turns"], [71, 3, 1, "", "preprocess_text"], [71, 3, 1, "", "preprocess_text_lowercase_but_retain_punctuation"], [71, 3, 1, "", "remove_unhashable_cols"]], "utils.summarize_features": [[72, 3, 1, "", "get_max"], [72, 3, 1, "", "get_mean"], [72, 3, 1, "", "get_median"], [72, 3, 1, "", "get_min"], [72, 3, 1, "", "get_stdev"], [72, 3, 1, "", "get_sum"], [72, 3, 1, "", "get_user_max_dataframe"], [72, 3, 1, "", "get_user_mean_dataframe"], [72, 3, 1, "", 
"get_user_median_dataframe"], [72, 3, 1, "", "get_user_min_dataframe"], [72, 3, 1, "", "get_user_stdev_dataframe"], [72, 3, 1, "", "get_user_sum_dataframe"]], "utils.zscore_chats_and_conversation": [[73, 3, 1, "", "get_zscore_across_all_chats"], [73, 3, 1, "", "get_zscore_across_all_conversations"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "terms": {"": [0, 1, 2, 4, 5, 9, 11, 13, 14, 25, 28, 29, 31, 32, 34, 35, 36, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 55, 59, 61, 62, 64, 65, 66], "0": [0, 1, 2, 5, 10, 13, 16, 21, 24, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 43, 45, 46, 47, 50, 51, 53, 55, 59, 61], "00222437221134802": [5, 64], "01": 51, "02": 51, "04": 40, "0496": [21, 33], "05": [13, 40, 50, 51], "06": 51, "08": [42, 50], "09": [45, 46, 50], "1": [0, 1, 2, 3, 10, 13, 22, 24, 32, 34, 35, 37, 38, 40, 41, 43, 44, 45, 46, 47, 48, 51, 53, 55, 56, 57, 59, 61, 62, 67], "10": [1, 5, 6, 21, 24, 33, 59, 61, 64], "100": [1, 21, 33, 37, 42, 47, 62], "10th": 33, "1145": [21, 24], "1177": [5, 64], "11th": 33, "12": [35, 45, 46, 50], "1287": 6, "12th": 33, "13": 50, "14": 50, "15": [1, 37, 50], "1579": [21, 33], "17": 50, "1948": 33, "195": 36, "1977": 62, "1d": 67, "1lpngokujsx": 5, "1st": 50, "1st_person": 50, "1st_person_pl": 50, "1st_person_start": 50, "2": [0, 1, 2, 34, 35, 41, 47, 59, 61, 62, 67], "20": [37, 59], "2004": 42, "2007": [0, 5, 42, 67], "2009": 60, "2012": 55, "2013": [12, 16, 31, 32, 36, 37, 38, 41, 43, 49, 50, 52, 54, 70], "2015": [42, 53, 58, 60, 67], "2016": 4, "2017": 13, "2018": [40, 44, 55], "2019": [35, 52], "2020": [18, 21, 24, 33, 49, 50, 56, 57], "2021": [1, 6, 43, 44], "2022": [13, 34], "2023": [1, 5, 30, 59, 61, 64], "2024": [40, 42], "21": 59, "22": [41, 50], "2384068": 4, "24": [1, 61], "25": 47, "27": [42, 50], 
"28": 50, "29": 50, "2nd": 50, "2nd_person": 50, "2nd_person_start": 50, "3": [0, 1, 2, 21, 34, 41, 42, 51, 59, 61, 67, 71], "30": 50, "3000": 33, "32": [34, 50], "3432929": [21, 24], "35": 51, "36": 50, "38": 50, "39": 49, "39512260": 68, "3n": 59, "4": [0, 1, 5, 13, 21, 30, 33, 41, 42, 56, 61, 62, 71], "4274": 6, "43": 50, "45": 50, "47": 50, "49": 50, "4pit4bqz6": 5, "4th": [21, 33], "5": [1, 5, 21, 30, 33, 37, 41, 59], "50": [1, 47], "52": 50, "53": 50, "57": 50, "58": 50, "5th": 33, "6": [1, 33, 43], "60": 51, "63": 50, "6365": 21, "64": 67, "68": 47, "6th": 33, "7": [30, 33, 48], "70": 50, "78": [35, 50], "7th": 33, "8": [1, 30, 33, 42, 67], "80": [21, 70], "82": 41, "85": 34, "86": 35, "87": 50, "89": [45, 46], "8th": 33, "9": [2, 5, 21, 30, 33, 40, 47, 50, 67], "9123": 47, "92": 51, "93chall_readability_formula": [21, 70], "94": 15, "95": 47, "95450": 42, "97": 51, "9855072464": 47, "9992": 47, "99954": 47, "9th": 33, "A": [1, 2, 4, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23, 25, 28, 33, 34, 35, 37, 38, 40, 41, 44, 45, 46, 47, 49, 50, 51, 52, 57, 59, 60, 61, 62, 64, 66, 67, 68, 70, 71, 72, 73], "And": [1, 62], "As": [1, 31, 35, 36, 40, 42, 45, 61], "Be": 1, "But": [1, 50, 62], "By": [0, 1, 11, 42, 50], "For": [0, 1, 31, 34, 37, 41, 42, 43, 47, 49, 54, 56, 59, 62, 65], "If": [0, 1, 2, 5, 21, 29, 30, 35, 42, 45, 47, 50, 55, 61, 62, 63, 65, 66, 67, 71], "In": [1, 21, 30, 31, 34, 35, 36, 37, 39, 41, 42, 45, 46, 47, 50, 55, 59, 61, 62], "It": [1, 2, 31, 32, 33, 36, 37, 41, 44, 45, 46, 50, 64, 65, 66, 67, 71], "NO": 37, "NOT": [1, 61], "No": [19, 50, 53], "Not": 41, "One": [1, 37, 61], "That": [29, 55], "The": [1, 2, 3, 4, 5, 7, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 59, 60, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "Then": [1, 55, 61], "There": [1, 11, 32, 61, 66], "These": [1, 11, 17, 32, 34, 42, 48, 52, 61, 62, 69], "To": [0, 1, 29, 31, 34, 
37, 40, 42, 55, 56, 57, 61, 62], "WITH": 21, "Will": 50, "_deviat": 55, "_lexical_wordcount_custom": 42, "_preprocessed_": 0, "abbrevi": 67, "abil": [13, 29], "abl": [31, 36, 61], "abort": 1, "about": [1, 12, 29, 31, 36, 41, 47, 61, 62], "abov": [1, 21, 34, 61], "abstract_id": 4, "academ": 42, "accept": [0, 1, 58, 61], "access": [0, 1, 11, 15, 61], "accommod": [28, 32, 39, 45, 46, 64], "accord": [21, 37, 59, 64, 70], "accordingli": 63, "account": [1, 29, 32, 42], "accus": 50, "achiev": [50, 62], "acknowledg": 49, "acm": [21, 24], "acommod": 36, "across": [1, 13, 28, 31, 34, 40, 41, 50, 62, 64, 73], "action": 59, "activ": [1, 9, 44, 55, 71], "actual": [41, 56], "ad": [61, 62, 71], "adapt": 59, "add": [0, 1, 2, 21, 51, 61], "addit": [2, 32, 34, 42, 63, 69], "addition": [0, 30, 31, 32, 54], "address": 1, "adjac": 71, "adjust": [0, 21, 37, 63], "advanc": [31, 36], "advantag": 4, "adverb": [19, 31, 36], "adverb_limit": [19, 49], "affect": [0, 1, 29, 35, 44], "affirm": 49, "after": [0, 1, 31, 34, 36, 42, 43, 61, 62, 64, 67], "again": [32, 34, 67], "against": [28, 31, 36, 52, 67], "agarw": 62, "aggreg": [0, 2, 3, 37, 44, 61, 62, 65, 66, 72], "agre": 47, "agreement": 49, "ah": [31, 36], "ai": 62, "aim": [39, 62], "airtim": [37, 62], "al": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "algorithm": [56, 57], "align": [35, 51], "all": [0, 1, 2, 6, 11, 12, 13, 15, 19, 22, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 46, 48, 49, 51, 52, 55, 58, 61, 62, 64, 65, 66, 71, 73], "allow": [0, 1, 67], "almaatouq": 59, "alon": 67, "along": 1, "alongsid": 1, "alphabet": 49, "alphanumer": [42, 67, 71], "alreadi": [0, 1, 2, 4, 10, 12, 16, 67], "also": [0, 1, 2, 28, 30, 31, 32, 34, 36, 37, 38, 42, 47, 51, 54, 60, 61, 62, 64, 65, 67, 69], "alsobai": 59, "altern": 59, "although": [1, 23, 31, 36], "alwai": [1, 55], "am": [31, 36, 42, 54, 62], "amaz": [48, 56], "ambient": 32, "american": 33, "ami": [47, 59, 62], "amic": 62, "among": 
[36, 37, 52, 55, 62], "amongst": [6, 35, 48], "an": [0, 1, 2, 5, 8, 11, 12, 13, 21, 29, 30, 31, 32, 33, 34, 36, 38, 40, 41, 42, 45, 47, 48, 49, 50, 51, 52, 54, 59, 60, 61, 62, 63, 65, 66, 67, 68, 71], "analys": [1, 62], "analysi": [0, 1, 11, 52, 62, 67, 71], "analyt": 62, "analyz": [0, 2, 13, 14, 16, 17, 19, 20, 21, 22, 24, 28, 43, 52, 62, 67, 71], "analyze_first_pct": [0, 1, 2], "angri": 47, "ani": [0, 1, 29, 31, 33, 38, 54, 62, 71], "annot": [17, 50], "anoth": [30, 34, 36, 48], "answer": 29, "anybodi": [31, 36], "anyth": [1, 31, 36, 56], "anywher": [31, 36], "apartment": 42, "api": [2, 47], "api_refer": 24, "apolog": [17, 50], "apologi": 49, "appear": [0, 15, 28, 31, 37, 38, 42, 64, 67], "append": [1, 17, 42, 64, 65, 66, 67], "appli": [4, 13, 18, 62, 64, 69], "applic": [29, 71], "appreci": 50, "approach": [32, 38, 42, 45, 46, 49, 53, 64], "appropri": [1, 31, 69], "ar": [0, 1, 2, 3, 5, 9, 10, 11, 15, 17, 19, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 71], "arcross": 34, "area": 62, "aren": [31, 36], "around": 2, "arous": 48, "arrai": [6, 8, 67, 68], "articl": [37, 50], "ask": [20, 47, 54], "ask_ag": 49, "aspect": [50, 62], "assign": [1, 31, 36, 38, 45, 46, 52, 59, 61, 63, 71], "assign_chunk_num": 69, "associ": [1, 4, 15, 21, 29, 30, 31, 32, 36, 40, 45, 46, 47, 48, 61], "assum": [0, 1, 2, 10, 12, 16, 23, 31, 41, 60, 61, 64, 67, 71], "assumpt": [1, 41, 61], "asterisk": 22, "attribut": [0, 1, 11, 34, 51, 52, 56, 62], "author": [5, 31, 36, 59], "auto": 2, "automat": [0, 1, 61, 69], "auxiliari": [31, 36], "avail": [0, 1, 61, 62, 63, 64, 67], "averag": [1, 11, 13, 28, 30, 33, 34, 35, 40, 41, 46, 52, 64, 65, 72], "avil": 62, "avoid": 30, "awar": 29, "awesom": 62, "b": [4, 34, 35, 42, 45, 46, 55, 62], "bachelor": 42, "back": 62, "bag": [32, 38, 42, 45, 46, 49, 53, 56, 57], "bare_command": [19, 49], "base": [0, 1, 2, 15, 18, 19, 31, 32, 34, 35, 36, 37, 40, 42, 51, 52, 
53, 54, 55, 56, 57, 61, 62, 63, 64, 65, 66, 71], "basic": [10, 11, 12, 16, 61, 62], "basic_featur": 11, "batch": 67, "batch_num": 1, "batch_siz": 67, "bay": [56, 57], "bbevi": 18, "becaus": [1, 2, 12, 21, 31, 36, 40, 42, 56, 61], "becom": [44, 61, 62], "been": [1, 12, 16, 31, 36, 61], "befor": [0, 1, 2, 17, 31, 36, 45, 48], "beforehand": 64, "begin": [34, 42, 54, 58, 61, 62, 63], "behavior": [0, 1, 11, 62, 63], "being": [4, 13, 14, 16, 17, 20, 21, 24, 31, 32, 36, 43, 47, 51, 55, 56, 60], "belong": 1, "below": [1, 11, 21, 33, 36, 45, 48, 51, 61, 62, 67, 69], "ber": 54, "bert": [0, 1, 2, 31, 35, 36, 39, 46, 61, 64, 67], "bert_path": 67, "bert_sentiment_data": [1, 61, 64], "best": 29, "better": [31, 61], "between": [4, 6, 13, 21, 23, 24, 28, 30, 31, 34, 35, 36, 37, 40, 45, 46, 55, 58, 59, 62, 64, 65, 67], "betwen": 34, "beyond": 2, "big": 59, "binari": [10, 32, 38], "blame": 47, "blob": [1, 24, 61, 67], "block": [22, 32, 48, 59], "blog": 15, "bodi": 67, "bold": [22, 64], "bool": [2, 63, 65, 66, 67, 71], "boolean": 1, "bootstrap": 62, "both": [0, 1, 2, 42, 52, 54, 55, 59, 62], "bother": 50, "bottom": 59, "bought": 41, "bound": [29, 35, 36, 37, 52, 55], "boundari": [34, 35, 42], "boyd": [0, 42], "break": [22, 48, 64], "brief": 44, "bring": 0, "broader": 52, "broken": 59, "btw": 50, "bug": [1, 61], "build": [1, 7, 34, 45, 46, 62], "built": [1, 11, 42, 67], "built_spacy_n": 15, "bullet": [22, 48, 64], "bunch": 59, "burst": 58, "bursti": [1, 11, 39, 58, 61, 65], "by_the_wai": 49, "c": [12, 34, 35, 45, 46, 62], "cach": [0, 2, 51, 61], "calcul": [1, 2, 5, 11, 12, 16, 18, 21, 28, 33, 41, 48, 49, 50, 56, 57, 58, 60, 62, 63, 64, 65, 66, 67, 68, 72, 73], "calculate_chat_level_featur": [1, 61, 69], "calculate_conversation_level_featur": 69, "calculate_hedge_featur": 64, "calculate_id_scor": 13, "calculate_info_divers": 65, "calculate_named_ent": 15, "calculate_num_question_na": 20, "calculate_politeness_senti": 64, "calculate_politeness_v2": 64, "calculate_team_bursti": 65, 
"calculate_textblob_senti": 64, "calculate_user_level_featur": 69, "calculate_vector_word_mimicri": 64, "calculate_word_mimicri": 64, "call": [1, 2, 8, 11, 13, 61, 62, 64, 69], "can": [0, 1, 2, 11, 31, 32, 33, 34, 36, 37, 42, 43, 44, 47, 48, 49, 50, 52, 54, 60, 61, 62, 67, 69], "can_you": 49, "cannot": [1, 2, 31, 36, 45, 46, 49, 62], "cao": [21, 24, 33, 43, 44, 56, 57, 62], "cap": [22, 48, 64], "capit": [0, 2, 48], "captur": [29, 30, 32, 34, 35, 38, 41, 42, 55], "caract": 40, "cardiffnlp": [1, 61], "care": 1, "carefulli": 60, "carri": 31, "casa_token": 5, "case": [1, 13, 16, 28, 29, 30, 31, 36, 37, 41, 45, 46, 51, 55, 56, 59, 61], "casual": 43, "categori": [21, 32, 42, 45, 46, 49, 52, 67], "caus": [31, 32, 36, 59], "caveat": 42, "center": 62, "central": 34, "centroid": [34, 66], "certain": [5, 19, 30, 42, 45, 46, 49, 71], "certainli": 42, "certainti": [11, 38, 39, 42, 64, 67], "cfm": 4, "chall": [1, 21, 39, 64, 70], "chang": [0, 1, 34, 50, 61, 71], "charact": [1, 2, 3, 15, 19, 37, 42, 49, 62, 64, 65, 66, 67, 71], "characterist": [1, 62], "chat": [0, 1, 2, 4, 5, 6, 7, 8, 12, 13, 14, 16, 23, 25, 28, 29, 32, 35, 36, 41, 44, 45, 46, 49, 59, 61, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "chat_data": [2, 6, 7, 8, 26, 27, 28, 63, 64, 65, 66, 67, 71], "chat_df": 14, "chat_featur": [1, 42, 61, 65, 66], "chat_level_data": 72, "chat_level_featur": 2, "chatlevelfeaturescalcul": [1, 2, 17, 21, 61, 64, 69], "chats_data": 73, "check": [19, 44, 64, 67], "check_embed": [1, 61, 69], "chen": 62, "choic": 1, "choos": [1, 60], "chose": 1, "chronolog": 1, "chunk": [34, 59, 63], "chunk_num": 63, "circlelyt": 13, "citat": [21, 24], "cite": 50, "clarif": [16, 32, 64], "class": [1, 2, 31, 61, 62, 64, 65, 66], "classif": [21, 64], "classifi": [16, 21, 50, 56, 57], "classify_ntri": 16, "classify_text_dalechal": 21, "clean": [2, 17, 19, 67, 71], "clean_text": 19, "clear": 1, "close": [31, 42, 48, 62], "closer": [45, 46, 59], "clue": 62, "cmu": 12, "code": [6, 18, 29, 32, 51, 55, 61, 62, 68], 
"coeffici": [1, 4, 39, 62, 65, 68], "cognit": 62, "col": 2, "colab": [0, 1], "collabor": [59, 62], "collaps": 2, "collect": [1, 2, 34, 49, 50, 52, 61, 62], "colleg": 33, "column": [0, 2, 4, 6, 7, 8, 9, 12, 13, 14, 16, 18, 23, 25, 28, 42, 51, 56, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "column_count_frequ": 28, "column_count_mim": 28, "column_mimc": 28, "column_nam": 71, "column_to_summar": 72, "com": [1, 2, 4, 5, 13, 15, 18, 64, 67, 68, 71], "comb": 62, "combin": [0, 1, 6, 28, 64, 71], "come": [1, 12, 13, 21, 32, 33, 42, 58, 61], "comm": [1, 61], "command": [1, 61], "comment": 48, "commit": 23, "commit_data": 19, "common": [0, 32, 62, 64], "commonli": 37, "commun": [0, 1, 11, 42, 44, 48, 55, 60, 62, 64], "companion": 1, "compar": [31, 35, 44, 45, 52, 64, 71, 73], "compat": [0, 1, 61], "complement": [31, 36], "complet": [1, 2, 31, 55], "complex": [0, 35, 43, 50, 62], "compon": 50, "compos": 2, "comprehens": [33, 48], "compress": 71, "comput": [0, 1, 2, 4, 5, 6, 10, 11, 12, 13, 14, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 45, 46, 49, 50, 52, 55, 62, 64, 65, 66, 67, 69, 73], "compute_frequ": 28, "compute_frequency_per_conv": 28, "compute_vectors_from_preprocess": [0, 2], "computetf": 28, "conain": 61, "concat_bert_featur": [1, 61, 64], "concaten": [19, 49, 64, 71], "concentr": 55, "concept": [29, 39, 42, 62], "conceptu": [61, 62], "concis": 43, "concret": 29, "conduct": 1, "confid": [2, 5, 15, 30, 47, 64], "conflict": 62, "confound": 44, "congruent": 34, "conjection_seper": 19, "conjunct": [19, 31, 36, 49], "conjunction_start": 49, "connect": 39, "conscious": 35, "consecut": 22, "consequ": [0, 1], "consid": [1, 2, 33, 37], "consider": [61, 62], "consist": [31, 36, 40, 41], "constitut": 41, "constrain": [34, 35], "construct": [1, 11, 55, 62], "constructor": 47, "consult": 5, "contact": 0, "contain": [1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 25, 28, 29, 30, 35, 38, 42, 47, 49, 50, 55, 61, 62, 63, 64, 67, 71, 72, 73], "content": [0, 1, 12, 13, 28, 
34, 36, 39, 41, 42, 45, 46, 62, 64, 67], "content_mimicry_scor": 28, "content_mimicry_score_per_conv": 28, "content_word_accommod": 31, "content_word_accommodation_per_conv": 31, "content_word_mimicri": 28, "contentcod": 67, "contentcodingdictionari": 67, "context": [2, 32, 42, 48, 62, 71], "continu": [56, 57], "contract": 49, "contrast": 39, "contribut": [13, 34, 37, 62], "control": 1, "conv": [1, 61], "conv_data": [2, 65], "conv_features_al": [1, 61], "conv_features_bas": [1, 11, 61], "conv_level_featur": 2, "conv_to_float_arr": 8, "convei": [6, 34, 52], "conveni": [1, 61], "convers": [0, 2, 3, 4, 6, 7, 8, 9, 12, 13, 23, 25, 28, 29, 31, 34, 35, 36, 37, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 52, 55, 58, 59, 61, 63, 64, 65, 66, 68, 71, 72, 73], "conversation_id": [2, 28, 61, 71], "conversation_id_col": [0, 1, 2, 4, 6, 7, 8, 9, 13, 23, 25, 26, 27, 61, 63, 64, 65, 66, 68, 72, 73], "conversation_num": [0, 1, 2, 6, 7, 64, 66, 73], "conversationlevelfeaturescalcul": [2, 65, 69], "convert": [8, 41, 49, 67, 71], "convict": 5, "convo_aggreg": [0, 1, 2, 65], "convo_column": [0, 1, 2, 65], "convo_method": [0, 1, 2, 65], "convokit": [17, 50, 62, 64], "coordin": 55, "copi": [0, 1, 42], "copular": [31, 36], "core": [34, 69], "cornel": 17, "corpu": [0, 1, 50], "corrado": 37, "correl": [41, 55], "correspond": [30, 34, 35, 40, 49, 55, 66], "cosin": [6, 7, 13, 28, 31, 34, 35, 36, 40, 45, 46, 65], "could": [1, 31, 33, 36, 50, 54], "could_you": 49, "couldn": [31, 36], "count": [0, 1, 3, 12, 14, 15, 16, 19, 21, 25, 28, 30, 31, 32, 36, 39, 41, 43, 44, 49, 52, 53, 54, 56, 58, 64, 65, 66], "count_all_cap": 22, "count_bullet_point": 22, "count_charact": 3, "count_difficult_word": 21, "count_ellips": 22, "count_emoji": 22, "count_emphasi": 22, "count_line_break": 22, "count_link": 22, "count_match": [19, 49], "count_messag": 3, "count_numb": 22, "count_parenthes": 22, "count_quot": 22, "count_responding_to_someon": 22, "count_spacy_match": 19, "count_syl": 21, "count_turn": 25, 
"count_turn_taking_index": 25, "count_user_refer": 22, "count_word": 3, "countabl": [1, 65], "countd": 36, "counterfactu": 50, "cours": [16, 31, 34, 36, 63], "cover": 28, "cpu": [0, 1, 67], "creat": [0, 1, 2, 13, 19, 31, 40, 42, 61, 62, 64, 65, 66, 71], "create_chunk": 63, "create_chunks_messag": 63, "create_cumulative_row": 71, "credit": 33, "criteria": 67, "crowd": 13, "csv": [1, 2, 61, 62, 67], "cuda": 67, "cumul": [2, 71], "cumulative_group": [0, 1, 2, 71], "current": [1, 11, 23, 31, 34, 35, 36, 40, 45, 46, 58, 61, 64, 71], "curt": 43, "custom": [0, 2, 11, 14, 62], "custom_featur": [0, 1, 2, 61], "custom_liwc_dictionari": [14, 64], "custom_liwc_dictionary_path": [0, 2, 42], "customiz": 62, "cut": 1, "cutoff": [2, 15, 47, 64], "d": [0, 1, 2, 31, 34, 36, 61], "dale": [1, 21, 39, 64, 70], "dale_chall_help": 21, "danescu": [49, 50], "dash": 22, "data": [0, 2, 6, 7, 8, 9, 13, 19, 20, 32, 37, 40, 41, 47, 51, 55, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "datafram": [0, 1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 23, 25, 28, 37, 42, 47, 49, 59, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "dataknowsal": 15, "dataset": [1, 2, 9, 12, 13, 28, 31, 41, 47, 52, 61, 64, 65, 66, 73], "date": [1, 42, 61], "datetim": [2, 58], "dcosta": 62, "deal": [50, 59], "death": 1, "debat": 59, "decid": [1, 62], "decis": [1, 13, 62], "declar": [1, 62, 69], "deepli": 62, "default": [0, 1, 2, 5, 11, 13, 16, 23, 30, 34, 35, 42, 47, 62, 63, 64, 66, 67, 71, 73], "defer": [17, 50], "defin": [0, 11, 21, 31, 34, 36, 40, 59, 62, 64, 65, 66, 70], "definit": [1, 3, 44], "degre": [6, 30, 36, 45, 46, 55], "delet": 29, "deliber": 1, "demo": 61, "democrat": 1, "demonstr": 1, "demystifi": 62, "denomin": 59, "denot": 42, "densiti": 60, "dep_": 49, "dep_pair": 19, "depend": [0, 1, 10, 19, 32, 49, 52, 61, 63], "deriv": [2, 11, 65, 66], "descend": 67, "describ": [1, 11, 62], "descript": [1, 61], "design": [0, 1, 2, 13, 34, 62], "desir": [2, 63, 72], "detail": [0, 11, 33, 41, 43, 61, 62], "detect": 
[1, 32, 37, 38, 47, 48, 49, 54], "determin": [13, 18, 31, 35, 36, 40, 45, 46, 71], "dev": 24, "develop": [5, 37, 40, 62], "deviat": [4, 5, 29, 40, 41, 55, 58, 65, 72, 73], "devic": 67, "df": [4, 8, 9, 12, 13, 16, 18, 23, 28, 63, 71], "dic": [2, 14, 42, 67], "diccategori": 67, "dict": [2, 14, 17, 19, 28, 64, 67, 71], "dicterm": 67, "dictext": 67, "dictionari": [0, 1, 2, 14, 15, 17, 19, 28, 30, 42, 49, 61, 64, 67, 71], "did": [1, 31, 36, 37, 47, 50, 54, 62], "didn": [31, 36], "differ": [0, 1, 2, 4, 11, 12, 23, 28, 29, 31, 34, 36, 37, 39, 40, 44, 45, 46, 47, 49, 55, 62, 63, 64, 65, 66, 67, 71], "differenti": [49, 59], "difficult": [21, 33], "difficult_word": 21, "difficulti": 33, "dimens": [40, 62], "dimension": [34, 35], "dinner": 41, "direct": [34, 43, 45, 47, 50, 69], "direct_quest": [32, 50, 54], "direct_start": 50, "directli": [1, 62, 69], "directori": [0, 2, 19, 61, 65, 67], "disabl": 1, "disagr": 49, "disagre": 51, "discours": [31, 36], "discret": [31, 36, 45, 46], "discurs": [0, 1, 6, 8, 39, 40, 61, 65, 66], "discursive_divers": 11, "discus": 8, "discuss": [0, 1, 31, 34, 39, 40, 42, 43, 61, 62, 71], "dispers": 68, "displai": [1, 34, 42, 46, 61], "dispos": 1, "distanc": [34, 35, 40], "distinct": [31, 36, 59], "distinguish": 59, "distribut": 31, "div": 16, "diverg": [6, 34, 35], "divers": [0, 1, 6, 8, 13, 39, 61, 65], "divid": [16, 34, 59, 63], "dl": [21, 24], "do": [0, 1, 29, 31, 34, 36, 37, 43, 49, 50, 54, 62, 69], "doc": [2, 19], "doc_top": 13, "document": [1, 17, 61, 69], "doe": [1, 2, 29, 40, 42, 43, 45, 47, 54, 61, 71], "doesn": [0, 1, 29, 31, 36, 42, 45, 61, 67], "doi": [5, 6, 21, 24, 64], "domain": [31, 50], "don": [31, 36, 49, 54, 62, 67], "done": [2, 50], "dot": 22, "doubl": 30, "down": [31, 36], "download": [1, 61], "download_resourc": [1, 61], "downstream": [17, 62], "dozen": 62, "drive": [62, 69], "driver": [2, 61, 64, 65, 66], "drop": [0, 2, 64], "due": [34, 59], "duncan": 62, "duplic": [1, 2, 71], "durat": [58, 63], "dure": [2, 55, 59, 62], 
"dynam": [59, 61], "e": [0, 1, 2, 4, 15, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 41, 42, 47, 48, 49, 52, 54, 56, 59, 61, 63, 65, 66, 67, 71], "e2": [21, 70], "each": [0, 1, 2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19, 23, 25, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 52, 55, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "earlier": [0, 1, 2, 42], "easi": [1, 21, 62, 70], "easier": 21, "easili": 33, "easy_word": 21, "eat": 34, "echo": 31, "econom": 37, "edg": [29, 59], "edu": [1, 12, 16, 17, 70], "effect": [1, 41], "effici": 1, "effort": 55, "either": [2, 20, 52, 55, 67], "elaps": 58, "element": [1, 6, 67], "ellips": [22, 48, 64], "els": [1, 22, 47, 64], "embed": [8, 31, 34, 35, 36, 45, 46, 65, 66, 67, 69], "emili": [30, 35, 45, 46, 47, 59, 62], "emoji": [22, 42, 48, 64, 67, 71], "emot": [1, 61], "emoticon": 48, "emphas": [22, 48, 64], "emphasi": 48, "empirica": [1, 2, 71], "emploi": 45, "empti": [0, 2, 13, 64, 67], "en": [1, 21, 24, 61, 70], "en_core_web_sm": [1, 61], "enabl": 71, "enclos": 22, "encod": [1, 8], "encompass": 62, "encount": [1, 34, 35, 61], "encourag": 64, "end": [0, 1, 15, 20, 23, 34, 42, 54, 62, 63, 67], "engag": 43, "engin": 2, "english": [34, 42], "enjoi": 62, "ensur": [0, 1, 40, 42, 49, 61, 63, 67], "entir": [0, 1, 12, 28, 31, 36, 40, 41, 52, 59, 62, 73], "entiti": [0, 2, 15, 39, 64], "entityrecogn": 47, "entri": [1, 28, 61], "ep8dauru1ogvjurwdbof5h6ayfbslvughjyiv31d_as6ppbt": 5, "equal": [1, 21, 34, 37, 40, 55, 59, 61, 62, 63], "equival": [0, 1, 41, 55, 61], "eric": 62, "error": [1, 16, 61, 71], "escap": 42, "especi": [41, 62], "essenti": 51, "establish": 31, "estim": 31, "et": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "etc": [10, 15, 16, 17, 42], "evalu": [5, 47, 50], "evan": 62, "even": [0, 1, 2, 34, 37, 42, 62, 63, 67], "evenli": [34, 55], "event": [1, 34, 55, 61], "ever": 62, "everi": [1, 4, 13, 31, 34, 35, 36, 61, 62], "everybodi": [31, 36], 
"everyon": [31, 36, 47, 62], "everyth": [31, 36, 56], "everywher": [31, 36], "evolut": 35, "evolv": [35, 71], "exactli": [1, 2, 71], "examin": [40, 62, 63], "exampl": [0, 10, 11, 15, 21, 24, 29, 31, 32, 34, 37, 42, 43, 48, 50, 51, 54, 56, 59, 60, 61, 62, 67], "example_data": 1, "exce": 15, "except": [42, 67, 71], "exchang": [12, 35, 39, 40, 45, 55, 64], "exclud": [0, 41, 42], "exclus": [41, 42], "excus": 32, "exhibit": 35, "exist": [0, 1, 2, 55, 61, 62, 63, 64, 67], "expand": 49, "expect": [1, 37, 42, 47], "expected_valu": 47, "explain": [0, 29], "explan": [29, 43], "explor": [61, 62], "express": [5, 14, 30, 31, 32, 36, 38, 42, 64, 67], "extend": 1, "extens": [43, 44], "extent": [1, 4, 7, 12, 31, 34, 35, 37, 51, 55, 59, 61], "extern": 48, "extra": 51, "extract": [1, 17, 19, 28, 40, 50, 64], "extrem": [55, 56, 57], "face": [1, 51, 61], "facilit": [62, 71], "fact": [4, 35, 50, 54, 59], "factual": [17, 24, 50], "fail": [1, 61], "fals": [0, 1, 2, 31, 54, 61, 67, 71], "famili": 42, "far": [34, 35, 46, 50, 62], "faster": 14, "feat_count": 19, "featur": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67], "feature_build": [0, 1, 42, 61, 64], "feature_dict": [1, 61], "feature_method": [64, 65], "feature_nam": [1, 61], "featurebuild": [0, 2, 11, 42, 47, 69], "features_conceptu": [1, 61], "feauturebuild": 1, "few": [48, 62], "fewer": [12, 60], "fflow": 11, "field": [13, 17], "file": [0, 2, 12, 14, 19, 42, 61, 65, 67], "filenam": [1, 2, 19], "filenotfounderror": 67, "filler": [37, 60], "filler_paus": 49, "filter": [19, 62], "final": [1, 2, 34, 42, 62], "find": [1, 19, 28, 50], "fingertip": 62, "finit": 55, "first": [0, 2, 11, 12, 16, 19, 31, 34, 35, 36, 39, 40, 41, 42, 45, 46, 49, 52, 54, 59, 61, 62, 64, 67, 70, 71], "first_person": 12, "first_person_plur": 49, "first_person_raw": [12, 16], "first_person_singl": 49, "five": 37, "fix": [52, 67], "fix_abbrevi": 67, "flag": 71, "float": [0, 2, 4, 5, 6, 8, 10, 
13, 14, 16, 21, 24, 25, 28, 68], "floor": 59, "flow": [0, 1, 7, 31, 36, 39, 41, 45, 46, 61, 64], "focal": [31, 36], "focu": 41, "folder": [0, 1, 19], "follow": [0, 1, 2, 11, 16, 17, 29, 31, 32, 33, 41, 42, 47, 49, 50, 53, 55, 59, 60, 61, 64, 65, 67], "for_m": 49, "for_you": 49, "forc": [0, 1, 61], "form": 1, "formal": [1, 61], "formal_titl": 49, "format": [0, 1, 8, 17, 22, 42, 47, 48, 61, 62, 64, 67], "former": [45, 46], "formula": [33, 42, 59, 64, 70], "fornt": 1, "forward": [0, 1, 7, 39, 41, 61, 64], "forward_flow": 35, "found": [1, 2, 5, 28, 30, 33, 61, 69], "four": [1, 8], "fourth": 33, "frac": 55, "fraction": 59, "framework": [49, 50, 62], "frequenc": [28, 31, 44, 64], "frequency_dict": 28, "fridai": 34, "from": [0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 19, 21, 28, 29, 31, 32, 33, 34, 35, 36, 39, 41, 42, 49, 50, 51, 53, 55, 56, 57, 58, 61, 62, 64, 65, 66, 67, 71], "full": [1, 28, 37], "full_empirical_dataset": 1, "fulli": [32, 48], "functinon": 12, "function": [1, 2, 3, 4, 10, 11, 12, 13, 14, 16, 20, 21, 28, 31, 39, 44, 45, 46, 50, 56, 57, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73], "function_mimic_word": 28, "function_mimicry_scor": 28, "function_word_mimicri": 28, "function_word_refer": 28, "fund": 62, "further": [1, 61, 71], "furthermor": 42, "futur": [23, 66], "g": [0, 1, 2, 4, 15, 20, 29, 31, 32, 36, 37, 38, 41, 42, 47, 48, 52, 54, 59, 61, 63, 65, 66, 67, 71], "game": [1, 2, 59, 71], "gaug": [33, 52], "gener": [0, 2, 9, 11, 12, 16, 21, 31, 34, 35, 36, 40, 42, 45, 46, 49, 51, 59, 65, 66, 67, 69, 71, 72], "generaliz": 23, "generate_bert": 67, "generate_certainty_pkl": 67, "generate_lexicon_pkl": 67, "generate_vect": 67, "gensim": 40, "get": [16, 20, 21, 28, 30, 31, 36, 49, 66, 67], "get_all_dd_featur": 11, "get_centroid": 66, "get_certainti": 5, "get_certainty_scor": 64, "get_content_words_in_messag": 28, "get_conversation_level_aggreg": 65, "get_cosine_similar": 6, "get_dale_chall_easy_word": [21, 70], "get_dale_chall_score_and_classf": 64, 
"get_dd": 6, "get_dd_featur": 8, "get_dep_pair": [19, 49], "get_dep_pairs_noneg": [19, 49], "get_discursive_diversity_featur": 65, "get_first_pct_of_chat": 2, "get_first_person_word": [12, 70], "get_forward_flow": [7, 64], "get_function_word": 70, "get_function_words_in_messag": 28, "get_gini": 68, "get_gini_featur": 65, "get_info_divers": 13, "get_info_exchange_wordcount": 12, "get_liwc_count": 14, "get_max": 72, "get_mean": 72, "get_median": 72, "get_mimicry_bert": 28, "get_min": 72, "get_moving_mimicri": 28, "get_named_ent": 64, "get_nan_vector": [27, 67], "get_polarity_scor": 24, "get_politeness_strategi": 17, "get_politeness_v2": 18, "get_proportion_first_pronoun": 16, "get_question_word": 70, "get_reddit_featur": 64, "get_senti": 67, "get_stdev": 72, "get_subjectivity_scor": 24, "get_sum": 72, "get_team_bursti": 4, "get_temporal_featur": [4, 64], "get_time_diff": 23, "get_time_diff_startend": 23, "get_turn": 25, "get_turn_id": 71, "get_turn_taking_featur": 65, "get_unique_pairwise_combo": 6, "get_user_level_aggreg": 65, "get_user_level_summary_statistics_featur": 66, "get_user_level_summed_featur": 66, "get_user_max_datafram": 72, "get_user_mean_datafram": 72, "get_user_median_datafram": 72, "get_user_min_datafram": 72, "get_user_network": [11, 66], "get_user_stdev_datafram": 72, "get_user_sum_datafram": 72, "get_variance_in_dd": 26, "get_within_person_disc_rang": 27, "get_word_ttr": 16, "get_zscore_across_all_chat": 73, "get_zscore_across_all_convers": 73, "gina": 62, "gini": [1, 39, 62, 65, 68], "gini_coeffici": [11, 69], "github": [0, 1, 2, 18, 67, 71], "give": [0, 1, 29, 37], "give_ag": 49, "given": [0, 1, 5, 6, 13, 14, 28, 30, 31, 33, 34, 35, 36, 40, 41, 55, 59, 66, 67, 71], "go": [1, 34, 35, 45, 46, 50, 62], "goal": 62, "goe": 67, "good": [50, 56, 62], "goodby": 49, "googl": [0, 1], "got": [31, 36], "gotta": [31, 36], "gpu": [0, 2, 67], "grade": 33, "grader": 21, "grai": 35, "grammat": 36, "granularli": 35, "grate": [42, 62], "gratitud": [17, 49, 50], 
"great": [47, 50, 51, 56, 59, 60, 62], "greater": 55, "greet": 50, "groceri": 41, "group": [0, 2, 4, 13, 29, 33, 34, 41, 52, 59, 62, 68, 71, 72], "grouping_kei": [0, 1, 2, 71], "gt": 22, "guess": 10, "gun": 1, "gy": 15, "gym": 34, "ha": [0, 1, 32, 34, 35, 37, 42, 43, 46, 52, 54, 55, 56, 59, 61, 62, 63, 67, 71], "had": [1, 31, 36, 54, 61], "hadn": [31, 36], "handl": [19, 29], "happen": [1, 2, 55, 62, 63], "happi": 42, "hardcod": 67, "harder": 21, "hashedg": [17, 50], "hasn": [31, 36], "hasneg": 50, "hasposit": 50, "hate": 31, "have": [0, 1, 10, 12, 16, 31, 34, 36, 37, 40, 41, 42, 45, 46, 50, 54, 59, 60, 61, 62, 71], "haven": [31, 36], "he": [1, 31, 36], "header": [18, 67], "hear": 32, "heart": [61, 62], "heat": 1, "heavi": 62, "hedg": [11, 30, 39, 49, 50, 64], "hei": [1, 35, 45, 46, 50], "helena": [47, 62], "hello": [0, 43, 49], "help": [0, 31, 34, 36, 43, 45, 46, 52, 58, 69], "helper": 67, "her": [30, 31, 36], "here": [1, 29, 31, 34, 41, 42, 47, 61, 62, 66], "herself": [31, 36], "hesit": [60, 64], "hi": [31, 35, 36, 43, 45, 46], "hierach": 71, "hierarch": 71, "high": [0, 1, 2, 61, 62, 71], "higher": [0, 1, 21, 31, 34, 36, 40, 41, 42, 44, 45, 46, 55, 60], "highest": 71, "highlight": 1, "him": [31, 36], "himself": [31, 36], "hmm": [31, 36], "hoc": 62, "hold": 31, "hole": 62, "home": 42, "homework": 34, "homonym": 31, "hood": 1, "hope": 35, "host": [45, 46], "hour": 48, "how": [1, 5, 28, 29, 30, 31, 34, 35, 36, 39, 43, 45, 51, 52, 54, 56, 62], "howev": [0, 1, 3, 11, 35, 40, 42, 44, 54, 56, 61, 62], "howitwork": 1, "html": [1, 2, 15, 17, 24, 61], "http": [1, 2, 4, 5, 6, 12, 13, 15, 16, 17, 18, 21, 24, 41, 45, 46, 47, 61, 64, 67, 68, 70, 71], "hu": [1, 42, 62], "hug": [1, 51, 61], "huggingfac": 1, "huh": [31, 32, 36], "human": [37, 50, 62], "hyperlink": 48, "hyphen": [1, 42, 61, 67], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 23, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 
55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 71, 73], "iby1": 5, "id": [2, 4, 7, 23, 28, 62, 64, 66, 68, 71, 72, 73], "idea": [12, 35, 40, 47, 51], "ident": [34, 35], "identif": 1, "identifi": [0, 1, 2, 4, 8, 9, 15, 23, 25, 30, 31, 41, 47, 50, 52, 61, 63, 64, 71, 72], "identiif": [13, 71], "ignor": [1, 32], "illustr": [1, 41, 48, 62], "imagin": 1, "immedi": [31, 35, 64], "impact": [1, 60], "impersonal_pronoun": 49, "implement": 64, "impli": 37, "import": [31, 32, 36, 44, 45, 62, 69], "incent": 13, "includ": [0, 1, 2, 10, 17, 22, 31, 32, 35, 36, 42, 45, 46, 51, 52, 56, 61, 62, 66, 71], "inclus": [13, 71], "incongru": [8, 34], "incorpor": [1, 42, 45, 46], "increas": [1, 62], "incredibli": 42, "increment": 71, "independ": 1, "index": [1, 2, 4, 13, 25, 37, 39, 55, 61, 65], "indic": [1, 2, 16, 21, 22, 30, 32, 34, 35, 36, 40, 41, 43, 44, 48, 49, 50, 52, 55, 60, 63, 71], "indirect": 50, "indirect_btw": 50, "indirect_greet": 50, "indirectli": 69, "individu": [0, 1, 5, 11, 31, 34, 37, 45, 50, 59, 60, 62, 72], "inequ": 37, "infer": [1, 51, 67], "influenc": 1, "info": [13, 18, 64], "info_divers": 13, "info_exchang": 64, "info_exchange_wordcount": [41, 64], "info_exchange_zscor": 11, "inform": [0, 6, 11, 12, 13, 24, 32, 34, 39, 48, 62, 64, 65], "informal_titl": 49, "information_divers": 11, "initi": [2, 62, 63, 64, 65, 66], "input": [0, 2, 4, 6, 12, 13, 14, 15, 16, 19, 20, 22, 28, 42, 50, 55, 60, 62, 63, 64, 65, 66, 67, 71, 72], "input_data": [25, 68, 72], "input_df": [1, 2, 42, 61, 71], "inquiri": [0, 30, 39, 52], "insid": 1, "insight": 1, "inspir": 15, "instal": [1, 61, 62], "instanc": [1, 22, 50, 59, 66], "instanti": 2, "insteac": 1, "instead": [1, 62], "instruct": [1, 61], "int": [2, 3, 10, 13, 15, 16, 19, 20, 22, 28, 63, 64, 67], "intact": 71, "integ": [0, 13, 40, 47], "intend": 59, "interact": [1, 11, 43, 44, 62, 69], "interconnect": 62, "interest": [1, 61, 62], "interfac": 62, "intermedi": [59, 64], "intern": 29, "interpret": [0, 1], "interrupt": 59, "interv": 
[58, 65], "introduc": [42, 62], "introduct": [11, 61], "invalid": 67, "invers": 64, "involv": [41, 62, 65], "io": [1, 24, 47, 61], "ipynb": [0, 1], "is_hedged_sentence_1": 10, "is_valid_term": 67, "isn": [1, 31, 36], "issu": [1, 31, 36, 37, 42, 61], "ital": 64, "italic": 22, "item": [0, 71], "its": [0, 15, 31, 35, 36, 40, 41, 47, 54, 55, 64, 69], "itself": [31, 36, 44], "jami": [0, 42], "john": 1, "jonson": 62, "journal": [5, 64], "json": [1, 61], "jurafski": 70, "juri": 1, "juries_df": 1, "jury_conversations_with_outcome_var": 1, "jury_feature_build": 1, "jury_output": 1, "jury_output_chat_level": [1, 61], "jury_output_turn_level": 1, "just": [1, 2, 31, 36, 46, 50, 59, 61, 62], "katharina": 34, "keep": [1, 71], "kei": [1, 2, 4, 19, 28, 30, 54, 61, 71], "keyword": [19, 49], "kind": [10, 62], "kitchen": 42, "knob": 0, "know": [1, 30], "knowledg": 29, "known": [1, 32, 61], "kumar": 62, "kw": 19, "l714": 67, "l81": 67, "lab": [1, 2, 62, 71], "label": [1, 15, 21, 51], "lack": [31, 38, 45, 46], "languag": [15, 31, 34, 42, 50, 62], "larg": [1, 31, 69], "larger": [0, 31, 61], "last": [1, 31], "late": 32, "later": [0, 1, 2, 42, 61], "latest": [1, 61], "latter": [31, 36], "lda": [13, 40], "learn": [1, 61, 62], "least": [10, 32, 42, 63, 67], "led": 62, "legal": 49, "lemmat": [13, 40], "len": 28, "length": [35, 39, 41, 42, 44, 67], "less": [1, 13, 32, 50, 52, 55, 62, 63], "let": [41, 49, 53], "let_me_know": 49, "letter": [49, 71], "level": [0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 14, 16, 23, 61, 64, 65, 66, 71, 72], "lexic": [1, 10, 12, 14, 16, 31, 32, 36, 42, 60, 62, 64], "lexical_featur": [14, 64], "lexical_features_v2": [10, 11], "lexicon": [0, 5, 10, 14, 30, 39, 50, 52, 67, 69], "lexicons_dict": 67, "librari": [34, 51, 56, 57], "lift": 62, "light": 61, "like": [1, 22, 31, 34, 36, 41, 50, 61, 62], "limiat": 32, "limit": [11, 32, 37, 42, 54], "line": [0, 1, 19, 22, 48, 61, 62, 64, 67], "linear": 64, "linguist": [0, 18, 19, 30, 39, 50, 52], "link": [22, 29, 48, 50, 64], "list": 
[1, 2, 6, 7, 10, 11, 12, 13, 15, 19, 20, 21, 22, 28, 31, 33, 36, 37, 42, 48, 49, 50, 53, 54, 61, 64, 65, 66, 67, 68, 70, 71], "liter": 42, "literatur": 62, "littl": 38, "littlehors": 1, "liu": [42, 52], "live": [1, 54], "liwc": [0, 2, 14, 30, 39, 51, 52, 56, 62, 64, 67], "liwc2015": 42, "liwc_featur": [10, 14], "liwc_test_output": 42, "lix": 34, "ll": [1, 31, 36, 61], "load": [2, 19, 67, 69], "load_custem_liwc_dict": 2, "load_liwc_dict": 67, "load_saved_data": 19, "load_to_dict": 19, "load_to_list": 19, "loc": 15, "local": [1, 42, 51, 61], "locat": [1, 62], "long": 4, "longer": [30, 41, 43, 48, 61, 62], "look": [2, 34, 61, 65, 66], "loos": 36, "lot": [31, 36], "loud": 60, "love": [31, 56], "low": [1, 2, 29, 55, 60, 71], "lower": [0, 1, 21, 31, 33, 36, 41, 44, 55, 60], "lowercas": [2, 13, 40, 48, 49, 71], "lowest": 71, "lpearl": 16, "lst": 6, "m": [0, 2, 23, 30, 31, 36], "made": [1, 23, 35, 59, 61, 62], "magnitud": 55, "mai": [1, 2, 11, 28, 31, 32, 35, 36, 37, 41, 42, 43, 44, 54, 61, 62, 71], "main": [1, 2, 5, 62, 64, 65, 66, 67], "make": [1, 5, 31, 34, 55, 56, 62, 66, 69, 71], "man": 62, "mani": [1, 4, 11, 32, 37, 41, 60, 62, 66], "manner": [55, 62], "manual": [1, 61], "map": [13, 34, 67], "mark": [19, 20, 22, 43, 54, 64, 71], "marker": [18, 32, 39, 42, 50, 51, 52, 54, 56], "marlow": 44, "matarazzo": 62, "match": [1, 5, 16, 19, 30, 67], "math": 34, "matter": [28, 47], "max": [0, 1, 2, 11, 66, 72], "max_num_chunk": 63, "max_user_mean_num_word": 1, "maxim": [34, 35, 37, 72], "maximum": [1, 63, 65, 72], "mayb": [38, 47], "mcfarland": 70, "me": [31, 32, 36, 41, 50, 53], "mean": [0, 1, 2, 4, 6, 11, 13, 21, 29, 31, 34, 36, 40, 41, 42, 47, 55, 56, 58, 61, 62, 65, 66, 72, 73], "mean_num_word": 1, "meaning": [31, 41, 55], "meaningless": 41, "meant": 39, "measur": [0, 1, 7, 12, 13, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 51, 52, 54, 55, 56, 57, 58, 59, 60, 62, 64, 68], "mechan": 32, "median": [0, 1, 72], "medium": 21, "meet": 48, "member": [13, 
34, 37, 55], "merg": [2, 8, 65, 66], "merge_conv_data_with_origin": 2, "messag": [0, 1, 2, 3, 4, 5, 8, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 28, 30, 31, 34, 35, 36, 37, 39, 41, 45, 46, 47, 48, 50, 51, 52, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 71, 73], "messaga": 61, "message_col": [0, 1, 2, 12, 13, 14, 61, 64, 65, 67, 71], "message_col_origin": 14, "message_embed": [6, 7, 8], "message_lower_with_punc": 71, "metadata": [0, 1], "method": [1, 5, 31, 41, 50, 62, 65], "metric": [0, 1, 8, 30, 34, 35, 46, 47, 48, 55, 66], "michael": 1, "mid": [1, 2, 71], "middl": [21, 34, 63], "might": [0, 1, 29, 43, 48, 53], "mikeyeoman": [18, 64], "mileston": 34, "millisecond": [0, 2], "mimic": [28, 31, 36, 45], "mimic_word": 28, "mimick": [28, 31, 64], "mimicri": [0, 1, 28, 31, 35, 36, 39, 61, 64], "mimicry_bert": [45, 46], "min": [1, 2, 11, 72], "mind": [1, 35, 50], "mine": [31, 36, 53, 59], "minim": [0, 41, 60], "minimum": [65, 72], "minmiz": 72, "minu": [12, 41, 64], "minut": [55, 58], "mirror": 1, "miss": [1, 32, 61], "mitig": [31, 36], "mizil": [49, 50], "mm": [31, 36], "mnsc": 6, "modal": 50, "mode": 60, "model": [1, 2, 13, 15, 31, 34, 35, 36, 40, 45, 46, 47, 51, 62, 67], "model_bert": 67, "modif": 35, "modifi": [1, 9, 19, 32, 64], "modul": [0, 1, 11, 34, 49, 50, 61, 69], "monologu": 59, "more": [0, 1, 2, 11, 12, 22, 23, 24, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 45, 46, 50, 52, 55, 59, 61, 62, 71], "morn": 1, "most": [1, 24, 31, 55, 62, 69], "motiv": 61, "move": [0, 1, 28, 31, 36, 39, 45, 59, 61], "movi": 31, "much": [1, 28, 31, 34, 35, 36, 45, 62], "multi": [1, 2, 71], "multidimension": [45, 46], "multipl": [0, 1, 2, 19, 62, 71], "must": [1, 6, 62, 71], "my": [30, 31, 35, 36, 45, 46, 50, 53], "my_chat_featur": 1, "my_feature_build": 61, "my_fil": 1, "my_output": 61, "my_output_chat_level": 61, "my_output_conv_level": 61, "my_output_user_level": 61, "my_pandas_datafram": 61, "myself": [31, 36, 53], "n": [0, 2, 35, 45, 46, 47, 57, 59, 60], "n_chat": 59, "na": 
[5, 33, 43, 44, 48, 49, 50, 53, 58], "naiv": [2, 20, 32, 34, 38, 39, 53, 56, 57, 64], "name": [0, 2, 4, 7, 8, 9, 12, 13, 14, 15, 17, 19, 23, 25, 28, 30, 32, 35, 39, 42, 45, 46, 50, 51, 56, 63, 64, 66, 67, 68, 71, 72, 73], "name_to_train": 47, "named_ent": [15, 47], "named_entity_recognition_featur": 11, "nan": [0, 34, 67], "nate": [35, 45, 46], "nathaniel": [35, 45, 46], "nativ": 50, "natur": [43, 55], "ndarrai": 68, "nearest": [13, 40], "nearli": 62, "necessari": [63, 67], "need": [0, 1, 2, 21, 62, 66, 67], "need_sent": 67, "need_senti": 67, "neg": [1, 24, 29, 31, 34, 35, 36, 42, 50, 51, 52, 54, 56, 61, 62, 67], "negat": [19, 49], "negative_bert": [0, 1, 51, 61], "negative_emot": [49, 51, 52, 56], "negoti": 62, "neighborhood": 54, "neither": 30, "ner": 15, "ner_cutoff": [0, 1, 2, 47, 64], "ner_train": 64, "ner_training_df": [0, 1, 2, 47], "nest": [0, 1, 2, 22, 71], "net": [45, 46], "network": 11, "neutral": [1, 5, 24, 30, 51, 55, 61, 67], "neutral_bert": [1, 51, 61], "never": 1, "new": [1, 4, 13, 34, 61, 64, 65, 66, 72], "new_column_nam": 72, "next": [1, 32, 47, 58], "nice": [1, 50, 54, 61], "nicknam": 1, "niculescu": [49, 50], "night": 31, "nikhil": [59, 62], "nltk": [1, 42, 61], "nobodi": [31, 36], "nois": 32, "non": [1, 2, 28, 31, 37, 42, 48, 61, 62, 67, 71], "none": [1, 2, 19, 37, 55, 61, 64, 65, 66, 67], "nor": 30, "normal": [19, 28, 31], "notabl": 62, "note": [0, 2, 12, 16, 20, 42, 61, 67, 71], "notebook": [0, 1], "noth": [31, 36, 56], "noun": 1, "novel": [45, 46], "now": [0, 1], "nowher": [31, 36], "np": [67, 68], "ntri": 32, "null": 34, "num": 48, "num_char": 65, "num_chunk": [27, 63], "num_hedge_word": 10, "num_messag": 65, "num_named_ent": [15, 47], "num_row": 63, "num_top": 13, "num_word": [12, 16, 65], "number": [0, 1, 3, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 25, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 47, 48, 49, 54, 56, 58, 59, 60, 62, 63, 64, 66, 67, 69, 71, 72], "numer": [0, 1, 2, 11, 13, 33, 68, 72, 73], "numpi": [1, 61, 68], "o": 35, 
"object": [1, 2, 11, 19, 44, 50, 57, 58, 61, 62, 64, 65, 66], "obtain": [0, 1, 13, 17, 23, 24, 34, 42, 61], "occur": [0, 4, 31, 42, 71], "occurr": 19, "off": [0, 1, 31, 36], "offer": 0, "offici": [61, 67], "often": [28, 36, 47, 48, 62], "oh": [31, 36, 48], "okai": [31, 36], "older": [1, 49, 61], "on_column": [18, 23, 28, 68, 72, 73], "onc": [1, 2, 11, 58, 61, 62, 67], "one": [0, 1, 2, 4, 10, 12, 19, 23, 25, 28, 29, 31, 32, 36, 37, 47, 51, 56, 59, 61, 62, 67, 68, 71, 73], "ones": [31, 36], "onli": [0, 1, 2, 5, 11, 23, 29, 31, 32, 34, 36, 37, 45, 53, 58, 59, 61, 62, 67, 71], "onlin": [1, 32, 39, 64], "onward": 0, "open": [0, 62, 66], "operation": [39, 50, 59], "opinion": [24, 31], "oppos": [2, 31, 34, 35, 55], "opposit": 34, "option": [1, 2, 37, 62, 63, 67, 71], "order": [0, 1, 35, 37, 42, 67, 71], "org": [2, 6, 15, 21, 24, 41, 70], "organ": 1, "origin": [1, 2, 5, 12, 21, 31, 32, 35, 36, 37, 45, 46, 49, 50, 59], "orthogon": 34, "other": [0, 1, 2, 9, 11, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 45, 46, 48, 51, 52, 54, 56, 58, 59, 61, 62, 64, 66, 71], "other_lexical_featur": [11, 64], "otherwis": [2, 10, 21, 32, 38, 63, 67, 71], "our": [0, 1, 2, 11, 13, 29, 31, 32, 36, 37, 39, 53, 59, 61, 71], "ourselv": 53, "out": [1, 16, 19, 31, 36, 42, 55, 60, 62], "outcom": [1, 44, 62], "output": [0, 2, 10, 17, 19, 40, 42, 61, 62, 64, 67], "output_file_bas": [0, 1, 2, 42, 61], "output_file_path_chat_level": [1, 2], "output_file_path_conv_level": [1, 2], "output_file_path_user_level": [1, 2], "output_path": 67, "outsid": [1, 2, 12], "over": [1, 16, 29, 31, 34, 35, 36, 37, 53, 55, 60, 62, 67, 71], "overal": [30, 31, 34, 36, 45, 46], "overrid": [0, 1, 2], "overview": [0, 61, 62], "overwhelmingli": 1, "overwritten": 1, "own": [0, 1, 2, 9, 35, 62, 64], "p": 55, "pacakg": 24, "pace": [43, 62], "packag": [17, 18, 40, 62], "pad": 19, "page": [1, 11, 29, 39, 61, 62, 69], "pair": [6, 19, 34, 49, 71], "pairwis": [6, 34], "panda": [0, 1, 2, 12, 14, 16, 23, 47, 64, 65, 66, 71, 72, 73], 
"paper": [4, 5, 12, 18, 29, 40, 49, 50, 64], "paragraph": 22, "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 47, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "paramt": 1, "pardon": 32, "parenthes": [22, 42, 48, 64], "parenthet": [22, 48], "pars": [2, 16, 50, 60], "parser": 67, "part": [1, 10, 13, 29, 36, 42, 52, 67, 71], "particip": [1, 9, 37, 62], "particl": [31, 36], "particular": [1, 11, 31, 32, 34, 41, 45, 47, 51, 59, 62], "particularli": 42, "partner": 32, "pass": [1, 13, 21, 47, 71], "path": [1, 2, 19, 61, 67], "path_in": 19, "pattern": [4, 11, 19, 42, 55, 62, 67], "paus": 4, "pd": [1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 18, 19, 23, 25, 63, 64, 65, 66, 67, 68, 71], "pdf": [5, 12, 13, 16, 18, 21, 24, 64, 70], "penalti": 1, "pennebak": [0, 12, 37, 41, 42, 52], "pennyslvania": 62, "peopl": [1, 32, 59, 62], "per": [1, 6, 9, 19, 42, 63, 66, 72], "percentag": [2, 21], "perfect": [37, 59], "perform": [0, 1, 16, 50], "perhap": 1, "period": [4, 34, 55], "person": [1, 8, 12, 15, 16, 32, 34, 39, 41, 42, 50, 59, 62, 64, 70], "perspect": 1, "petrocelli": 5, "phrase": [19, 30, 38, 54], "phrase_split": 19, "pickl": [19, 67], "piec": [36, 42, 59, 63], "pl": 50, "place": [55, 61, 62], "plan": [34, 35, 45, 46], "player": 59, "pleas": [0, 1, 38, 49, 50, 61, 62], "please_start": 50, "point": [22, 24, 34, 35, 42, 45, 46, 48, 52, 64, 66], "poisson": 55, "polar": [24, 39, 51, 52, 64], "polit": [1, 17, 18, 30, 32, 38, 39, 42, 51, 52, 54, 56, 64], "politeness_featur": 11, "politeness_v2": 11, "politeness_v2_help": 11, "politenessstrategi": [17, 50], "portion": 0, "posit": [0, 1, 11, 15, 24, 29, 31, 39, 42, 50, 51, 54, 56, 61, 62, 64, 67], "positive_affect_lexical_per_100": [51, 52, 56], "positive_bert": [0, 1, 51, 61], "positive_emot": [49, 51, 52, 56], "positivity_bert": [1, 61], "positivity_zscor": 64, "positivity_zscore_chat": 52, "positivity_zscore_convers": 52, "possess": 31, "possibl": [1, 34, 62, 66], "possibli": [38, 62], 
"ppron": 67, "practic": [34, 35], "pre": [1, 4, 21, 37, 49, 64], "preced": [31, 35, 71], "precend": 35, "precis": 47, "precomput": 51, "predefin": 19, "predetermin": [31, 36], "predict": [2, 47, 51, 64], "prefer": [0, 1], "preload_word_list": 69, "prep_simpl": 19, "prep_whol": 19, "preposit": [31, 36], "preproces": 48, "preprocess": [0, 1, 2, 13, 19, 40, 43, 49, 51, 61, 69], "preprocess_chat_data": 2, "preprocess_conversation_column": 71, "preprocess_naive_turn": 71, "preprocess_text": 71, "preprocess_text_lowercase_but_retain_punctu": 71, "presenc": [2, 32, 67], "present": [1, 2, 14, 30, 31, 38, 42, 55, 62], "preserv": 42, "prespecifi": 19, "prevent": 51, "previou": [1, 7, 28, 31, 36, 45, 46, 58, 64, 71], "primari": 34, "print": [2, 71], "prior": [2, 64, 71], "priya": [47, 62], "probabl": [15, 47], "problem": 62, "procedur": 62, "proceed": 46, "process": [0, 1, 2, 4, 10, 21, 37, 42, 55, 62, 64, 65, 67, 69, 71], "prodi": 15, "produc": [1, 2, 34], "product": 15, "professor": 62, "progress": [1, 2], "project": [54, 62], "pronoun": [12, 16, 31, 36, 39, 41, 42, 64, 67, 70], "proper": 1, "properli": 42, "properti": [1, 11, 61], "proport": [16, 39, 42, 64], "propos": 37, "provid": [0, 1, 2, 15, 29, 30, 33, 36, 39, 44, 47, 54, 62], "proxi": 42, "pseudonym": 1, "psycholog": 42, "pub": 70, "publish": [5, 30, 64], "pubsonlin": 6, "punctuat": [0, 2, 16, 19, 20, 21, 28, 43, 54, 60, 67, 71], "punctuation_seper": 19, "puncut": 48, "pure": [24, 36], "purpos": 1, "put": [34, 42, 50, 62, 66], "py": [0, 1, 14, 49, 61, 64, 67], "pydata": 2, "pypi": [1, 61], "python": [1, 32, 41, 56, 57, 61, 62, 68], "qtd": 62, "qualiti": 41, "quantifi": [31, 36, 62], "quantiti": [37, 39, 41, 47], "quartil": 50, "question": [16, 19, 20, 29, 32, 39, 49, 50, 64, 66, 68, 70], "question_num": 11, "question_word": 20, "quick": [1, 43], "quickli": 0, "quit": 40, "quot": [22, 48, 64], "quotat": [22, 48], "rabbit": 62, "rain": 41, "rais": [2, 67, 71], "random": 55, "rang": [5, 8, 24, 30, 33, 34, 35, 40, 51, 
53, 55, 56, 57], "ranganath": [16, 31, 32, 36, 38, 43, 54, 70], "ranganath2013": 70, "ranganathetal2013_detectingflirt": 16, "rapid": [1, 4], "rare": [34, 35], "rate": [42, 51], "rather": [1, 31, 34, 35, 36, 37, 45, 46, 63], "ratio": [16, 39, 64], "raw": [0, 12, 16, 21, 31, 33, 42, 50, 64], "re": [1, 31, 36, 42, 50, 61], "reach": 42, "read": [0, 1, 2, 16, 21, 29, 33, 61, 62, 64, 65, 66, 67], "read_csv": 1, "read_in_lexicon": 67, "readabl": [11, 33, 64, 70], "reader": 33, "readi": 1, "readili": 62, "readthedoc": [1, 24, 61], "real": [1, 55], "realit": 13, "realli": [31, 36, 50], "reason": [31, 36, 45, 46, 49], "reassur": 49, "recal": 47, "recent": [0, 50], "recept": [18, 32, 39, 42, 50, 51, 52, 54, 56, 62, 64], "recogn": [1, 42, 43, 47], "recognit": [0, 2, 39, 64], "recommend": [0, 42, 62], "reddit": [48, 64], "reddit_tag": 11, "redditus": 48, "reduc": 63, "reduce_chunk": 63, "redund": [42, 62], "refer": [0, 1, 2, 11, 22, 24, 28, 31, 42, 48, 52, 61, 62, 64, 70], "reflect": [37, 43], "regardless": 1, "regener": [0, 2, 51, 67], "regenerate_vector": [0, 1, 2, 67], "regex": [14, 16, 42, 49, 67], "regist": 37, "regress": 1, "regular": [5, 14, 30, 32, 42, 55, 58, 67], "reichel": [53, 58, 60], "reidl": [4, 13], "reinvent": 62, "rel": [41, 51, 52, 55, 60, 64], "relat": [1, 61, 62, 64], "relationship": 36, "relev": [1, 29, 42, 44, 49, 51, 56, 61, 64, 65], "reli": [31, 34, 35, 36, 69], "reliabl": [33, 42], "remain": [1, 30, 71], "rememb": 1, "remov": [0, 2, 9, 13, 19, 28, 40, 43, 48, 49, 50, 71], "remove_active_us": 9, "remove_unhashable_col": 71, "renam": 1, "repair": [16, 39], "repeat": [60, 71], "repetit": 60, "replac": 19, "report": [1, 61], "repres": [2, 4, 6, 7, 11, 13, 23, 31, 34, 36, 42, 45, 46, 64, 66, 67, 68, 71, 72, 73], "represent": [34, 38, 67], "reproduc": [36, 62], "republican": 1, "request": [32, 50, 51], "requir": [0, 1, 20, 21, 31, 55, 61, 62, 64, 65, 66, 67, 71], "research": [1, 62], "reserv": 0, "resolv": 62, "resourc": [1, 39, 48, 61, 62], "respect": [1, 
2, 12, 31, 36, 37, 69], "respons": [22, 48, 55, 58, 64], "restaur": [34, 56], "restor": 0, "restrict": 71, "result": [40, 55, 65, 72], "retain": [2, 16, 20, 21, 60, 71], "retriev": 50, "retunr": 3, "return": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 32, 43, 49, 50, 51, 55, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "reveal": 62, "revert": 50, "review": 62, "rewrit": 50, "rich": 62, "riedl": [13, 40, 55], "right": [31, 36, 61, 62], "roberta": [0, 1, 39, 42, 52, 56, 61, 64, 67], "robust": 13, "rocklag": [5, 30, 64], "room": 59, "root": [13, 40], "rough": [12, 54], "roughli": 31, "round": [13, 40, 59, 71], "round_num": 1, "row": [0, 1, 2, 9, 13, 25, 37, 40, 59, 63, 68, 71, 72, 73], "rowbotham": 62, "rucker": 5, "rule": [1, 69], "run": [0, 10, 12, 16, 35, 46, 47, 48, 51, 61, 69], "runtim": [1, 35], "ryan": [0, 42], "ryanboyd": 67, "sagepub": [5, 64], "sai": [1, 32, 50, 59], "said": [1, 36, 62], "same": [0, 1, 2, 31, 34, 37, 45, 48, 52, 59, 60, 62, 71], "sampl": [61, 62], "sarcast": 48, "save": [0, 1, 2, 19, 64, 67], "save_featur": 2, "sbert": [0, 1, 28, 31, 34, 35, 36, 45, 46, 64, 65, 67], "scale": [42, 51], "schema": 1, "scheme": 0, "school": [21, 62], "scienc": [29, 39, 62], "scientist": [61, 62], "score": [1, 4, 5, 11, 12, 13, 15, 21, 24, 28, 29, 30, 31, 34, 35, 36, 38, 39, 40, 45, 46, 47, 50, 51, 53, 56, 57, 61, 64, 65, 67, 73], "script": [1, 61], "sea": 1, "seamless": 62, "search": [19, 61], "second": [0, 1, 4, 34, 42, 58, 59], "second_person": 49, "secr": [18, 49, 50, 64], "section": [1, 29, 61], "see": [0, 1, 2, 11, 30, 34, 38, 41, 45, 46, 47, 55, 62, 71], "seek": [5, 62], "seen": 67, "segment": [0, 19], "select": [2, 4, 23, 28, 36, 45, 64, 66, 67, 68, 71, 72, 73], "self": [1, 2, 61], "semant": [31, 34, 35, 41], "semantic_group": [1, 61], "send": [1, 37, 55], "sens": [1, 5, 31, 54, 66], "sensibl": 1, "sent": [1, 37, 64], "sentenc": [0, 1, 10, 15, 19, 20, 21, 33, 34, 35, 36, 42, 45, 46, 47, 48, 54, 56, 61, 67], 
"sentence_pad": 19, "sentence_split": 19, "sentence_to_train": 47, "sentencis": 19, "sentiment": [0, 1, 24, 31, 39, 42, 52, 56, 61, 62, 64, 67], "sentimet": 1, "separ": [1, 2, 19, 34, 42, 51, 67], "sepcifi": 1, "septemb": 40, "sequenc": [1, 59], "sequenti": 1, "seri": [12, 16, 23, 28, 42, 71, 73], "serv": 12, "set": [0, 1, 2, 13, 34, 48, 59], "set_self_conv_data": 2, "sever": [1, 30, 41, 42, 48, 51, 56, 61], "shall": 54, "share": [31, 36, 37], "she": [30, 31, 36], "shift": 34, "shop": 62, "short": [55, 58], "shorter": [13, 40, 41, 42, 43], "should": [0, 1, 2, 4, 14, 23, 28, 29, 31, 36, 47, 48, 54, 61, 62, 64, 66, 67, 68, 69, 71, 72, 73], "shouldn": [31, 36], "show": [1, 37, 61], "showeth": 62, "shruti": [35, 45, 46, 47, 62], "side": 31, "signal": [45, 55], "signifi": 42, "signific": [1, 61], "silent": 37, "similar": [1, 6, 7, 13, 28, 29, 31, 34, 35, 36, 40, 45, 46, 49, 50, 62, 65], "similarli": [1, 35], "simpl": [0, 1, 16, 19, 42, 61, 62], "simpli": [1, 5, 11, 28, 42, 56, 62], "simplifi": 1, "simplist": 41, "sinc": [1, 32, 41, 71], "singh": 62, "singl": [0, 1, 2, 11, 12, 19, 23, 31, 34, 35, 36, 37, 41, 45, 46, 59, 62, 71, 72], "singular": [12, 41, 64], "site": 16, "situat": 37, "size": [1, 13, 63, 67], "skip": 1, "slightli": [32, 62, 63], "slow": 1, "small": 40, "so": [1, 2, 10, 30, 31, 36, 37, 42, 50, 61, 62, 66, 67], "social": [29, 39, 61, 62], "socsci": 16, "softwar": 62, "sohi": 62, "sol3": 4, "solut": [1, 59], "solv": 62, "some": [0, 1, 11, 17, 29, 32, 34, 35, 37, 41, 61, 63], "somebodi": [31, 36], "someon": [22, 29, 31, 36, 47, 48, 61, 64], "someplac": [31, 36], "someth": 47, "sometim": 1, "somewhat": 35, "soon": 62, "sorri": [16, 32, 50], "sort": [10, 42, 67], "sort_word": 67, "sound": [47, 51], "sourc": [4, 5, 6, 12, 13, 16, 17, 21, 34, 35, 50, 64, 68], "space": [34, 40, 42, 67, 71], "spaci": [1, 19, 47, 49, 50, 61], "span": 63, "spars": 32, "speak": [1, 31, 36, 37, 59, 60, 62], "speaker": [0, 1, 2, 6, 8, 9, 25, 31, 34, 35, 37, 38, 42, 45, 46, 61, 66, 71, 
72], "speaker_id": [2, 61, 72], "speaker_id_col": [0, 1, 2, 6, 8, 9, 25, 26, 27, 61, 65, 66, 71, 72], "speaker_nicknam": [0, 1, 2, 6, 9, 59, 66], "special": [0, 1, 2, 48, 71], "specif": [1, 2, 12, 32, 41, 48, 55, 61, 62, 69, 71], "specifi": [1, 2, 19, 47, 49, 65, 66, 67, 68, 71, 72, 73], "speciifc": 63, "speed": 1, "spend": [51, 62], "spike": 55, "split": [19, 21, 43, 63], "spoke": 59, "spoken": [11, 37], "spread": 55, "squar": [13, 40], "src": 67, "ssrn": 4, "stabl": 40, "stack": 14, "stackoverflow": 68, "stage": [1, 2, 34, 71], "stamp": 55, "standard": [1, 4, 37, 40, 41, 42, 49, 55, 58, 60, 65, 72, 73], "stanford": 70, "start": [15, 19, 20, 22, 23, 50], "statement": [1, 38, 42, 47, 48, 61, 62, 64], "statist": [1, 65, 66, 68], "statologi": 41, "stdev": [1, 2, 11, 65, 66], "stem": 42, "step": [1, 4, 28, 41, 45, 46, 51], "still": [1, 41, 45, 46], "stochast": 40, "stop": [40, 62], "stopword": [13, 19], "store": [1, 12, 16, 41, 49, 51, 61, 65, 67], "stoword": 42, "str": [2, 3, 4, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 63, 64, 65, 66, 67, 68, 71, 72, 73], "str_to_vec": 67, "str_vec": 67, "straightforward": 29, "strategi": [17, 30, 32, 38, 39, 42, 49, 54, 64], "stream": 35, "strictli": 1, "string": [0, 1, 2, 4, 8, 12, 13, 14, 19, 23, 24, 50, 64, 66, 67, 68, 71, 72, 73], "strongli": [1, 41, 61], "structur": [0, 36, 49], "student": [21, 33], "studi": [1, 34, 62], "style": [1, 31, 36, 59], "sub": [0, 1, 71], "subfold": 1, "subject": [5, 24, 28, 39, 49, 64], "subjunct": 50, "sublist": 28, "submiss": 55, "subpart": [1, 71], "subsequ": [1, 30, 51, 58], "subset": 62, "substanc": 36, "substant": 31, "substanti": 1, "substr": 30, "subtask": 1, "subtract": [41, 58], "succe": 62, "success": [0, 1, 4, 31, 36, 43, 55, 58], "suggest": [1, 13, 34, 42, 44, 50], "suit": [62, 64], "suitabl": 2, "sum": [1, 28, 34, 61, 64, 65, 66, 72], "summar": [0, 1, 69], "summari": [65, 66, 72], "summariz": [0, 65], "summarize_featur": 69, "suppl": 6, "support": [1, 15, 
42, 61], "suppos": 1, "sure": 30, "swear": 49, "symbol": 67, "syntax": [1, 32, 61], "system": [2, 59, 64], "t": [0, 1, 15, 29, 31, 36, 42, 45, 49, 54, 61, 62, 67], "tabl": [1, 62], "tag": 39, "take": [1, 4, 5, 9, 14, 25, 29, 31, 34, 37, 39, 42, 55, 61, 65, 67, 71], "taken": [59, 71], "talk": [1, 37, 47, 59, 62], "tandem": [1, 61], "target": 15, "task": [1, 2, 59, 71], "tausczik": [12, 37, 41, 52], "tausczikpennebaker2013": 12, "team": [0, 1, 4, 11, 12, 13, 34, 39, 40, 42, 59, 65], "team_bursti": 4, "team_comm_tool": [1, 61], "teamcommtool": 1, "technic": [29, 39, 61, 62], "teghxgbqdhgaaaaa": 5, "tempor": [0, 2, 55, 58, 64, 71], "temporal_featur": 11, "tend": [1, 34, 60], "term": [1, 28, 59, 67], "termin": [1, 2, 61], "terribl": 51, "test": [13, 33, 47], "text": [0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 28, 32, 33, 36, 42, 48, 55, 62, 64, 67, 71], "text_based_featur": 64, "textblob": [24, 39, 51, 52, 64], "textblob_sentiment_analysi": 11, "than": [0, 1, 2, 11, 13, 31, 34, 35, 36, 37, 40, 41, 45, 46, 54, 60, 62, 63], "thee": 62, "thei": [0, 1, 11, 28, 29, 31, 34, 36, 37, 39, 42, 47, 58, 59, 61, 62, 67], "them": [0, 1, 2, 19, 28, 29, 31, 36, 50, 51, 55, 59, 61, 62, 64, 65, 66, 67], "themselv": [31, 36, 60], "theoret": 35, "theori": [34, 50], "therebi": 0, "therefor": [0, 1, 11, 28, 37, 45, 59, 62, 69], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 71, 72, 73], "thing": [48, 61], "think": [1, 38, 47], "thorough": [43, 62], "those": [1, 21, 31, 36, 61, 71], "though": [34, 42, 50], "thought": [1, 35, 45], "thread": [1, 61], "three": [0, 1, 2, 22, 34, 37, 40, 51, 61, 62, 69, 71], "threshold": [15, 47], "through": [1, 45, 46, 50, 61, 62], "throughout": [31, 35, 36, 40, 45, 46, 55, 63], "tht": 35, "thu": [1, 34, 35, 36, 37, 46, 55, 71], "time": [0, 1, 2, 4, 23, 34, 35, 39, 42, 
48, 51, 55, 59, 61, 62, 63, 64, 65, 66, 71], "time_diff": 55, "timediff": 4, "timestamp": [0, 1, 2, 8, 23, 58, 61, 62, 63, 64, 71], "timestamp_col": [0, 1, 2, 8, 61, 63, 64, 65, 71], "timestamp_end": [1, 23, 61], "timestamp_start": [1, 23, 61], "timestamp_unit": [0, 2, 23, 64], "to_datetim": [0, 2], "todai": [34, 35, 41, 43, 45, 46, 47], "todo": 66, "togeth": [0, 62, 66], "token": [16, 19, 39, 49, 54, 64, 67], "token_count": [19, 49], "too": [30, 31, 36, 62], "took": [1, 59], "tool": [1, 61, 62], "toolkit": [0, 1, 11, 42, 45, 46, 55, 62, 65, 66], "top": [1, 50, 59], "topic": [1, 13, 31, 34, 40, 42, 43, 65], "tormala": 5, "total": [0, 1, 3, 12, 16, 25, 31, 34, 36, 37, 41, 44, 53, 59, 60, 61, 62, 63, 64, 66, 72], "touch": [1, 61], "toward": [31, 36, 38, 42, 45, 46], "track": [65, 66], "tradit": 49, "train": [1, 2, 15, 64], "train_spacy_n": 15, "transcript": 0, "transfom": [45, 46], "transform": [1, 31, 34, 35, 36, 51], "transform_utter": 50, "treat": [0, 1, 42, 59, 61], "tri": 50, "trivial": [3, 44, 62], "troubl": [1, 61], "true": [0, 1, 2, 37, 61, 63, 65, 66, 67, 71], "truncat": 2, "truth_intensifi": 49, "ttr": 64, "tupl": [0, 1, 2, 15, 19, 64], "turn": [0, 2, 25, 28, 31, 32, 37, 39, 61, 64, 65, 71], "turn_count": 59, "turn_df": 71, "turn_id": 71, "turn_taking_featur": 11, "twice": 63, "twitter": [1, 51, 61], "two": [0, 1, 2, 23, 31, 34, 36, 41, 45, 46, 52, 62, 63, 67], "txt": 19, "type": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 37, 39, 52, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "typic": [1, 34, 40, 41, 42, 52, 60], "u": [0, 1, 2, 22, 31, 36, 48, 49, 58], "uci": 16, "uh": [31, 36], "ulrich": 55, "um": [31, 36, 60], "umbrella": [8, 29, 34], "uncertain": [5, 30], "uncertainti": 30, "under": [0, 1, 10, 11, 12, 28, 40], "underli": [1, 61], "underscor": [1, 42, 61], "understand": [0, 33, 39, 43, 48, 58, 61, 62], "understood": 33, "unhash": 71, "uninterrupt": 59, "uniqu": [0, 1, 2, 6, 9, 13, 16, 23, 25, 41, 47, 52, 60, 
61, 63, 71], "unit": [0, 2, 23], "univers": 62, "unix": 58, "unless": [31, 36], "unpack": 62, "unpreprocess": 0, "until": [31, 36, 45, 46], "unzip": [1, 61], "up": [1, 17, 21, 28, 31, 35, 36, 37, 42, 45, 46, 51, 59, 61, 67], "updat": [1, 9, 40, 54, 61], "upenn": 1, "upgrad": 50, "upload": 13, "upon": 33, "us": [0, 2, 3, 5, 11, 12, 13, 17, 19, 24, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 60, 62, 64, 65, 66, 67, 71], "usag": [0, 21, 24], "use_gpu": [0, 1, 2, 67], "use_time_if_poss": 63, "user": [0, 1, 2, 9, 14, 15, 22, 37, 42, 47, 48, 51, 61, 62, 63, 64, 65, 66, 69, 72], "user_aggreg": [0, 1, 2, 65, 66], "user_column": [0, 1, 2, 65, 66], "user_data": [2, 65, 66], "user_df": 9, "user_level_featur": 2, "user_list": 9, "user_method": [0, 1, 2, 65, 66], "userlevelfeaturescalcul": [2, 66, 69], "usernam": [22, 48], "utf": 1, "util": [1, 12, 21, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "utilti": 62, "utter": [0, 1, 2, 3, 4, 5, 13, 14, 15, 16, 17, 20, 21, 23, 24, 30, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 50, 51, 52, 54, 58, 60, 61, 67], "v": [0, 1, 13, 61], "v0": 0, "valenc": 51, "valid": [1, 55, 67, 71], "valu": [0, 1, 2, 5, 6, 10, 12, 13, 18, 19, 28, 30, 31, 34, 36, 37, 40, 41, 42, 45, 46, 47, 55, 59, 61, 64, 67, 68, 72, 73], "valueerror": [2, 71], "vari": [13, 31, 34, 35], "variabl": [1, 56, 57, 64, 65, 66], "varianc": [1, 8, 34], "variance_in_dd": 11, "variat": [4, 32], "varieti": [42, 62], "variou": [19, 42, 64, 65, 66], "vast": 62, "ve": [0, 31, 36, 50, 61], "vec": 6, "vect_data": [1, 7, 8, 28, 61, 64, 65, 66], "vect_path": 67, "vector": [0, 2, 6, 7, 8, 13, 28, 34, 35, 40, 55, 61, 64, 65, 67], "vector_data": [0, 1, 2, 61], "vector_directori": [0, 1, 2, 61, 65], "vein": 45, "verb": [19, 31, 36], "verbal": 32, "veri": [5, 28, 30, 31, 34, 35, 36, 42, 49, 54], "verifi": 2, "verify_timestamp_format": 2, "verit": 62, "version": [0, 1, 12, 14, 21, 28, 31, 40, 42, 50, 51, 61], "versu": [4, 
29, 47, 55, 59], "vert": 2, "via": [3, 44], "view": 50, "visit": 41, "voila": 62, "w": [31, 42], "wa": [0, 1, 2, 5, 12, 31, 32, 35, 36, 47, 51, 56, 59, 62, 71], "wai": [0, 1, 2, 29, 30, 31, 32, 34, 49, 50, 54, 56, 57, 61, 62, 66], "waiai": 62, "wait": [4, 55], "walk": 1, "walkthrough": [0, 61, 62], "want": [1, 28, 34, 59, 61, 62, 65, 66, 67], "warn": [1, 50, 71], "watt": [1, 2, 62, 71], "we": [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 15, 16, 18, 24, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 61, 62, 66, 67, 71], "web": 70, "websit": [1, 61], "week": 47, "weight": 66, "weigt": 31, "welcom": 61, "well": [11, 29, 31, 36, 55, 62], "went": 41, "were": [1, 2, 12, 31, 36, 42, 71], "western": 1, "wh": [19, 31, 36], "wh_question": [32, 49, 54], "wharton": 62, "what": [1, 2, 12, 16, 20, 29, 31, 32, 34, 35, 36, 39, 41, 45, 46, 47, 50, 54, 62, 63], "whatev": [1, 31, 36], "wheel": 62, "when": [1, 16, 20, 31, 33, 36, 42, 47, 54, 55, 59, 60, 61, 62, 67, 69, 71], "whenev": 71, "where": [1, 2, 19, 20, 28, 31, 32, 36, 37, 40, 41, 42, 48, 50, 51, 54, 59, 61, 65, 68, 73], "wherea": [31, 34, 35, 36, 43], "wherev": [31, 36], "whether": [1, 2, 10, 16, 19, 32, 37, 38, 41, 43, 47, 57, 58, 62, 63, 64, 67, 71], "which": [0, 1, 2, 3, 4, 5, 7, 9, 12, 13, 15, 16, 18, 25, 28, 31, 34, 35, 36, 37, 38, 40, 41, 42, 51, 53, 54, 55, 56, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 71, 72, 73], "while": [1, 31, 32, 34, 36, 37, 44, 45, 46, 55, 62, 71], "whitespac": 43, "who": [1, 20, 31, 32, 36, 47, 51, 54, 59, 60, 62], "whole": [28, 42, 59, 62, 71], "whom": [31, 36, 54], "whose": [31, 36, 54], "why": [20, 29, 31, 36, 54], "wide": 31, "wien": 62, "wiki": [21, 29, 70], "wiki_link": [1, 61], "wikipedia": [21, 33, 37, 70], "williamson": 60, "wish": [1, 2, 18, 28], "within": [0, 1, 2, 8, 11, 16, 28, 30, 31, 34, 35, 36, 41, 45, 46, 52, 55, 59, 60, 62, 63, 64, 68, 71, 73], "within_group": 2, "within_person_discursive_rang": 11, "within_task": [0, 1, 2, 71], 
"without": [1, 19, 31, 36, 42, 47, 54, 62, 69], "won": [0, 31, 36, 45], "wonder": 56, "woolei": 4, "woollei": [13, 40, 55], "wooten": 55, "word": [0, 1, 3, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 28, 30, 32, 33, 37, 38, 39, 40, 41, 43, 45, 46, 48, 49, 52, 53, 54, 56, 57, 62, 64, 65, 66, 67, 69, 70], "word_mimicri": 11, "word_start": [19, 49], "wordcount": [1, 42], "wordnet": [1, 61], "words_in_lin": 19, "work": [0, 11, 42, 47, 50, 55, 61, 62], "world": 55, "worri": 62, "would": [1, 29, 31, 34, 35, 36, 37, 42, 50, 54, 62], "wouldn": [31, 36], "wow": 50, "wp": 13, "wrap": 42, "write": [2, 29, 60], "www": [12, 13, 18, 41, 64], "x": [0, 1, 2, 4, 46, 68], "xinlan": 62, "yashveer": 62, "ye": 19, "yeah": [31, 36], "yeoman": [18, 49, 50], "yesno_quest": [32, 49, 54], "yet": 48, "ylatau": 12, "you": [0, 1, 2, 11, 24, 29, 31, 36, 37, 42, 43, 47, 50, 59, 61, 62, 69], "your": [0, 29, 31, 32, 36, 37, 50, 59, 61, 62], "your_data": 42, "yourself": [31, 36, 50], "yuluan": 62, "yup": [31, 36], "yuxuan": 62, "z": [12, 39, 49, 51, 64, 73], "z0": 67, "za": 67, "zero": [13, 52], "zhang": 62, "zheng": 62, "zhong": 62, "zhou": 62, "zscore": 41, "zscore_chat": 41, "zscore_chats_and_convers": 69, "zscore_convers": 41, "\u00bc": 47, "\u03c4": 55}, "titles": ["The Basics (Get Started Here!)", "Worked Example", "feature_builder module", "basic_features module", "burstiness module", "certainty module", "discursive_diversity module", "fflow module", "get_all_DD_features module", "get_user_network module", "hedge module", "Features: Technical Documentation", "info_exchange_zscore module", "information_diversity module", "lexical_features_v2 module", "named_entity_recognition_features module", "other_lexical_features module", "politeness_features module", "politeness_v2 module", "politeness_v2_helper module", "question_num module", "readability module", "reddit_tags module", "temporal_features module", "textblob_sentiment_analysis module", "turn_taking_features module", "variance_in_DD module", 
"within_person_discursive_range module", "word_mimicry module", "FEATURE NAME", "Certainty", "Content Word Accommodation", "Conversational Repair", "Dale-Chall Score", "Discursive Diversity", "Forward Flow", "Function Word Accommodation", "Gini Coefficient", "Hedge", "Features: Conceptual Documentation", "Information Diversity", "Information Exchange", "Linguistic Inquiry and Word Count (LIWC) and Other Lexicons", "Message Length", "Message Quantity", "Mimicry (BERT)", "Moving Mimicry", "Named Entity Recognition", "Online Discussion Tags", "Politeness/Receptiveness Markers", "Politeness Strategies", "Sentiment (RoBERTa)", "Positivity Z-Score", "Proportion of First Person Pronouns", "Question (Naive)", "Team Burstiness", "Textblob Polarity", "Textblob Subjectivity", "Time Difference", "Turn Taking Index", "Word Type-Token Ratio", "The Team Communication Toolkit", "Introduction", "assign_chunk_nums module", "calculate_chat_level_features module", "calculate_conversation_level_features module", "calculate_user_level_features module", "check_embeddings module", "gini_coefficient module", "Utilities", "preload_word_lists module", "preprocess module", "summarize_features module", "zscore_chats_and_conversation module"], "titleterms": {"0": 42, "1": 42, "5": 42, "A": 0, "One": 0, "The": [0, 61, 62], "accommod": [31, 36], "addit": 1, "advanc": 1, "aggreg": [1, 11], "analyz": 1, "assign_chunk_num": 63, "assumpt": 0, "base": 11, "basic": [0, 1, 29, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59, 60], "basic_featur": 3, "bert": 45, "bring": 42, "bursti": [4, 55], "cach": 1, "calculate_chat_level_featur": 64, "calculate_conversation_level_featur": 65, "calculate_user_level_featur": 66, "caveat": [1, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "certainti": [5, 30], "chall": 33, "chat": [11, 39], "check_embed": 67, "citat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "class": 69, "code": [0, 1], "coeffici": 37, "column": [1, 61], "commun": 61, "conceptu": 39, "configur": 1, "consider": 1, "content": [31, 61], "convers": [1, 11, 32, 39, 62, 69], "count": [42, 59], "cumul": 1, "custom": [1, 42], "customiz": 0, "dale": 33, "data": 1, "declar": 61, "demo": [0, 1], "detail": 1, "differ": 58, "directori": 1, "discurs": 34, "discursive_divers": 6, "discuss": 48, "divers": [34, 40], "document": [11, 39, 62], "driver": 69, "entiti": [1, 47], "environ": [1, 61], "exampl": [1, 41, 47], "exchang": 41, "featur": [1, 11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 69], "feature_build": 2, "featurebuild": [1, 61, 62], "fflow": 7, "file": [1, 30, 34, 35, 45, 46, 47, 51], "first": [1, 53], "flow": 35, "forward": 35, "function": [0, 36], "gener": [1, 61, 62], "get": [0, 1, 61, 62], "get_all_dd_featur": 8, "get_user_network": 9, "gini": 37, "gini_coeffici": 68, "gpu": 1, "group": 1, "hedg": [10, 38], "here": 0, "high": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "implement": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "import": [1, 61], "index": 59, "indic": 61, "info_exchange_zscor": 12, "inform": [1, 40, 41, 61], "information_divers": 13, "input": [1, 34], "inquiri": 42, "inspect": [1, 61], "interpret": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "introduct": 62, "intuit": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "kei": 0, "length": 43, "level": [11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 
55, 56, 57, 58, 59, 60, 62, 69], "lexical_features_v2": 14, "lexicon": 42, "light": 0, "linguist": 42, "liwc": 42, "marker": 49, "messag": [43, 44], "mimicri": [45, 46], "modul": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "motiv": 62, "move": 46, "naiv": 54, "name": [1, 29, 47, 61], "named_entity_recognition_featur": 15, "new": 42, "note": [1, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "onlin": 48, "other": [42, 69], "other_lexical_featur": 16, "ouput": 34, "our": 62, "output": [1, 30, 35, 45, 46, 47, 51], "overview": 1, "own": 42, "packag": [0, 1, 61], "paramet": [0, 1], "percentag": 1, "person": 53, "pip": [1, 61], "polar": 56, "polit": [49, 50], "politeness_featur": 17, "politeness_v2": 18, "politeness_v2_help": 19, "posit": 52, "preload_word_list": 70, "preprocess": 71, "pronoun": 53, "proport": 53, "quantiti": 44, "question": 54, "question_num": 20, "ratio": 60, "readabl": 21, "recept": 49, "recognit": [1, 47], "recommend": [1, 61], "reddit_tag": 22, "regener": 1, "relat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "repair": 32, "roberta": 51, "run": 1, "sampl": [0, 1], "score": [33, 41, 52], "sentiment": 51, "speaker": [11, 59, 62, 69], "start": [0, 1, 61, 62], "strategi": 50, "subject": 57, "summarize_featur": 72, "tabl": 61, "tag": 48, "take": 59, "team": [55, 61, 62], "technic": 11, "temporal_featur": 23, "textblob": [56, 57], "textblob_sentiment_analysi": 24, "time": 58, "token": 60, "toolkit": 61, "touch": 0, "train": 47, "troubleshoot": [1, 61], "turn": [1, 59], "turn_taking_featur": 25, "type": 60, "us": [1, 61], "usag": 1, "user": 11, "util": 69, "utter": [11, 39, 62, 69], "v": 42, "variance_in_dd": 26, "vector": 1, "virtual": [1, 61], "walkthrough": 1, "within_person_discursive_rang": 27, "word": [31, 36, 42, 60], 
"word_mimicri": 28, "work": 1, "your": [1, 42], "z": [41, 52], "zscore_chats_and_convers": 73}}) \ No newline at end of file +Search.setIndex({"alltitles": {"A Light-Touch, One-Function Package": [[0, "a-light-touch-one-function-package"]], "Additional FeatureBuilder Considerations": [[1, "additional-featurebuilder-considerations"]], "Advanced Configuration Columns": [[1, "advanced-configuration-columns"]], "Aggregation Overview": [[1, "id3"]], "Analyzing First Percentage (%) [Deprecated]": [[1, "analyzing-first-percentage-deprecated"]], "Base Conversation-Level Features": [[11, "base-conversation-level-features"]], "Basic Input Columns": [[1, "basic-input-columns"]], "Certainty": [[30, "certainty"]], "Citation": [[29, "citation"], [30, "citation"], [31, "citation"], [32, "citation"], [33, "citation"], [34, "citation"], [35, "citation"], [36, "citation"], [37, "citation"], [38, "citation"], [40, "citation"], [41, "citation"], [42, "citation"], [43, "citation"], [44, "citation"], [45, "citation"], [46, "citation"], [47, "citation"], [48, "citation"], [49, "citation"], [50, "citation"], [51, "citation"], [52, "citation"], [53, "citation"], [54, "citation"], [55, "citation"], [56, "citation"], [57, "citation"], [58, "citation"], [59, "citation"], [60, "citation"]], "Configuring the FeatureBuilder": [[1, "configuring-the-featurebuilder"]], "Content Word Accommodation": [[31, "content-word-accommodation"]], "Contents:": [[61, null]], "Conversation Parameters": [[1, "conversation-parameters"]], "Conversation-Level Aggregates": [[11, "conversation-level-aggregates"]], "Conversation-Level Features": [[11, "conversation-level-features"], [39, "conversation-level-features"]], "Conversational Repair": [[32, "conversational-repair"]], "Cumulative Grouping": [[1, "cumulative-grouping"]], "Custom Aggregation": [[1, "custom-aggregation"]], "Custom Features": [[1, "custom-features"]], "Customizable Parameters": [[0, "customizable-parameters"]], "Dale-Chall Score": [[33, 
"dale-chall-score"]], "Declaring a FeatureBuilder": [[61, "declaring-a-featurebuilder"]], "Demo / Sample Code": [[0, "demo-sample-code"], [1, "demo-sample-code"]], "Discursive Diversity": [[34, "discursive-diversity"]], "Example Usage of Custom Aggregation Parameters": [[1, "example-usage-of-custom-aggregation-parameters"]], "Example:": [[41, "example"]], "FEATURE NAME": [[29, "feature-name"]], "Feature Column Names": [[1, "feature-column-names"], [61, "feature-column-names"]], "Feature Documentation": [[62, "feature-documentation"]], "Feature Information": [[1, "feature-information"], [61, "feature-information"]], "Features: Conceptual Documentation": [[39, "features-conceptual-documentation"]], "Features: Technical Documentation": [[11, "features-technical-documentation"]], "Forward Flow": [[35, "forward-flow"]], "Function Word Accommodation": [[36, "function-word-accommodation"]], "Generating Features: Utterance-, Speaker-, and Conversation-Level": [[62, "generating-features-utterance-speaker-and-conversation-level"]], "Generating Vectors using GPU": [[1, "generating-vectors-using-gpu"]], "Getting Started": [[1, "getting-started"], [61, "getting-started"], [62, "getting-started"]], "Gini Coefficient": [[37, "gini-coefficient"]], "Hedge": [[38, "hedge"]], "High*Level Intuition": [[54, "high-level-intuition"]], "High-Level Intuition": [[29, "high-level-intuition"], [30, "high-level-intuition"], [31, "high-level-intuition"], [32, "high-level-intuition"], [33, "high-level-intuition"], [34, "high-level-intuition"], [35, "high-level-intuition"], [36, "high-level-intuition"], [37, "high-level-intuition"], [38, "high-level-intuition"], [40, "high-level-intuition"], [41, "high-level-intuition"], [42, "high-level-intuition"], [43, "high-level-intuition"], [44, "high-level-intuition"], [45, "high-level-intuition"], [46, "high-level-intuition"], [47, "high-level-intuition"], [48, "high-level-intuition"], [49, "high-level-intuition"], [50, "high-level-intuition"], [51, 
"high-level-intuition"], [52, "high-level-intuition"], [53, "high-level-intuition"], [55, "high-level-intuition"], [56, "high-level-intuition"], [57, "high-level-intuition"], [58, "high-level-intuition"], [59, "high-level-intuition"], [60, "high-level-intuition"]], "Implementation": [[32, "implementation"], [42, "implementation"], [52, "implementation"], [54, "implementation"]], "Implementation Basics": [[29, "implementation-basics"], [30, "implementation-basics"], [31, "implementation-basics"], [33, "implementation-basics"], [34, "implementation-basics"], [35, "implementation-basics"], [36, "implementation-basics"], [37, "implementation-basics"], [38, "implementation-basics"], [40, "implementation-basics"], [41, "implementation-basics"], [43, "implementation-basics"], [44, "implementation-basics"], [45, "implementation-basics"], [46, "implementation-basics"], [47, "implementation-basics"], [48, "implementation-basics"], [49, "implementation-basics"], [50, "implementation-basics"], [51, "implementation-basics"], [53, "implementation-basics"], [55, "implementation-basics"], [56, "implementation-basics"], [57, "implementation-basics"], [58, "implementation-basics"], [59, "implementation-basics"], [60, "implementation-basics"]], "Implementation Notes/Caveats": [[29, "implementation-notes-caveats"], [30, "implementation-notes-caveats"], [31, "implementation-notes-caveats"], [33, "implementation-notes-caveats"], [34, "implementation-notes-caveats"], [35, "implementation-notes-caveats"], [36, "implementation-notes-caveats"], [38, "implementation-notes-caveats"], [40, "implementation-notes-caveats"], [41, "implementation-notes-caveats"], [43, "implementation-notes-caveats"], [44, "implementation-notes-caveats"], [45, "implementation-notes-caveats"], [46, "implementation-notes-caveats"], [47, "implementation-notes-caveats"], [48, "implementation-notes-caveats"], [49, "implementation-notes-caveats"], [50, "implementation-notes-caveats"], [51, 
"implementation-notes-caveats"], [53, "implementation-notes-caveats"], [55, "implementation-notes-caveats"], [56, "implementation-notes-caveats"], [57, "implementation-notes-caveats"], [58, "implementation-notes-caveats"], [59, "implementation-notes-caveats"]], "Import Recommendations: Virtual Environment and Pip": [[1, "import-recommendations-virtual-environment-and-pip"], [61, "import-recommendations-virtual-environment-and-pip"]], "Important Notes and Caveats": [[1, "important-notes-and-caveats"]], "Importing the Package": [[1, "importing-the-package"]], "Indices and Tables": [[61, "indices-and-tables"]], "Information Diversity": [[40, "information-diversity"]], "Information Exchange": [[41, "information-exchange"]], "Input File": [[34, "id2"]], "Inspecting Generated Features": [[1, "inspecting-generated-features"], [61, "inspecting-generated-features"]], "Interpretation:": [[41, "interpretation"]], "Interpreting the Feature": [[29, "interpreting-the-feature"], [30, "interpreting-the-feature"], [31, "interpreting-the-feature"], [32, "interpreting-the-feature"], [33, "interpreting-the-feature"], [34, "interpreting-the-feature"], [35, "interpreting-the-feature"], [36, "interpreting-the-feature"], [37, "interpreting-the-feature"], [38, "interpreting-the-feature"], [40, "interpreting-the-feature"], [41, "interpreting-the-feature"], [42, "interpreting-the-feature"], [43, "interpreting-the-feature"], [44, "interpreting-the-feature"], [45, "interpreting-the-feature"], [46, "interpreting-the-feature"], [47, "interpreting-the-feature"], [48, "interpreting-the-feature"], [49, "interpreting-the-feature"], [50, "interpreting-the-feature"], [51, "interpreting-the-feature"], [52, "interpreting-the-feature"], [53, "interpreting-the-feature"], [54, "interpreting-the-feature"], [55, "interpreting-the-feature"], [56, "interpreting-the-feature"], [57, "interpreting-the-feature"], [58, "interpreting-the-feature"], [59, "interpreting-the-feature"], [60, "interpreting-the-feature"]], 
"Introduction": [[62, "introduction"]], "Key Assumptions and Parameters": [[0, "key-assumptions-and-parameters"]], "Linguistic Inquiry and Word Count (LIWC) and Other Lexicons": [[42, "linguistic-inquiry-and-word-count-liwc-and-other-lexicons"]], "Message Length": [[43, "message-length"]], "Message Quantity": [[44, "message-quantity"]], "Mimicry (BERT)": [[45, "mimicry-bert"]], "Motivation": [[62, "motivation"]], "Moving Mimicry": [[46, "moving-mimicry"]], "Named Entity Recognition": [[1, "named-entity-recognition"], [47, "named-entity-recognition"]], "Named Entity Training Examples": [[47, "id2"]], "New in v.1.0.5: \u201cBring Your Own LIWC\u201d Custom Lexicon": [[42, "new-in-v-1-0-5-bring-your-own-liwc-custom-lexicon"]], "Online Discussion Tags": [[48, "online-discussion-tags"]], "Other Utilities": [[69, "other-utilities"]], "Ouput File": [[34, "id3"]], "Our Team": [[62, "our-team"]], "Output File": [[30, "id2"], [35, "id2"], [45, "id2"], [46, "id2"], [47, "id3"], [51, "id1"]], "Output File Naming Details": [[1, "output-file-naming-details"]], "Package Assumptions": [[0, "package-assumptions"]], "Politeness Strategies": [[50, "politeness-strategies"]], "Politeness/Receptiveness Markers": [[49, "politeness-receptiveness-markers"]], "Positivity Z-Score": [[52, "positivity-z-score"]], "Proportion of First Person Pronouns": [[53, "proportion-of-first-person-pronouns"]], "Question (Naive)": [[54, "question-naive"]], "Reducing Redundant Features": [[1, "reducing-redundant-features"]], "Regenerating Vector Cache": [[1, "regenerating-vector-cache"]], "Related Features": [[29, "related-features"], [30, "related-features"], [31, "related-features"], [32, "related-features"], [33, "related-features"], [34, "related-features"], [35, "related-features"], [36, "related-features"], [37, "related-features"], [38, "related-features"], [40, "related-features"], [41, "related-features"], [42, "related-features"], [43, "related-features"], [44, "related-features"], [45, 
"related-features"], [46, "related-features"], [47, "related-features"], [48, "related-features"], [49, "related-features"], [50, "related-features"], [51, "related-features"], [52, "related-features"], [53, "related-features"], [54, "related-features"], [55, "related-features"], [56, "related-features"], [57, "related-features"], [58, "related-features"], [59, "related-features"], [60, "related-features"]], "Sentiment (RoBERTa)": [[51, "sentiment-roberta"]], "Speaker Turn Counts": [[59, "id2"]], "Speaker- (User) Level Features": [[11, "speaker-user-level-features"]], "Table of Contents": [[61, "table-of-contents"]], "Team Burstiness": [[55, "team-burstiness"]], "Textblob Polarity": [[56, "textblob-polarity"]], "Textblob Subjectivity": [[57, "textblob-subjectivity"]], "The Basics (Get Started Here!)": [[0, "the-basics-get-started-here"]], "The FeatureBuilder": [[62, "the-featurebuilder"]], "The Team Communication Toolkit": [[61, "the-team-communication-toolkit"]], "Time Difference": [[58, "time-difference"]], "Troubleshooting": [[1, "troubleshooting"], [61, "troubleshooting"]], "Turn Taking Index": [[59, "turn-taking-index"]], "Turns": [[1, "turns"]], "Using the Package": [[61, "using-the-package"]], "Utilities": [[69, "utilities"]], "Utterance- (Chat) Level Features": [[11, "utterance-chat-level-features"], [39, "utterance-chat-level-features"]], "Vector Directory": [[1, "vector-directory"]], "Walkthrough: Running the FeatureBuilder on Your Data": [[1, "walkthrough-running-the-featurebuilder-on-your-data"]], "Word Type-Token Ratio": [[60, "word-type-token-ratio"]], "Worked Example": [[1, "worked-example"]], "assign_chunk_nums module": [[63, "module-utils.assign_chunk_nums"]], "basic_features module": [[3, "module-features.basic_features"]], "burstiness module": [[4, "module-features.burstiness"]], "calculate_chat_level_features module": [[64, "module-utils.calculate_chat_level_features"]], "calculate_conversation_level_features module": [[65, 
"module-utils.calculate_conversation_level_features"]], "calculate_user_level_features module": [[66, "module-utils.calculate_user_level_features"]], "certainty module": [[5, "module-features.certainty"]], "check_embeddings module": [[67, "module-utils.check_embeddings"]], "discursive_diversity module": [[6, "module-features.discursive_diversity"]], "feature_builder module": [[2, "module-feature_builder"]], "fflow module": [[7, "module-features.fflow"]], "get_all_DD_features module": [[8, "module-features.get_all_DD_features"]], "get_user_network module": [[9, "module-features.get_user_network"]], "gini_coefficient module": [[68, "module-utils.gini_coefficient"]], "hedge module": [[10, "module-features.hedge"]], "info_exchange_zscore module": [[12, "module-features.info_exchange_zscore"]], "information_diversity module": [[13, "module-features.information_diversity"]], "lexical_features_v2 module": [[14, "module-features.lexical_features_v2"]], "named_entity_recognition_features module": [[15, "module-features.named_entity_recognition_features"]], "other_lexical_features module": [[16, "module-features.other_lexical_features"]], "politeness_features module": [[17, "module-features.politeness_features"]], "politeness_v2 module": [[18, "module-features.politeness_v2"]], "politeness_v2_helper module": [[19, "module-features.politeness_v2_helper"]], "preload_word_lists module": [[70, "module-utils.preload_word_lists"]], "preprocess module": [[71, "module-utils.preprocess"]], "question_num module": [[20, "module-features.question_num"]], "readability module": [[21, "module-features.readability"]], "reddit_tags module": [[22, "module-features.reddit_tags"]], "summarize_features module": [[72, "module-utils.summarize_features"]], "temporal_features module": [[23, "module-features.temporal_features"]], "textblob_sentiment_analysis module": [[24, "module-features.textblob_sentiment_analysis"]], "turn_taking_features module": [[25, "module-features.turn_taking_features"]], 
"variance_in_DD module": [[26, "module-features.variance_in_DD"]], "within_person_discursive_range module": [[27, "module-features.within_person_discursive_range"]], "word_mimicry module": [[28, "module-features.word_mimicry"]], "z-scores:": [[41, "z-scores"]], "zscore_chats_and_conversation module": [[73, "module-utils.zscore_chats_and_conversation"]], "\u201cDriver\u201d Classes: Utterance-, Conversation-, and Speaker-Level Features": [[69, "driver-classes-utterance-conversation-and-speaker-level-features"]]}, "docnames": ["basics", "examples", "feature_builder", "features/basic_features", "features/burstiness", "features/certainty", "features/discursive_diversity", "features/fflow", "features/get_all_DD_features", "features/get_user_network", "features/hedge", "features/index", "features/info_exchange_zscore", "features/information_diversity", "features/lexical_features_v2", "features/named_entity_recognition_features", "features/other_lexical_features", "features/politeness_features", "features/politeness_v2", "features/politeness_v2_helper", "features/question_num", "features/readability", "features/reddit_tags", "features/temporal_features", "features/textblob_sentiment_analysis", "features/turn_taking_features", "features/variance_in_DD", "features/within_person_discursive_range", "features/word_mimicry", "features_conceptual/TEMPLATE", "features_conceptual/certainty", "features_conceptual/content_word_accommodation", "features_conceptual/conversational_repair", "features_conceptual/dale_chall_score", "features_conceptual/discursive_diversity", "features_conceptual/forward_flow", "features_conceptual/function_word_accommodation", "features_conceptual/gini_coefficient", "features_conceptual/hedge", "features_conceptual/index", "features_conceptual/information_diversity", "features_conceptual/information_exchange", "features_conceptual/liwc", "features_conceptual/message_length", "features_conceptual/message_quantity", "features_conceptual/mimicry_bert", 
"features_conceptual/moving_mimicry", "features_conceptual/named_entity_recognition", "features_conceptual/online_discussions_tags", "features_conceptual/politeness_receptiveness_markers", "features_conceptual/politeness_strategies", "features_conceptual/positivity_bert", "features_conceptual/positivity_z_score", "features_conceptual/proportion_of_first_person_pronouns", "features_conceptual/questions", "features_conceptual/team_burstiness", "features_conceptual/textblob_polarity", "features_conceptual/textblob_subjectivity", "features_conceptual/time_difference", "features_conceptual/turn_taking_index", "features_conceptual/word_ttr", "index", "intro", "utils/assign_chunk_nums", "utils/calculate_chat_level_features", "utils/calculate_conversation_level_features", "utils/calculate_user_level_features", "utils/check_embeddings", "utils/gini_coefficient", "utils/index", "utils/preload_word_lists", "utils/preprocess", "utils/summarize_features", "utils/zscore_chats_and_conversation"], "envversion": {"sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["basics.rst", "examples.rst", "feature_builder.rst", "features/basic_features.rst", "features/burstiness.rst", "features/certainty.rst", "features/discursive_diversity.rst", "features/fflow.rst", "features/get_all_DD_features.rst", "features/get_user_network.rst", "features/hedge.rst", "features/index.rst", "features/info_exchange_zscore.rst", "features/information_diversity.rst", "features/lexical_features_v2.rst", "features/named_entity_recognition_features.rst", "features/other_lexical_features.rst", "features/politeness_features.rst", "features/politeness_v2.rst", "features/politeness_v2_helper.rst", "features/question_num.rst", "features/readability.rst", 
"features/reddit_tags.rst", "features/temporal_features.rst", "features/textblob_sentiment_analysis.rst", "features/turn_taking_features.rst", "features/variance_in_DD.rst", "features/within_person_discursive_range.rst", "features/word_mimicry.rst", "features_conceptual/TEMPLATE.rst", "features_conceptual/certainty.rst", "features_conceptual/content_word_accommodation.rst", "features_conceptual/conversational_repair.rst", "features_conceptual/dale_chall_score.rst", "features_conceptual/discursive_diversity.rst", "features_conceptual/forward_flow.rst", "features_conceptual/function_word_accommodation.rst", "features_conceptual/gini_coefficient.rst", "features_conceptual/hedge.rst", "features_conceptual/index.rst", "features_conceptual/information_diversity.rst", "features_conceptual/information_exchange.rst", "features_conceptual/liwc.rst", "features_conceptual/message_length.rst", "features_conceptual/message_quantity.rst", "features_conceptual/mimicry_bert.rst", "features_conceptual/moving_mimicry.rst", "features_conceptual/named_entity_recognition.rst", "features_conceptual/online_discussions_tags.rst", "features_conceptual/politeness_receptiveness_markers.rst", "features_conceptual/politeness_strategies.rst", "features_conceptual/positivity_bert.rst", "features_conceptual/positivity_z_score.rst", "features_conceptual/proportion_of_first_person_pronouns.rst", "features_conceptual/questions.rst", "features_conceptual/team_burstiness.rst", "features_conceptual/textblob_polarity.rst", "features_conceptual/textblob_subjectivity.rst", "features_conceptual/time_difference.rst", "features_conceptual/turn_taking_index.rst", "features_conceptual/word_ttr.rst", "index.rst", "intro.rst", "utils/assign_chunk_nums.rst", "utils/calculate_chat_level_features.rst", "utils/calculate_conversation_level_features.rst", "utils/calculate_user_level_features.rst", "utils/check_embeddings.rst", "utils/gini_coefficient.rst", "utils/index.rst", "utils/preload_word_lists.rst", 
"utils/preprocess.rst", "utils/summarize_features.rst", "utils/zscore_chats_and_conversation.rst"], "indexentries": {"adverb_limiter() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.adverb_limiter", false]], "assign_chunk_nums() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.assign_chunk_nums", false]], "bare_command() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.bare_command", false]], "built_spacy_ner() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.built_spacy_ner", false]], "burstiness() (in module features.burstiness)": [[4, "features.burstiness.burstiness", false]], "calculate_chat_level_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_chat_level_features", false]], "calculate_conversation_level_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_conversation_level_features", false]], "calculate_hedge_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_hedge_features", false]], "calculate_id_score() (in module features.information_diversity)": [[13, "features.information_diversity.calculate_ID_score", false]], "calculate_info_diversity() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_info_diversity", false]], "calculate_named_entities() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.calculate_named_entities", false]], 
"calculate_num_question_naive() (in module features.question_num)": [[20, "features.question_num.calculate_num_question_naive", false]], "calculate_politeness_sentiment() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_politeness_sentiment", false]], "calculate_politeness_v2() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_politeness_v2", false]], "calculate_team_burstiness() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.calculate_team_burstiness", false]], "calculate_textblob_sentiment() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_textblob_sentiment", false]], "calculate_user_level_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.calculate_user_level_features", false]], "calculate_vector_word_mimicry() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_vector_word_mimicry", false]], "calculate_word_mimicry() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.calculate_word_mimicry", false]], "chat_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.chat_level_features", false]], "chatlevelfeaturescalculator (class in utils.calculate_chat_level_features)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator", false]], "check_embeddings() (in module 
utils.check_embeddings)": [[67, "utils.check_embeddings.check_embeddings", false]], "classify_ntri() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.classify_NTRI", false]], "classify_text_dalechall() (in module features.readability)": [[21, "features.readability.classify_text_dalechall", false]], "clean_text() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.clean_text", false]], "commit_data() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.commit_data", false]], "compress() (in module utils.preprocess)": [[71, "utils.preprocess.compress", false]], "compute_frequency() (in module features.word_mimicry)": [[28, "features.word_mimicry.compute_frequency", false]], "compute_frequency_per_conv() (in module features.word_mimicry)": [[28, "features.word_mimicry.compute_frequency_per_conv", false]], "computetf() (in module features.word_mimicry)": [[28, "features.word_mimicry.computeTF", false]], "concat_bert_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.concat_bert_features", false]], "conjection_seperator() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.conjection_seperator", false]], "content_mimicry_score() (in module features.word_mimicry)": [[28, "features.word_mimicry.Content_mimicry_score", false]], "content_mimicry_score_per_conv() (in module features.word_mimicry)": [[28, "features.word_mimicry.Content_mimicry_score_per_conv", false]], "conv_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.conv_level_features", false]], "conv_to_float_arr() (in module features.get_all_dd_features)": [[8, "features.get_all_DD_features.conv_to_float_arr", false]], "conversationlevelfeaturescalculator (class in utils.calculate_conversation_level_features)": [[65, 
"utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator", false]], "count_all_caps() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_all_caps", false]], "count_bullet_points() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_bullet_points", false]], "count_characters() (in module features.basic_features)": [[3, "features.basic_features.count_characters", false]], "count_difficult_words() (in module features.readability)": [[21, "features.readability.count_difficult_words", false]], "count_ellipses() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_ellipses", false]], "count_emojis() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_emojis", false]], "count_emphasis() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_emphasis", false]], "count_line_breaks() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_line_breaks", false]], "count_links() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_links", false]], "count_matches() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.count_matches", false]], "count_messages() (in module features.basic_features)": [[3, "features.basic_features.count_messages", false]], "count_numbering() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_numbering", false]], "count_parentheses() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_parentheses", false]], "count_quotes() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_quotes", false]], "count_responding_to_someone() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_responding_to_someone", false]], "count_spacy_matches() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.count_spacy_matches", false]], "count_syllables() (in module features.readability)": [[21, 
"features.readability.count_syllables", false]], "count_turn_taking_index() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.count_turn_taking_index", false]], "count_turns() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.count_turns", false]], "count_user_references() (in module features.reddit_tags)": [[22, "features.reddit_tags.count_user_references", false]], "count_words() (in module features.basic_features)": [[3, "features.basic_features.count_words", false]], "create_chunks() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.create_chunks", false]], "create_chunks_messages() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.create_chunks_messages", false]], "create_cumulative_rows() (in module utils.preprocess)": [[71, "utils.preprocess.create_cumulative_rows", false]], "dale_chall_helper() (in module features.readability)": [[21, "features.readability.dale_chall_helper", false]], "feat_counts() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.feat_counts", false]], "feature_builder": [[2, "module-feature_builder", false]], "featurebuilder (class in feature_builder)": [[2, "feature_builder.FeatureBuilder", false]], "features.basic_features": [[3, "module-features.basic_features", false]], "features.burstiness": [[4, "module-features.burstiness", false]], "features.certainty": [[5, "module-features.certainty", false]], "features.discursive_diversity": [[6, "module-features.discursive_diversity", false]], "features.fflow": [[7, "module-features.fflow", false]], "features.get_all_dd_features": [[8, "module-features.get_all_DD_features", false]], "features.get_user_network": [[9, "module-features.get_user_network", false]], "features.hedge": [[10, "module-features.hedge", false]], "features.info_exchange_zscore": [[12, "module-features.info_exchange_zscore", false]], "features.information_diversity": [[13, 
"module-features.information_diversity", false]], "features.lexical_features_v2": [[14, "module-features.lexical_features_v2", false]], "features.named_entity_recognition_features": [[15, "module-features.named_entity_recognition_features", false]], "features.other_lexical_features": [[16, "module-features.other_lexical_features", false]], "features.politeness_features": [[17, "module-features.politeness_features", false]], "features.politeness_v2": [[18, "module-features.politeness_v2", false]], "features.politeness_v2_helper": [[19, "module-features.politeness_v2_helper", false]], "features.question_num": [[20, "module-features.question_num", false]], "features.readability": [[21, "module-features.readability", false]], "features.reddit_tags": [[22, "module-features.reddit_tags", false]], "features.temporal_features": [[23, "module-features.temporal_features", false]], "features.textblob_sentiment_analysis": [[24, "module-features.textblob_sentiment_analysis", false]], "features.turn_taking_features": [[25, "module-features.turn_taking_features", false]], "features.variance_in_dd": [[26, "module-features.variance_in_DD", false]], "features.within_person_discursive_range": [[27, "module-features.within_person_discursive_range", false]], "features.word_mimicry": [[28, "module-features.word_mimicry", false]], "featurize() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.featurize", false]], "fix_abbreviations() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.fix_abbreviations", false]], "function_mimicry_score() (in module features.word_mimicry)": [[28, "features.word_mimicry.function_mimicry_score", false]], "generate_bert() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_bert", false]], "generate_certainty_pkl() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_certainty_pkl", false]], "generate_lexicon_pkl() (in module utils.check_embeddings)": [[67, 
"utils.check_embeddings.generate_lexicon_pkl", false]], "generate_summary_stats() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.generate_summary_stats", false]], "generate_vect() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.generate_vect", false]], "get_centroids() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_centroids", false]], "get_certainty() (in module features.certainty)": [[5, "features.certainty.get_certainty", false]], "get_certainty_score() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_certainty_score", false]], "get_content_words_in_message() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_content_words_in_message", false]], "get_conversation_level_aggregates() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_conversation_level_aggregates", false]], "get_cosine_similarity() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_cosine_similarity", false]], "get_dale_chall_easy_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_dale_chall_easy_words", false]], "get_dale_chall_score_and_classfication() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_dale_chall_score_and_classfication", false]], "get_dd() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_DD", false]], "get_dd_features() (in module features.get_all_dd_features)": [[8, "features.get_all_DD_features.get_DD_features", false]], "get_dep_pairs() (in module features.politeness_v2_helper)": 
[[19, "features.politeness_v2_helper.get_dep_pairs", false]], "get_dep_pairs_noneg() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.get_dep_pairs_noneg", false]], "get_discursive_diversity_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_discursive_diversity_features", false]], "get_first_person_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_first_person_words", false]], "get_forward_flow() (in module features.fflow)": [[7, "features.fflow.get_forward_flow", false]], "get_forward_flow() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_forward_flow", false]], "get_function_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_function_words", false]], "get_function_words_in_message() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_function_words_in_message", false]], "get_gini() (in module utils.gini_coefficient)": [[68, "utils.gini_coefficient.get_gini", false]], "get_gini_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_gini_features", false]], "get_info_diversity() (in module features.information_diversity)": [[13, "features.information_diversity.get_info_diversity", false]], "get_info_exchange_wordcount() (in module features.info_exchange_zscore)": [[12, "features.info_exchange_zscore.get_info_exchange_wordcount", false]], "get_liwc_count() (in module features.lexical_features_v2)": [[14, "features.lexical_features_v2.get_liwc_count", false]], "get_max() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_max", false]], "get_mean() (in 
module utils.summarize_features)": [[72, "utils.summarize_features.get_mean", false]], "get_median() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_median", false]], "get_mimicry_bert() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_mimicry_bert", false]], "get_min() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_min", false]], "get_moving_mimicry() (in module features.word_mimicry)": [[28, "features.word_mimicry.get_moving_mimicry", false]], "get_named_entity() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_named_entity", false]], "get_nan_vector() (in module features.within_person_discursive_range)": [[27, "features.within_person_discursive_range.get_nan_vector", false]], "get_nan_vector() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.get_nan_vector", false]], "get_polarity_score() (in module features.textblob_sentiment_analysis)": [[24, "features.textblob_sentiment_analysis.get_polarity_score", false]], "get_politeness_strategies() (in module features.politeness_features)": [[17, "features.politeness_features.get_politeness_strategies", false]], "get_politeness_v2() (in module features.politeness_v2)": [[18, "features.politeness_v2.get_politeness_v2", false]], "get_proportion_first_pronouns() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.get_proportion_first_pronouns", false]], "get_question_words() (in module utils.preload_word_lists)": [[70, "utils.preload_word_lists.get_question_words", false]], "get_reddit_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_reddit_features", false]], "get_sentiment() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.get_sentiment", false]], "get_stdev() (in module 
utils.summarize_features)": [[72, "utils.summarize_features.get_stdev", false]], "get_subjectivity_score() (in module features.textblob_sentiment_analysis)": [[24, "features.textblob_sentiment_analysis.get_subjectivity_score", false]], "get_sum() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_sum", false]], "get_team_burstiness() (in module features.burstiness)": [[4, "features.burstiness.get_team_burstiness", false]], "get_temporal_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.get_temporal_features", false]], "get_time_diff() (in module features.temporal_features)": [[23, "features.temporal_features.get_time_diff", false]], "get_time_diff_startend() (in module features.temporal_features)": [[23, "features.temporal_features.get_time_diff_startend", false]], "get_turn() (in module features.turn_taking_features)": [[25, "features.turn_taking_features.get_turn", false]], "get_turn_id() (in module utils.preprocess)": [[71, "utils.preprocess.get_turn_id", false]], "get_turn_taking_features() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_turn_taking_features", false]], "get_unique_pairwise_combos() (in module features.discursive_diversity)": [[6, "features.discursive_diversity.get_unique_pairwise_combos", false]], "get_user_level_aggregates() (utils.calculate_conversation_level_features.conversationlevelfeaturescalculator method)": [[65, "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator.get_user_level_aggregates", false]], "get_user_level_summary_statistics_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_level_summary_statistics_features", false]], 
"get_user_level_summed_features() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_level_summed_features", false]], "get_user_max_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_max_dataframe", false]], "get_user_mean_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_mean_dataframe", false]], "get_user_median_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_median_dataframe", false]], "get_user_min_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_min_dataframe", false]], "get_user_network() (in module features.get_user_network)": [[9, "features.get_user_network.get_user_network", false]], "get_user_network() (utils.calculate_user_level_features.userlevelfeaturescalculator method)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator.get_user_network", false]], "get_user_stdev_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_stdev_dataframe", false]], "get_user_sum_dataframe() (in module utils.summarize_features)": [[72, "utils.summarize_features.get_user_sum_dataframe", false]], "get_variance_in_dd() (in module features.variance_in_dd)": [[26, "features.variance_in_DD.get_variance_in_DD", false]], "get_within_person_disc_range() (in module features.within_person_discursive_range)": [[27, "features.within_person_discursive_range.get_within_person_disc_range", false]], "get_word_ttr() (in module features.other_lexical_features)": [[16, "features.other_lexical_features.get_word_TTR", false]], "get_zscore_across_all_chats() (in module utils.zscore_chats_and_conversation)": [[73, "utils.zscore_chats_and_conversation.get_zscore_across_all_chats", false]], "get_zscore_across_all_conversations() (in module 
utils.zscore_chats_and_conversation)": [[73, "utils.zscore_chats_and_conversation.get_zscore_across_all_conversations", false]], "gini_coefficient() (in module utils.gini_coefficient)": [[68, "utils.gini_coefficient.gini_coefficient", false]], "info_diversity() (in module features.information_diversity)": [[13, "features.information_diversity.info_diversity", false]], "info_exchange() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.info_exchange", false]], "is_hedged_sentence_1() (in module features.hedge)": [[10, "features.hedge.is_hedged_sentence_1", false]], "is_valid_term() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.is_valid_term", false]], "keep_one_column_per_group() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.keep_one_column_per_group", false]], "lexical_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.lexical_features", false]], "liwc_features() (in module features.lexical_features_v2)": [[14, "features.lexical_features_v2.liwc_features", false]], "load_custem_liwc_dict() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.load_custem_liwc_dict", false]], "load_liwc_dict() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.load_liwc_dict", false]], "load_saved_data() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_saved_data", false]], "load_to_dict() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_to_dict", false]], "load_to_lists() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.load_to_lists", false]], "log_column_groups() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.log_column_groups", false]], 
"merge_conv_data_with_original() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.merge_conv_data_with_original", false]], "mimic_words() (in module features.word_mimicry)": [[28, "features.word_mimicry.mimic_words", false]], "module": [[2, "module-feature_builder", false], [3, "module-features.basic_features", false], [4, "module-features.burstiness", false], [5, "module-features.certainty", false], [6, "module-features.discursive_diversity", false], [7, "module-features.fflow", false], [8, "module-features.get_all_DD_features", false], [9, "module-features.get_user_network", false], [10, "module-features.hedge", false], [12, "module-features.info_exchange_zscore", false], [13, "module-features.information_diversity", false], [14, "module-features.lexical_features_v2", false], [15, "module-features.named_entity_recognition_features", false], [16, "module-features.other_lexical_features", false], [17, "module-features.politeness_features", false], [18, "module-features.politeness_v2", false], [19, "module-features.politeness_v2_helper", false], [20, "module-features.question_num", false], [21, "module-features.readability", false], [22, "module-features.reddit_tags", false], [23, "module-features.temporal_features", false], [24, "module-features.textblob_sentiment_analysis", false], [25, "module-features.turn_taking_features", false], [26, "module-features.variance_in_DD", false], [27, "module-features.within_person_discursive_range", false], [28, "module-features.word_mimicry", false], [63, "module-utils.assign_chunk_nums", false], [64, "module-utils.calculate_chat_level_features", false], [65, "module-utils.calculate_conversation_level_features", false], [66, "module-utils.calculate_user_level_features", false], [67, "module-utils.check_embeddings", false], [68, "module-utils.gini_coefficient", false], [70, "module-utils.preload_word_lists", false], [71, "module-utils.preprocess", false], [72, "module-utils.summarize_features", 
false], [73, "module-utils.zscore_chats_and_conversation", false]], "named_entities() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.named_entities", false]], "num_named_entity() (in module features.named_entity_recognition_features)": [[15, "features.named_entity_recognition_features.num_named_entity", false]], "other_lexical_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.other_lexical_features", false]], "phrase_split() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.phrase_split", false]], "positivity_zscore() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.positivity_zscore", false]], "prep_simple() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.prep_simple", false]], "prep_whole() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.prep_whole", false]], "preprocess_chat_data() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.preprocess_chat_data", false]], "preprocess_conversation_columns() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_conversation_columns", false]], "preprocess_naive_turns() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_naive_turns", false]], "preprocess_text() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_text", false]], "preprocess_text_lowercase_but_retain_punctuation() (in module utils.preprocess)": [[71, "utils.preprocess.preprocess_text_lowercase_but_retain_punctuation", false]], "preprocessing() (in module features.information_diversity)": [[13, "features.information_diversity.preprocessing", false]], "punctuation_seperator() (in module features.politeness_v2_helper)": [[19, 
"features.politeness_v2_helper.punctuation_seperator", false]], "question() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.Question", false]], "read_in_lexicons() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.read_in_lexicons", false]], "reduce_chunks() (in module utils.assign_chunk_nums)": [[63, "utils.assign_chunk_nums.reduce_chunks", false]], "remove_active_user() (in module features.get_user_network)": [[9, "features.get_user_network.remove_active_user", false]], "remove_unhashable_cols() (in module utils.preprocess)": [[71, "utils.preprocess.remove_unhashable_cols", false]], "save_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.save_features", false]], "sentence_pad() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentence_pad", false]], "sentence_split() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentence_split", false]], "sentenciser() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.sentenciser", false]], "set_self_conv_data() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.set_self_conv_data", false]], "setup_logger() (in module utils.preprocess)": [[71, "utils.preprocess.setup_logger", false]], "sort_words() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.sort_words", false]], "str_to_vec() (in module utils.check_embeddings)": [[67, "utils.check_embeddings.str_to_vec", false]], "text_based_features() (utils.calculate_chat_level_features.chatlevelfeaturescalculator method)": [[64, "utils.calculate_chat_level_features.ChatLevelFeaturesCalculator.text_based_features", false]], "token_count() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.token_count", false]], "train_spacy_ner() (in module features.named_entity_recognition_features)": [[15, 
"features.named_entity_recognition_features.train_spacy_ner", false]], "user_level_features() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.user_level_features", false]], "userlevelfeaturescalculator (class in utils.calculate_user_level_features)": [[66, "utils.calculate_user_level_features.UserLevelFeaturesCalculator", false]], "utils.assign_chunk_nums": [[63, "module-utils.assign_chunk_nums", false]], "utils.calculate_chat_level_features": [[64, "module-utils.calculate_chat_level_features", false]], "utils.calculate_conversation_level_features": [[65, "module-utils.calculate_conversation_level_features", false]], "utils.calculate_user_level_features": [[66, "module-utils.calculate_user_level_features", false]], "utils.check_embeddings": [[67, "module-utils.check_embeddings", false]], "utils.gini_coefficient": [[68, "module-utils.gini_coefficient", false]], "utils.preload_word_lists": [[70, "module-utils.preload_word_lists", false]], "utils.preprocess": [[71, "module-utils.preprocess", false]], "utils.summarize_features": [[72, "module-utils.summarize_features", false]], "utils.zscore_chats_and_conversation": [[73, "module-utils.zscore_chats_and_conversation", false]], "verify_timestamp_format() (feature_builder.featurebuilder method)": [[2, "feature_builder.FeatureBuilder.verify_timestamp_format", false]], "word_start() (in module features.politeness_v2_helper)": [[19, "features.politeness_v2_helper.word_start", false]]}, "objects": {"": [[2, 0, 0, "-", "feature_builder"]], "feature_builder": [[2, 1, 1, "", "FeatureBuilder"]], "feature_builder.FeatureBuilder": [[2, 2, 1, "", "chat_level_features"], [2, 2, 1, "", "conv_level_features"], [2, 2, 1, "", "featurize"], [2, 2, 1, "", "generate_summary_stats"], [2, 2, 1, "", "keep_one_column_per_group"], [2, 2, 1, "", "load_custem_liwc_dict"], [2, 2, 1, "", "log_column_groups"], [2, 2, 1, "", "merge_conv_data_with_original"], [2, 2, 1, "", "preprocess_chat_data"], [2, 2, 1, "", 
"save_features"], [2, 2, 1, "", "set_self_conv_data"], [2, 2, 1, "", "user_level_features"], [2, 2, 1, "", "verify_timestamp_format"]], "features": [[3, 0, 0, "-", "basic_features"], [4, 0, 0, "-", "burstiness"], [5, 0, 0, "-", "certainty"], [6, 0, 0, "-", "discursive_diversity"], [7, 0, 0, "-", "fflow"], [8, 0, 0, "-", "get_all_DD_features"], [9, 0, 0, "-", "get_user_network"], [10, 0, 0, "-", "hedge"], [12, 0, 0, "-", "info_exchange_zscore"], [13, 0, 0, "-", "information_diversity"], [14, 0, 0, "-", "lexical_features_v2"], [15, 0, 0, "-", "named_entity_recognition_features"], [16, 0, 0, "-", "other_lexical_features"], [17, 0, 0, "-", "politeness_features"], [18, 0, 0, "-", "politeness_v2"], [19, 0, 0, "-", "politeness_v2_helper"], [20, 0, 0, "-", "question_num"], [21, 0, 0, "-", "readability"], [22, 0, 0, "-", "reddit_tags"], [23, 0, 0, "-", "temporal_features"], [24, 0, 0, "-", "textblob_sentiment_analysis"], [25, 0, 0, "-", "turn_taking_features"], [26, 0, 0, "-", "variance_in_DD"], [27, 0, 0, "-", "within_person_discursive_range"], [28, 0, 0, "-", "word_mimicry"]], "features.basic_features": [[3, 3, 1, "", "count_characters"], [3, 3, 1, "", "count_messages"], [3, 3, 1, "", "count_words"]], "features.burstiness": [[4, 3, 1, "", "burstiness"], [4, 3, 1, "", "get_team_burstiness"]], "features.certainty": [[5, 3, 1, "", "get_certainty"]], "features.discursive_diversity": [[6, 3, 1, "", "get_DD"], [6, 3, 1, "", "get_cosine_similarity"], [6, 3, 1, "", "get_unique_pairwise_combos"]], "features.fflow": [[7, 3, 1, "", "get_forward_flow"]], "features.get_all_DD_features": [[8, 3, 1, "", "conv_to_float_arr"], [8, 3, 1, "", "get_DD_features"]], "features.get_user_network": [[9, 3, 1, "", "get_user_network"], [9, 3, 1, "", "remove_active_user"]], "features.hedge": [[10, 3, 1, "", "is_hedged_sentence_1"]], "features.info_exchange_zscore": [[12, 3, 1, "", "get_info_exchange_wordcount"]], "features.information_diversity": [[13, 3, 1, "", "calculate_ID_score"], [13, 3, 1, "", 
"get_info_diversity"], [13, 3, 1, "", "info_diversity"], [13, 3, 1, "", "preprocessing"]], "features.lexical_features_v2": [[14, 3, 1, "", "get_liwc_count"], [14, 3, 1, "", "liwc_features"]], "features.named_entity_recognition_features": [[15, 3, 1, "", "built_spacy_ner"], [15, 3, 1, "", "calculate_named_entities"], [15, 3, 1, "", "named_entities"], [15, 3, 1, "", "num_named_entity"], [15, 3, 1, "", "train_spacy_ner"]], "features.other_lexical_features": [[16, 3, 1, "", "classify_NTRI"], [16, 3, 1, "", "get_proportion_first_pronouns"], [16, 3, 1, "", "get_word_TTR"]], "features.politeness_features": [[17, 3, 1, "", "get_politeness_strategies"]], "features.politeness_v2": [[18, 3, 1, "", "get_politeness_v2"]], "features.politeness_v2_helper": [[19, 3, 1, "", "Question"], [19, 3, 1, "", "adverb_limiter"], [19, 3, 1, "", "bare_command"], [19, 3, 1, "", "clean_text"], [19, 3, 1, "", "commit_data"], [19, 3, 1, "", "conjection_seperator"], [19, 3, 1, "", "count_matches"], [19, 3, 1, "", "count_spacy_matches"], [19, 3, 1, "", "feat_counts"], [19, 3, 1, "", "get_dep_pairs"], [19, 3, 1, "", "get_dep_pairs_noneg"], [19, 3, 1, "", "load_saved_data"], [19, 3, 1, "", "load_to_dict"], [19, 3, 1, "", "load_to_lists"], [19, 3, 1, "", "phrase_split"], [19, 3, 1, "", "prep_simple"], [19, 3, 1, "", "prep_whole"], [19, 3, 1, "", "punctuation_seperator"], [19, 3, 1, "", "sentence_pad"], [19, 3, 1, "", "sentence_split"], [19, 3, 1, "", "sentenciser"], [19, 3, 1, "", "token_count"], [19, 3, 1, "", "word_start"]], "features.question_num": [[20, 3, 1, "", "calculate_num_question_naive"]], "features.readability": [[21, 3, 1, "", "classify_text_dalechall"], [21, 3, 1, "", "count_difficult_words"], [21, 3, 1, "", "count_syllables"], [21, 3, 1, "", "dale_chall_helper"]], "features.reddit_tags": [[22, 3, 1, "", "count_all_caps"], [22, 3, 1, "", "count_bullet_points"], [22, 3, 1, "", "count_ellipses"], [22, 3, 1, "", "count_emojis"], [22, 3, 1, "", "count_emphasis"], [22, 3, 1, "", 
"count_line_breaks"], [22, 3, 1, "", "count_links"], [22, 3, 1, "", "count_numbering"], [22, 3, 1, "", "count_parentheses"], [22, 3, 1, "", "count_quotes"], [22, 3, 1, "", "count_responding_to_someone"], [22, 3, 1, "", "count_user_references"]], "features.temporal_features": [[23, 3, 1, "", "get_time_diff"], [23, 3, 1, "", "get_time_diff_startend"]], "features.textblob_sentiment_analysis": [[24, 3, 1, "", "get_polarity_score"], [24, 3, 1, "", "get_subjectivity_score"]], "features.turn_taking_features": [[25, 3, 1, "", "count_turn_taking_index"], [25, 3, 1, "", "count_turns"], [25, 3, 1, "", "get_turn"]], "features.variance_in_DD": [[26, 3, 1, "", "get_variance_in_DD"]], "features.within_person_discursive_range": [[27, 3, 1, "", "get_nan_vector"], [27, 3, 1, "", "get_within_person_disc_range"]], "features.word_mimicry": [[28, 3, 1, "", "Content_mimicry_score"], [28, 3, 1, "", "Content_mimicry_score_per_conv"], [28, 3, 1, "", "computeTF"], [28, 3, 1, "", "compute_frequency"], [28, 3, 1, "", "compute_frequency_per_conv"], [28, 3, 1, "", "function_mimicry_score"], [28, 3, 1, "", "get_content_words_in_message"], [28, 3, 1, "", "get_function_words_in_message"], [28, 3, 1, "", "get_mimicry_bert"], [28, 3, 1, "", "get_moving_mimicry"], [28, 3, 1, "", "mimic_words"]], "utils": [[63, 0, 0, "-", "assign_chunk_nums"], [64, 0, 0, "-", "calculate_chat_level_features"], [65, 0, 0, "-", "calculate_conversation_level_features"], [66, 0, 0, "-", "calculate_user_level_features"], [67, 0, 0, "-", "check_embeddings"], [68, 0, 0, "-", "gini_coefficient"], [70, 0, 0, "-", "preload_word_lists"], [71, 0, 0, "-", "preprocess"], [72, 0, 0, "-", "summarize_features"], [73, 0, 0, "-", "zscore_chats_and_conversation"]], "utils.assign_chunk_nums": [[63, 3, 1, "", "assign_chunk_nums"], [63, 3, 1, "", "create_chunks"], [63, 3, 1, "", "create_chunks_messages"], [63, 3, 1, "", "reduce_chunks"]], "utils.calculate_chat_level_features": [[64, 1, 1, "", "ChatLevelFeaturesCalculator"]], 
"utils.calculate_chat_level_features.ChatLevelFeaturesCalculator": [[64, 2, 1, "", "calculate_chat_level_features"], [64, 2, 1, "", "calculate_hedge_features"], [64, 2, 1, "", "calculate_politeness_sentiment"], [64, 2, 1, "", "calculate_politeness_v2"], [64, 2, 1, "", "calculate_textblob_sentiment"], [64, 2, 1, "", "calculate_vector_word_mimicry"], [64, 2, 1, "", "calculate_word_mimicry"], [64, 2, 1, "", "concat_bert_features"], [64, 2, 1, "", "get_certainty_score"], [64, 2, 1, "", "get_dale_chall_score_and_classfication"], [64, 2, 1, "", "get_forward_flow"], [64, 2, 1, "", "get_named_entity"], [64, 2, 1, "", "get_reddit_features"], [64, 2, 1, "", "get_temporal_features"], [64, 2, 1, "", "info_exchange"], [64, 2, 1, "", "lexical_features"], [64, 2, 1, "", "other_lexical_features"], [64, 2, 1, "", "positivity_zscore"], [64, 2, 1, "", "text_based_features"]], "utils.calculate_conversation_level_features": [[65, 1, 1, "", "ConversationLevelFeaturesCalculator"]], "utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator": [[65, 2, 1, "", "calculate_conversation_level_features"], [65, 2, 1, "", "calculate_info_diversity"], [65, 2, 1, "", "calculate_team_burstiness"], [65, 2, 1, "", "get_conversation_level_aggregates"], [65, 2, 1, "", "get_discursive_diversity_features"], [65, 2, 1, "", "get_gini_features"], [65, 2, 1, "", "get_turn_taking_features"], [65, 2, 1, "", "get_user_level_aggregates"]], "utils.calculate_user_level_features": [[66, 1, 1, "", "UserLevelFeaturesCalculator"]], "utils.calculate_user_level_features.UserLevelFeaturesCalculator": [[66, 2, 1, "", "calculate_user_level_features"], [66, 2, 1, "", "get_centroids"], [66, 2, 1, "", "get_user_level_summary_statistics_features"], [66, 2, 1, "", "get_user_level_summed_features"], [66, 2, 1, "", "get_user_network"]], "utils.check_embeddings": [[67, 3, 1, "", "check_embeddings"], [67, 3, 1, "", "fix_abbreviations"], [67, 3, 1, "", "generate_bert"], [67, 3, 1, "", "generate_certainty_pkl"], 
[67, 3, 1, "", "generate_lexicon_pkl"], [67, 3, 1, "", "generate_vect"], [67, 3, 1, "", "get_nan_vector"], [67, 3, 1, "", "get_sentiment"], [67, 3, 1, "", "is_valid_term"], [67, 3, 1, "", "load_liwc_dict"], [67, 3, 1, "", "read_in_lexicons"], [67, 3, 1, "", "sort_words"], [67, 3, 1, "", "str_to_vec"]], "utils.gini_coefficient": [[68, 3, 1, "", "get_gini"], [68, 3, 1, "", "gini_coefficient"]], "utils.preload_word_lists": [[70, 3, 1, "", "get_dale_chall_easy_words"], [70, 3, 1, "", "get_first_person_words"], [70, 3, 1, "", "get_function_words"], [70, 3, 1, "", "get_question_words"]], "utils.preprocess": [[71, 3, 1, "", "compress"], [71, 3, 1, "", "create_cumulative_rows"], [71, 3, 1, "", "get_turn_id"], [71, 3, 1, "", "preprocess_conversation_columns"], [71, 3, 1, "", "preprocess_naive_turns"], [71, 3, 1, "", "preprocess_text"], [71, 3, 1, "", "preprocess_text_lowercase_but_retain_punctuation"], [71, 3, 1, "", "remove_unhashable_cols"], [71, 3, 1, "", "setup_logger"]], "utils.summarize_features": [[72, 3, 1, "", "get_max"], [72, 3, 1, "", "get_mean"], [72, 3, 1, "", "get_median"], [72, 3, 1, "", "get_min"], [72, 3, 1, "", "get_stdev"], [72, 3, 1, "", "get_sum"], [72, 3, 1, "", "get_user_max_dataframe"], [72, 3, 1, "", "get_user_mean_dataframe"], [72, 3, 1, "", "get_user_median_dataframe"], [72, 3, 1, "", "get_user_min_dataframe"], [72, 3, 1, "", "get_user_stdev_dataframe"], [72, 3, 1, "", "get_user_sum_dataframe"]], "utils.zscore_chats_and_conversation": [[73, 3, 1, "", "get_zscore_across_all_chats"], [73, 3, 1, "", "get_zscore_across_all_conversations"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "terms": {"": [0, 1, 2, 4, 5, 9, 11, 13, 14, 25, 28, 29, 31, 32, 34, 35, 36, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 55, 59, 61, 62, 64, 65, 66], 
"0": [0, 1, 2, 5, 10, 13, 16, 21, 24, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 43, 45, 46, 47, 50, 51, 53, 55, 59, 61, 71], "00222437221134802": [5, 64], "01": 51, "02": 51, "04": 40, "0496": [21, 33], "05": [13, 40, 50, 51], "06": 51, "08": [42, 50], "09": [45, 46, 50], "1": [0, 1, 2, 3, 10, 13, 22, 24, 32, 34, 35, 37, 38, 40, 41, 43, 44, 45, 46, 47, 48, 51, 53, 55, 56, 57, 59, 61, 62, 67], "10": [1, 5, 6, 21, 24, 33, 59, 61, 64, 71], "100": [21, 33, 37, 42, 47, 62], "10th": 33, "1145": [21, 24], "1177": [5, 64], "11th": 33, "12": [35, 45, 46, 50], "1287": 6, "12th": 33, "13": 50, "14": 50, "15": [1, 37, 50], "1579": [21, 33], "17": 50, "1948": 33, "195": 36, "1977": 62, "1d": 67, "1lpngokujsx": 5, "1st": 50, "1st_person": 50, "1st_person_pl": 50, "1st_person_start": 50, "2": [0, 1, 2, 34, 35, 41, 47, 59, 61, 62, 67], "20": [37, 59, 71], "2004": 42, "2007": [0, 5, 42, 67], "2009": 60, "2012": 55, "2013": [12, 16, 31, 32, 36, 37, 38, 41, 43, 49, 50, 52, 54, 70], "2015": [42, 53, 58, 60, 67], "2016": 4, "2017": 13, "2018": [40, 44, 55], "2019": [35, 52], "2020": [18, 21, 24, 33, 49, 50, 56, 57], "2021": [1, 6, 43, 44], "2022": [13, 34], "2023": [1, 5, 30, 59, 61, 64], "2024": [40, 42], "21": 59, "22": [41, 50], "2384068": 4, "24": [1, 61], "25": 47, "27": [42, 50], "28": 50, "29": 50, "2nd": 50, "2nd_person": 50, "2nd_person_start": 50, "3": [0, 1, 2, 21, 34, 41, 42, 51, 59, 61, 67, 71], "30": [50, 71], "3000": 33, "32": [34, 50], "3432929": [21, 24], "35": 51, "36": 50, "38": 50, "39": 49, "39512260": 68, "3n": 59, "4": [0, 1, 5, 13, 21, 30, 33, 41, 42, 56, 61, 62, 71], "40": 71, "4274": 6, "43": 50, "45": 50, "47": 50, "49": 50, "4pit4bqz6": 5, "4th": [21, 33], "5": [1, 5, 21, 30, 33, 37, 41, 59], "50": [47, 71], "52": 50, "53": 50, "57": 50, "58": 50, "5th": 33, "6": [1, 33, 43], "60": 51, "63": 50, "6365": 21, "64": 67, "68": 47, "6th": 33, "7": [30, 33, 48], "70": 50, "78": [35, 50], "7th": 33, "8": [0, 1, 30, 33, 42, 67], "80": [21, 70], "82": 41, "85": 34, 
"86": 35, "87": 50, "89": [45, 46], "8th": 33, "9": [0, 1, 2, 5, 21, 30, 33, 40, 47, 50, 67], "9123": 47, "92": 51, "93chall_readability_formula": [21, 70], "94": 15, "95": 47, "95450": 42, "97": 51, "9855072464": 47, "9992": 47, "99954": 47, "9th": 33, "A": [1, 2, 4, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23, 25, 28, 33, 34, 35, 37, 38, 40, 41, 44, 45, 46, 47, 49, 50, 51, 52, 57, 59, 60, 61, 62, 64, 66, 67, 68, 70, 71, 72, 73], "And": [1, 62], "As": [1, 31, 35, 36, 40, 42, 45, 61], "Be": 1, "But": [1, 50, 62], "By": [0, 1, 11, 42, 50], "For": [0, 1, 31, 34, 37, 41, 42, 43, 47, 49, 54, 56, 59, 62, 65], "If": [0, 1, 2, 5, 21, 29, 30, 35, 42, 45, 47, 50, 55, 61, 62, 63, 65, 66, 67, 71], "In": [1, 21, 30, 31, 34, 35, 36, 37, 39, 41, 42, 45, 46, 47, 50, 55, 59, 61, 62], "It": [1, 2, 31, 32, 33, 36, 37, 41, 44, 45, 46, 50, 64, 65, 66, 67, 71], "NO": 37, "NOT": [1, 61], "No": [19, 50, 53], "Not": 41, "One": [1, 37, 61], "That": [29, 55], "The": [1, 2, 3, 4, 5, 7, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 59, 60, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "Then": [1, 55, 61], "There": [1, 11, 32, 61, 66], "These": [1, 11, 17, 32, 34, 42, 48, 52, 61, 62, 69], "To": [0, 1, 29, 31, 34, 37, 40, 42, 55, 56, 57, 61, 62], "WITH": 21, "Will": 50, "_deviat": 55, "_lexical_wordcount_custom": 42, "_preprocessed_": 0, "abbrevi": 67, "abil": [13, 29], "abl": [31, 36, 61], "abort": 1, "about": [1, 12, 29, 31, 36, 41, 47, 61, 62], "abov": [1, 21, 34, 61], "absolut": [0, 1, 2], "abstract_id": 4, "academ": 42, "accept": [0, 1, 58, 61], "access": [0, 1, 11, 15, 61], "accommod": [28, 32, 39, 45, 46, 64], "accord": [21, 37, 59, 64, 70], "accordingli": 63, "account": [1, 29, 32, 42], "accus": 50, "achiev": [50, 62], "acknowledg": 49, "acm": [21, 24], "acommod": 36, "across": [1, 13, 28, 31, 34, 40, 41, 50, 62, 64, 73], "action": 59, "activ": [1, 9, 44, 55, 71], "actual": [0, 1, 
41, 56], "ad": [61, 62, 71], "adapt": 59, "add": [0, 1, 2, 21, 51, 61], "addit": [2, 32, 34, 42, 63, 69], "addition": [0, 30, 31, 32, 54], "address": 1, "adjac": 71, "adjust": [0, 21, 37, 63], "advanc": [31, 36], "advantag": 4, "adverb": [19, 31, 36], "adverb_limit": [19, 49], "affect": [0, 1, 29, 35, 44], "affirm": 49, "after": [0, 1, 31, 34, 36, 42, 43, 61, 62, 64, 67], "again": [32, 34, 67], "against": [28, 31, 36, 52, 67], "agarw": 62, "aggreg": [0, 2, 3, 37, 44, 61, 62, 65, 66, 72], "agre": 47, "agreement": 49, "ah": [31, 36], "ai": 62, "aim": [39, 62], "airtim": [37, 62], "al": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "algorithm": [56, 57], "align": [35, 51], "all": [0, 1, 2, 6, 11, 12, 13, 15, 19, 22, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 46, 48, 49, 51, 52, 55, 58, 61, 62, 64, 65, 66, 71, 73], "allow": [0, 1, 67], "almaatouq": 59, "alon": 67, "along": 1, "alongsid": 1, "alphabet": 49, "alphanumer": [42, 67, 71], "alreadi": [0, 1, 2, 4, 10, 12, 16, 67], "also": [0, 1, 2, 28, 30, 31, 32, 34, 36, 37, 38, 42, 47, 51, 54, 60, 61, 62, 64, 65, 67, 69], "alsobai": 59, "altern": 59, "although": [1, 23, 31, 36], "alwai": [1, 55], "am": [31, 36, 42, 54, 62], "amaz": [48, 56], "ambient": 32, "american": 33, "ami": [47, 59, 62], "amic": 62, "among": [36, 37, 52, 55, 62], "amongst": [6, 35, 48], "an": [0, 1, 2, 5, 8, 11, 12, 13, 21, 29, 30, 31, 32, 33, 34, 36, 38, 40, 41, 42, 45, 47, 48, 49, 50, 51, 52, 54, 59, 60, 61, 62, 63, 65, 66, 67, 68, 71], "analys": [1, 62], "analysi": [0, 1, 11, 52, 62, 67, 71], "analyt": 62, "analyz": [0, 13, 14, 16, 17, 19, 20, 21, 22, 24, 28, 43, 52, 62, 67, 71], "analyze_first_pct": 1, "angri": 47, "ani": [0, 1, 29, 31, 33, 38, 54, 62, 71], "annot": [17, 50], "anoth": [1, 30, 34, 36, 48], "answer": 29, "anybodi": [31, 36], "anyth": [1, 31, 36, 56], "anywher": [31, 36], "apartment": 42, "api": [2, 47], "api_refer": 24, "apolog": [17, 50], "apologi": 49, "appear": [0, 15, 
28, 31, 37, 38, 42, 64, 67], "append": [1, 17, 42, 64, 65, 66, 67], "appli": [4, 13, 18, 62, 64, 69], "applic": [29, 71], "appreci": 50, "approach": [32, 38, 42, 45, 46, 49, 53, 64], "appropri": [1, 31, 69], "ar": [0, 1, 2, 3, 5, 9, 10, 11, 15, 17, 19, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 71], "arcross": 34, "area": 62, "aren": [31, 36], "around": 2, "arous": 48, "arrai": [6, 8, 67, 68], "articl": [37, 50], "ask": [20, 47, 54], "ask_ag": 49, "aspect": [50, 62], "assign": [1, 31, 36, 38, 45, 46, 52, 59, 61, 63, 71], "assign_chunk_num": 69, "associ": [1, 4, 15, 21, 29, 30, 31, 32, 36, 40, 45, 46, 47, 48, 61], "assum": [0, 1, 2, 10, 12, 16, 23, 31, 41, 60, 61, 64, 67, 71], "assumpt": [1, 41, 61], "asterisk": 22, "attribut": [0, 1, 11, 34, 51, 52, 56, 62], "author": [5, 31, 36, 59], "auto": 2, "automat": [0, 1, 61, 69], "auxiliari": [31, 36], "avail": [0, 1, 61, 62, 63, 64, 67], "averag": [1, 11, 13, 28, 30, 33, 34, 35, 40, 41, 46, 52, 64, 65, 72], "avil": 62, "avoid": 30, "awar": 29, "awesom": 62, "b": [4, 34, 35, 42, 45, 46, 55, 62], "bachelor": 42, "back": 62, "bag": [32, 38, 42, 45, 46, 49, 53, 56, 57], "bare_command": [19, 49], "base": [0, 1, 2, 15, 18, 19, 31, 32, 34, 35, 36, 37, 40, 42, 51, 52, 53, 54, 55, 56, 57, 61, 62, 63, 64, 65, 66, 71], "basic": [10, 11, 12, 16, 61, 62], "basic_featur": 11, "batch": 67, "batch_num": 1, "batch_siz": 67, "bay": [56, 57], "bbevi": [18, 19], "becaus": [1, 12, 21, 31, 36, 40, 42, 56, 61], "becom": [44, 61, 62], "been": [1, 12, 16, 31, 36, 61], "befor": [0, 1, 2, 17, 31, 36, 45, 48], "beforehand": 64, "begin": [34, 42, 54, 58, 61, 62, 63], "behavior": [0, 1, 11, 62, 63], "being": [4, 13, 14, 16, 17, 20, 21, 24, 31, 32, 36, 43, 47, 51, 55, 56, 60], "belong": 1, "below": [1, 11, 21, 33, 36, 45, 48, 51, 61, 62, 67, 69], "ber": 54, "bert": [0, 1, 2, 31, 35, 36, 39, 46, 61, 64, 67], "bert_path": 67, "bert_sentiment_data": 
[1, 61, 64], "best": [2, 29], "better": [31, 61], "between": [4, 6, 13, 21, 23, 24, 28, 30, 31, 34, 35, 36, 37, 40, 45, 46, 55, 58, 59, 62, 64, 65, 67], "betwen": 34, "beyond": 2, "big": 59, "binari": [10, 32, 38], "blame": 47, "blob": [1, 19, 24, 61, 67], "block": [22, 32, 48, 59], "blog": 15, "bodi": 67, "bold": [22, 64], "bool": [2, 63, 65, 66, 67, 71], "boolean": 1, "bootstrap": 62, "both": [0, 1, 2, 42, 52, 54, 55, 59, 62], "bother": 50, "bottom": 59, "bought": 41, "bound": [29, 35, 36, 37, 52, 55], "boundari": [34, 35, 42], "boyd": [0, 42], "break": [22, 48, 64], "brief": 44, "bring": 0, "broader": 52, "broken": 59, "btw": 50, "bug": [1, 61], "build": [1, 7, 34, 45, 46, 62], "built": [1, 11, 42, 67], "built_spacy_n": 15, "bullet": [22, 48, 64], "bunch": 59, "burst": 58, "bursti": [1, 11, 39, 58, 61, 65], "by_the_wai": 49, "c": [12, 34, 35, 45, 46, 62], "cach": [0, 2, 51, 61], "calcul": [1, 2, 5, 11, 12, 16, 18, 21, 28, 33, 41, 48, 49, 50, 56, 57, 58, 60, 62, 63, 64, 65, 66, 67, 68, 72, 73], "calculate_chat_level_featur": [1, 61, 69], "calculate_conversation_level_featur": 69, "calculate_hedge_featur": 64, "calculate_id_scor": 13, "calculate_info_divers": 65, "calculate_named_ent": 15, "calculate_num_question_na": 20, "calculate_politeness_senti": 64, "calculate_politeness_v2": 64, "calculate_team_bursti": 65, "calculate_textblob_senti": 64, "calculate_user_level_featur": 69, "calculate_vector_word_mimicri": 64, "calculate_word_mimicri": 64, "call": [1, 2, 8, 11, 13, 61, 62, 64, 69], "can": [0, 1, 2, 11, 31, 32, 33, 34, 36, 37, 42, 43, 44, 47, 48, 49, 50, 52, 54, 60, 61, 62, 67, 69], "can_you": 49, "cannot": [1, 2, 31, 36, 45, 46, 49, 62], "cao": [21, 24, 33, 43, 44, 56, 57, 62], "cap": [22, 48, 64], "capit": [0, 2, 48], "captur": [29, 30, 32, 34, 35, 38, 41, 42, 55], "caract": 40, "cardiffnlp": [1, 61], "care": 1, "carefulli": 60, "carri": 31, "casa_token": 5, "case": [1, 13, 16, 28, 29, 30, 31, 36, 37, 41, 45, 46, 51, 55, 56, 59, 61], "casual": 43, 
"categori": [21, 32, 42, 45, 46, 49, 52, 67], "caus": [31, 32, 36, 59], "caveat": 42, "center": 62, "central": 34, "centroid": [34, 66], "certain": [5, 19, 30, 42, 45, 46, 49, 71], "certainli": 42, "certainti": [11, 38, 39, 42, 64, 67], "cfm": 4, "chall": [1, 21, 39, 64, 70], "chang": [0, 1, 34, 50, 61, 71], "charact": [1, 2, 3, 15, 19, 37, 42, 49, 62, 64, 65, 66, 67, 71], "characterist": [1, 62], "chat": [0, 1, 2, 4, 5, 6, 7, 8, 12, 13, 14, 16, 23, 25, 28, 29, 32, 35, 36, 41, 44, 45, 46, 49, 59, 61, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "chat_data": [2, 6, 7, 8, 26, 27, 28, 63, 64, 65, 66, 67, 71], "chat_df": 14, "chat_featur": [1, 42, 61, 65, 66], "chat_level_data": 72, "chat_level_featur": 2, "chatlevelfeaturescalcul": [1, 2, 17, 21, 61, 64, 69], "chats_data": 73, "check": [19, 44, 64, 67], "check_embed": [1, 61, 69], "chen": 62, "choic": 1, "choos": [1, 60], "chose": 1, "chronolog": 1, "chunk": [34, 59, 63], "chunk_num": 63, "circlelyt": 13, "citat": [21, 24], "cite": 50, "clarif": [16, 32, 64], "class": [1, 2, 31, 61, 62, 64, 65, 66], "classif": [21, 64], "classifi": [16, 21, 50, 56, 57], "classify_ntri": 16, "classify_text_dalechal": 21, "clean": [2, 17, 19, 67, 71], "clean_text": 19, "clear": 1, "close": [31, 42, 48, 62], "closer": [45, 46, 59], "clue": 62, "cluster": 1, "cmu": 12, "code": [6, 18, 29, 32, 51, 55, 61, 62, 68], "coeffici": [1, 4, 39, 62, 65, 68], "cognit": 62, "colab": [0, 1], "collabor": [59, 62], "collaps": 2, "collect": [1, 2, 34, 49, 50, 52, 61, 62], "colleg": 33, "column": [0, 2, 4, 6, 7, 8, 9, 12, 13, 14, 16, 18, 23, 25, 28, 42, 51, 56, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "column_count_frequ": 28, "column_count_mim": 28, "column_mimc": 28, "column_nam": 71, "column_to_summar": 72, "com": [1, 2, 4, 5, 13, 15, 18, 19, 64, 67, 68, 71], "comb": 62, "combin": [0, 1, 6, 28, 64, 71], "come": [1, 12, 13, 21, 32, 33, 42, 58, 61], "comm": [1, 61], "command": [1, 61], "comment": 48, "commit": 23, "commit_data": 19, "common": [0, 32, 62, 
64], "commonli": 37, "commun": [0, 1, 11, 42, 44, 48, 55, 60, 62, 64], "companion": 1, "compar": [31, 35, 44, 45, 52, 64, 71, 73], "compat": [0, 1, 61], "complement": [31, 36], "complet": [1, 2, 31, 55], "complex": [0, 35, 43, 50, 62], "compon": [2, 50], "compos": 2, "comprehens": [33, 48], "compress": 71, "comput": [0, 1, 2, 4, 5, 6, 10, 11, 12, 13, 14, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 45, 46, 49, 50, 52, 55, 62, 64, 65, 66, 67, 69, 73], "compute_frequ": 28, "compute_frequency_per_conv": 28, "compute_vectors_from_preprocess": [0, 2], "computetf": 28, "conain": 61, "concat_bert_featur": [1, 61, 64], "concaten": [19, 49, 64, 71], "concentr": 55, "concept": [29, 39, 42, 62], "conceptu": [61, 62], "concis": 43, "concret": 29, "conduct": 1, "confid": [2, 5, 15, 30, 47, 64], "configur": 71, "conflict": 62, "confound": 44, "congruent": 34, "conjection_seper": 19, "conjunct": [19, 31, 36, 49], "conjunction_start": 49, "connect": [2, 39], "conscious": 35, "consecut": 22, "consequ": [0, 1], "consid": [1, 2, 33, 37], "consider": [61, 62], "consist": [31, 36, 40, 41], "constitut": 41, "constrain": [34, 35], "construct": [1, 11, 55, 62], "constructor": 47, "consult": 5, "contact": 0, "contain": [1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 25, 28, 29, 30, 35, 38, 42, 47, 49, 50, 55, 61, 62, 63, 64, 67, 71, 72, 73], "content": [0, 1, 12, 13, 28, 34, 36, 39, 41, 42, 45, 46, 62, 64, 67], "content_mimicry_scor": 28, "content_mimicry_score_per_conv": 28, "content_word_accommod": 31, "content_word_accommodation_per_conv": 31, "content_word_mimicri": 28, "contentcod": 67, "contentcodingdictionari": 67, "context": [2, 32, 42, 48, 62, 71], "continu": [56, 57], "contract": 49, "contrast": 39, "contribut": [13, 34, 37, 62], "control": 1, "conv": [1, 61], "conv_data": [2, 65], "conv_features_al": [1, 61], "conv_features_bas": [1, 11, 61], "conv_level_featur": 2, "conv_to_float_arr": 8, "convei": [6, 34, 52], "conveni": [1, 61], "convers": [0, 2, 3, 4, 6, 7, 8, 9, 
12, 13, 23, 25, 28, 29, 31, 34, 35, 36, 37, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 52, 55, 58, 59, 61, 63, 64, 65, 66, 68, 71, 72, 73], "conversation_id": [28, 61, 71], "conversation_id_col": [0, 1, 2, 4, 6, 7, 8, 9, 13, 23, 25, 26, 27, 61, 63, 64, 65, 66, 68, 72, 73], "conversation_num": [0, 1, 2, 6, 7, 64, 66, 73], "conversationlevelfeaturescalcul": [2, 65, 69], "convert": [8, 41, 49, 67, 71], "convict": 5, "convo_aggreg": [0, 1, 2, 65], "convo_column": [0, 1, 2, 65], "convo_method": [0, 1, 2, 65], "convokit": [17, 50, 62, 64], "coordin": 55, "copi": [0, 1, 42], "copular": [31, 36], "core": [34, 69], "cornel": 17, "corpu": [0, 1, 50], "corr_thresh": [0, 1, 2], "corrado": 37, "correl": [0, 1, 2, 41, 55], "correspond": [30, 34, 35, 40, 49, 55, 66], "cosin": [6, 7, 13, 28, 31, 34, 35, 36, 40, 45, 46, 65], "could": [1, 31, 33, 36, 50, 54], "could_you": 49, "couldn": [31, 36], "count": [0, 1, 2, 3, 12, 14, 15, 16, 19, 21, 25, 28, 30, 31, 32, 36, 39, 41, 43, 44, 49, 52, 53, 54, 56, 58, 64, 65, 66], "count_all_cap": 22, "count_bullet_point": 22, "count_charact": 3, "count_difficult_word": 21, "count_ellips": 22, "count_emoji": 22, "count_emphasi": 22, "count_line_break": 22, "count_link": 22, "count_match": [19, 49], "count_messag": 3, "count_numb": 22, "count_parenthes": 22, "count_quot": 22, "count_responding_to_someon": 22, "count_spacy_match": 19, "count_syl": 21, "count_turn": 25, "count_turn_taking_index": 25, "count_user_refer": 22, "count_word": 3, "countabl": [1, 65], "countd": 36, "counterfactu": 50, "cours": [16, 31, 34, 36, 63], "cover": 28, "cpu": [0, 1, 67], "creat": [0, 1, 2, 13, 19, 31, 40, 42, 61, 62, 64, 65, 66, 71], "create_chunk": 63, "create_chunks_messag": 63, "create_cumulative_row": 71, "credit": 33, "criteria": 67, "critic": 71, "crowd": 13, "csv": [1, 2, 61, 62, 67], "cuda": 67, "cumul": [2, 71], "cumulative_group": [0, 1, 2, 71], "current": [1, 11, 23, 31, 34, 35, 36, 40, 45, 46, 58, 61, 64, 71], "curt": 43, "custom": [0, 2, 11, 14, 62], 
"custom_featur": [0, 1, 2, 61], "custom_liwc_dictionari": [14, 64], "custom_liwc_dictionary_path": [0, 2, 42], "customiz": 62, "cut": [], "cutoff": [2, 15, 47, 64], "d": [0, 1, 2, 31, 34, 36, 61], "dale": [1, 21, 39, 64, 70], "dale_chall_help": 21, "danescu": [49, 50], "dash": 22, "data": [0, 2, 6, 7, 8, 9, 13, 19, 20, 32, 37, 40, 41, 47, 51, 55, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "datafram": [0, 1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 23, 25, 28, 37, 42, 47, 49, 59, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "dataknowsal": 15, "dataset": [1, 2, 9, 12, 13, 28, 31, 41, 47, 52, 61, 64, 65, 66, 73], "date": [1, 42, 61], "datetim": [2, 58], "dcosta": 62, "deal": [50, 59], "death": 1, "debat": 59, "debug": 71, "decid": [1, 62], "decis": [1, 13, 62], "declar": [1, 62, 69], "deepli": 62, "default": [0, 1, 2, 5, 11, 13, 16, 23, 30, 34, 35, 42, 47, 62, 63, 64, 66, 67, 71, 73], "defer": [17, 50], "defin": [0, 11, 21, 31, 34, 36, 40, 59, 62, 64, 65, 66, 70], "definit": [1, 3, 44], "degre": [6, 30, 36, 45, 46, 55], "delet": 29, "deliber": 1, "demo": 61, "democrat": 1, "demonstr": 1, "demystifi": 62, "denomin": 59, "denot": 42, "densiti": 60, "dep_": 49, "dep_pair": 19, "depend": [0, 1, 10, 19, 32, 49, 52, 61, 63], "deriv": [2, 11, 65, 66], "descend": 67, "describ": [1, 11, 62], "descript": [1, 61], "design": [0, 1, 2, 13, 34, 62], "desir": [2, 63, 72], "detail": [0, 2, 11, 33, 41, 43, 61, 62], "detect": [0, 1, 32, 37, 38, 47, 48, 49, 54], "determin": [13, 18, 31, 35, 36, 40, 45, 46, 71], "dev": 24, "develop": [5, 37, 40, 62], "deviat": [4, 5, 29, 40, 41, 55, 58, 65, 72, 73], "devic": 67, "df": [2, 4, 8, 9, 12, 13, 16, 18, 23, 28, 63, 71], "dic": [2, 14, 42, 67], "diccategori": 67, "dict": [2, 14, 17, 19, 28, 64, 67, 71], "dicterm": 67, "dictext": 67, "dictionari": [0, 1, 2, 14, 15, 17, 19, 28, 30, 42, 49, 61, 64, 67, 71], "did": [1, 31, 36, 37, 47, 50, 54, 62], "didn": [31, 36], "differ": [0, 1, 2, 4, 11, 12, 23, 28, 29, 31, 34, 36, 37, 39, 40, 44, 45, 
46, 47, 49, 55, 62, 63, 64, 65, 66, 67, 71], "differenti": [49, 59], "difficult": [21, 33], "difficult_word": 21, "difficulti": 33, "dimens": [40, 62], "dimension": [34, 35], "dinner": 41, "direct": [34, 43, 45, 47, 50, 69], "direct_quest": [32, 50, 54], "direct_start": 50, "directli": [1, 62, 69], "directori": [0, 2, 19, 61, 65, 67], "disabl": 1, "disagr": 49, "disagre": 51, "discours": [31, 36], "discov": 2, "discret": [31, 36, 45, 46], "discurs": [0, 1, 6, 8, 39, 40, 61, 65, 66], "discursive_divers": 11, "discus": 8, "discuss": [0, 1, 31, 34, 39, 40, 42, 43, 61, 62, 71], "dispers": 68, "displai": [1, 34, 42, 46, 61], "dispos": 1, "distanc": [34, 35, 40], "distinct": [31, 36, 59], "distinguish": 59, "distribut": 31, "div": 16, "diverg": [6, 34, 35], "divers": [0, 1, 6, 8, 13, 39, 61, 65], "divid": [16, 34, 59, 63], "dl": [21, 24], "do": [0, 1, 29, 31, 34, 36, 37, 43, 49, 50, 54, 62, 69], "doc": [2, 19], "doc_top": 13, "document": [1, 17, 61, 69], "doe": [1, 2, 29, 40, 42, 43, 45, 47, 54, 61, 71], "doesn": [0, 1, 29, 31, 36, 42, 45, 61, 67], "doi": [5, 6, 21, 24, 64], "domain": [31, 50], "domin": 1, "don": [31, 36, 49, 54, 62, 67], "done": [2, 50], "dot": 22, "doubl": 30, "down": [31, 36], "download": [1, 61], "download_resourc": [1, 61], "downstream": [17, 62], "dozen": 62, "drive": [62, 69], "driver": [2, 61, 64, 65, 66], "drop": [0, 1, 2, 64], "drop_redundant_column": [0, 1, 2], "due": [34, 59], "duncan": 62, "duplic": [1, 2, 71], "durat": [58, 63], "dure": [2, 55, 59, 62], "dynam": [59, 61], "e": [0, 1, 2, 4, 15, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 41, 42, 47, 48, 49, 52, 54, 56, 59, 61, 63, 65, 66, 67, 71], "e2": [21, 70], "each": [0, 1, 2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19, 23, 25, 28, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 52, 55, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73], "earlier": [0, 1, 42], "easi": [1, 21, 62, 70], "easier": 21, "easili": 33, "easy_word": 21, "eat": 34, "echo": 31, "econom": 37, "edg": [29, 59], 
"edu": [1, 12, 16, 17, 70], "effect": [1, 41], "effici": 1, "effort": 55, "either": [2, 20, 52, 55, 67], "elaps": 58, "element": [1, 6, 67], "ellips": [22, 48, 64], "els": [1, 22, 47, 64], "embed": [8, 31, 34, 35, 36, 45, 46, 65, 66, 67, 69], "emili": [30, 35, 45, 46, 47, 59, 62], "emoji": [22, 42, 48, 64, 67, 71], "emot": [1, 61], "emoticon": 48, "emphas": [22, 48, 64], "emphasi": 48, "empirica": [1, 2, 71], "emploi": 45, "empti": [0, 2, 13, 64, 67], "en": [1, 21, 24, 61, 70], "en_core_web_sm": [1, 61], "enabl": 71, "enclos": 22, "encod": [1, 8], "encompass": 62, "encount": [1, 34, 35, 61], "encourag": 64, "end": [0, 1, 15, 20, 23, 34, 42, 54, 62, 63, 67], "end_timestamp_col": 2, "engag": 43, "engin": 2, "english": [34, 42], "enjoi": 62, "ensur": [0, 1, 40, 42, 49, 61, 63, 67], "entir": [0, 1, 12, 28, 31, 36, 40, 41, 52, 59, 62, 73], "entiti": [0, 2, 15, 39, 64], "entityrecogn": 47, "entri": [1, 28, 61], "ep8dauru1ogvjurwdbof5h6ayfbslvughjyiv31d_as6ppbt": 5, "equal": [1, 21, 34, 37, 40, 55, 59, 61, 62, 63], "equival": [0, 1, 41, 55, 61], "eric": 62, "error": [1, 16, 61, 71], "escap": 42, "especi": [41, 62], "essenti": 51, "establish": 31, "estim": 31, "et": [1, 5, 16, 18, 21, 24, 30, 31, 32, 33, 34, 35, 36, 38, 42, 43, 44, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 64], "etc": [10, 15, 16, 17, 42], "evalu": [5, 47, 50], "evan": 62, "even": [0, 1, 2, 34, 37, 42, 62, 63, 67], "evenli": [34, 55], "event": [1, 34, 55, 61], "ever": 62, "everi": [1, 4, 13, 31, 34, 35, 36, 61, 62], "everybodi": [31, 36], "everyon": [31, 36, 47, 62], "everyth": [31, 36, 56], "everywher": [31, 36], "evolut": 35, "evolv": [35, 71], "exactli": [1, 2, 71], "examin": [40, 62, 63], "exampl": [0, 10, 11, 15, 21, 24, 29, 31, 32, 34, 37, 42, 43, 48, 50, 51, 54, 56, 59, 60, 61, 62, 67], "example_data": 1, "exce": [1, 15], "except": [42, 67, 71], "exchang": [12, 35, 39, 40, 45, 55, 64], "exclud": [0, 41, 42], "exclus": [41, 42], "excus": 32, "exhibit": 35, "exist": [0, 1, 2, 55, 61, 62, 63, 64, 67], 
"expand": 49, "expect": [1, 37, 42, 47], "expected_valu": 47, "explain": [0, 29], "explan": [29, 43], "explor": [61, 62], "express": [5, 14, 30, 31, 32, 36, 38, 42, 64, 67], "extend": 1, "extens": [43, 44], "extent": [1, 4, 7, 12, 31, 34, 35, 37, 51, 55, 59, 61], "extern": 48, "extra": 51, "extract": [1, 17, 19, 28, 40, 50, 64], "extrem": [55, 56, 57], "face": [1, 51, 61], "facilit": [62, 71], "fact": [4, 35, 50, 54, 59], "factual": [17, 24, 50], "fail": [1, 61], "fals": [0, 1, 2, 31, 54, 61, 67, 71], "famili": 42, "far": [34, 35, 46, 50, 62], "faster": 14, "feat_count": 19, "featur": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67], "feature_build": [0, 1, 42, 61, 64, 71], "feature_dict": [1, 61], "feature_method": [64, 65], "feature_nam": [1, 61], "featurebuild": [0, 2, 11, 42, 47, 69], "features_conceptu": [1, 61], "feauturebuild": 1, "few": [48, 62], "fewer": [12, 60], "fflow": 11, "field": [13, 17], "file": [0, 2, 12, 14, 19, 42, 61, 65, 67, 71], "filenam": [1, 2, 19], "filenotfounderror": 67, "filler": [37, 60], "filler_paus": 49, "filter": [2, 19, 62], "final": [1, 2, 34, 42, 62], "find": [1, 19, 28, 50], "fingertip": 62, "finit": 55, "first": [0, 2, 11, 12, 16, 19, 31, 34, 35, 36, 39, 40, 41, 42, 45, 46, 49, 52, 54, 59, 61, 62, 64, 67, 70, 71], "first_person": 12, "first_person_plur": 49, "first_person_raw": [12, 16], "first_person_singl": 49, "five": 37, "fix": [52, 67], "fix_abbrevi": 67, "flag": [1, 71], "float": [0, 2, 4, 5, 6, 8, 10, 13, 14, 16, 21, 24, 25, 28, 68], "floor": 59, "flow": [0, 1, 7, 31, 36, 39, 41, 45, 46, 61, 64], "focal": [31, 36], "focu": 41, "folder": [0, 1, 19], "follow": [0, 1, 2, 11, 16, 17, 29, 31, 32, 33, 41, 42, 47, 49, 50, 53, 55, 59, 60, 61, 64, 65, 67], "for_m": 49, "for_you": 49, "forc": [0, 1, 61], "form": 1, "formal": [1, 61], "formal_titl": 49, "format": [0, 1, 8, 17, 22, 42, 47, 48, 61, 62, 64, 67], "former": [45, 46], "formula": [33, 42, 59, 64, 70], 
"fornt": 1, "forward": [0, 1, 7, 39, 41, 61, 64], "forward_flow": 35, "found": [1, 2, 5, 28, 30, 33, 61, 69], "four": [1, 8], "fourth": 33, "frac": 55, "fraction": [1, 59], "framework": [49, 50, 62], "frequenc": [28, 31, 44, 64], "frequency_dict": 28, "fridai": 34, "from": [0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 19, 21, 28, 29, 31, 32, 33, 34, 35, 36, 39, 41, 42, 49, 50, 51, 53, 55, 56, 57, 58, 61, 62, 64, 65, 66, 67, 71], "full": [1, 28, 37], "full_empirical_dataset": 1, "fulli": [32, 48], "functinon": 12, "function": [1, 2, 3, 4, 10, 11, 12, 13, 14, 16, 20, 21, 28, 31, 39, 44, 45, 46, 50, 56, 57, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73], "function_mimic_word": 28, "function_mimicry_scor": 28, "function_word_mimicri": 28, "function_word_refer": 28, "fund": 62, "further": [1, 61, 71], "furthermor": 42, "futur": [23, 66], "g": [0, 1, 2, 4, 15, 20, 29, 31, 32, 36, 37, 38, 41, 42, 47, 48, 52, 54, 59, 61, 63, 65, 66, 67, 71], "game": [1, 2, 59, 71], "gaug": [33, 52], "gener": [0, 2, 9, 11, 12, 16, 21, 31, 34, 35, 36, 40, 42, 45, 46, 49, 51, 59, 65, 66, 67, 69, 71, 72], "generaliz": 23, "generate_bert": 67, "generate_certainty_pkl": 67, "generate_lexicon_pkl": 67, "generate_summary_stat": 2, "generate_vect": 67, "gensim": 40, "get": [16, 20, 21, 28, 30, 31, 36, 49, 66, 67], "get_all_dd_featur": 11, "get_centroid": 66, "get_certainti": 5, "get_certainty_scor": 64, "get_content_words_in_messag": 28, "get_conversation_level_aggreg": 65, "get_cosine_similar": 6, "get_dale_chall_easy_word": [21, 70], "get_dale_chall_score_and_classf": 64, "get_dd": 6, "get_dd_featur": 8, "get_dep_pair": [19, 49], "get_dep_pairs_noneg": [19, 49], "get_discursive_diversity_featur": 65, "get_first_pct_of_chat": 1, "get_first_person_word": [12, 70], "get_forward_flow": [7, 64], "get_function_word": 70, "get_function_words_in_messag": 28, "get_gini": 68, "get_gini_featur": 65, "get_info_divers": 13, "get_info_exchange_wordcount": 12, "get_liwc_count": 14, "get_max": 72, 
"get_mean": 72, "get_median": 72, "get_mimicry_bert": 28, "get_min": 72, "get_moving_mimicri": 28, "get_named_ent": 64, "get_nan_vector": [27, 67], "get_polarity_scor": 24, "get_politeness_strategi": 17, "get_politeness_v2": 18, "get_proportion_first_pronoun": 16, "get_question_word": 70, "get_reddit_featur": 64, "get_senti": 67, "get_stdev": 72, "get_subjectivity_scor": 24, "get_sum": 72, "get_team_bursti": 4, "get_temporal_featur": [4, 64], "get_time_diff": 23, "get_time_diff_startend": 23, "get_turn": 25, "get_turn_id": 71, "get_turn_taking_featur": 65, "get_unique_pairwise_combo": 6, "get_user_level_aggreg": 65, "get_user_level_summary_statistics_featur": 66, "get_user_level_summed_featur": 66, "get_user_max_datafram": 72, "get_user_mean_datafram": 72, "get_user_median_datafram": 72, "get_user_min_datafram": 72, "get_user_network": [11, 66], "get_user_stdev_datafram": 72, "get_user_sum_datafram": 72, "get_variance_in_dd": 26, "get_within_person_disc_rang": 27, "get_word_ttr": 16, "get_zscore_across_all_chat": 73, "get_zscore_across_all_convers": 73, "gina": 62, "gini": [1, 39, 62, 65, 68], "gini_coeffici": [11, 69], "github": [0, 1, 2, 18, 19, 67, 71], "give": [0, 1, 29, 37], "give_ag": 49, "given": [0, 1, 5, 6, 13, 14, 28, 30, 31, 33, 34, 35, 36, 40, 41, 55, 59, 66, 67, 71], "go": [1, 34, 35, 45, 46, 50, 62], "goal": 62, "goe": 67, "good": [50, 56, 62], "goodby": 49, "googl": [0, 1], "got": [31, 36], "gotta": [31, 36], "gpu": [0, 2, 67], "grade": 33, "grader": 21, "grai": 35, "grammat": 36, "granularli": 35, "grate": [42, 62], "gratitud": [17, 49, 50], "great": [47, 50, 51, 56, 59, 60, 62], "greater": 55, "greet": 50, "groceri": 41, "group": [0, 2, 4, 13, 29, 33, 34, 41, 52, 59, 62, 68, 71, 72], "grouping_kei": [0, 1, 2, 71], "gt": 22, "guess": 10, "gun": 1, "gy": 15, "gym": 34, "ha": [0, 1, 32, 34, 35, 37, 42, 43, 46, 52, 54, 55, 56, 59, 61, 62, 63, 67, 71], "had": [1, 31, 36, 54, 61], "hadn": [31, 36], "handl": [19, 29], "happen": [1, 2, 55, 62, 63], 
"happi": 42, "hardcod": 67, "harder": 21, "hashedg": [17, 50], "hasn": [31, 36], "hasneg": 50, "hasposit": 50, "hate": 31, "have": [0, 1, 10, 12, 16, 31, 34, 36, 37, 40, 41, 42, 45, 46, 50, 54, 59, 60, 61, 62, 71], "haven": [31, 36], "he": [1, 31, 36], "header": [18, 67], "hear": 32, "heart": [61, 62], "heat": 1, "heavi": 62, "hedg": [11, 30, 39, 49, 50, 64], "hei": [1, 35, 45, 46, 50], "helena": [47, 62], "hello": [0, 43, 49], "help": [0, 31, 34, 36, 43, 45, 46, 52, 58, 69], "helper": 67, "her": [30, 31, 36], "here": [1, 29, 31, 34, 41, 42, 47, 61, 62, 66], "herself": [31, 36], "hesit": [60, 64], "hi": [31, 35, 36, 43, 45, 46], "hierach": 71, "hierarch": 71, "high": [0, 1, 2, 61, 62, 71], "higher": [0, 1, 21, 31, 34, 36, 40, 41, 42, 44, 45, 46, 55, 60], "highest": [1, 71], "highli": [0, 1, 2], "highlight": 1, "him": [31, 36], "himself": [31, 36], "hmm": [31, 36], "hoc": 62, "hold": 31, "hole": 62, "home": 42, "homework": 34, "homonym": 31, "hood": 1, "hope": 35, "host": [45, 46], "hour": 48, "how": [1, 5, 28, 29, 30, 31, 34, 35, 36, 39, 43, 45, 51, 52, 54, 56, 62], "howev": [0, 1, 3, 11, 35, 40, 42, 44, 54, 56, 61, 62], "howitwork": 1, "html": [1, 2, 15, 17, 24, 61], "http": [1, 2, 4, 5, 6, 12, 13, 15, 16, 17, 18, 19, 21, 24, 41, 45, 46, 47, 61, 64, 67, 68, 70, 71], "hu": [1, 42, 62], "hug": [1, 51, 61], "huggingfac": 1, "huh": [31, 32, 36], "human": [37, 50, 62], "hyperlink": 48, "hyphen": [1, 42, 61, 67], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 23, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 71, 73], "iby1": 5, "id": [2, 4, 7, 23, 28, 62, 64, 66, 68, 71, 72, 73], "idea": [12, 35, 40, 47, 51], "ident": [34, 35], "identif": 1, "identifi": [0, 1, 2, 4, 8, 9, 15, 23, 25, 30, 31, 41, 47, 50, 52, 61, 63, 64, 71, 72], "identiif": [13, 71], "ignor": [1, 32], "illustr": [1, 41, 48, 62], "imagin": 1, "immedi": [31, 35, 
64], "impact": [1, 60], "impersonal_pronoun": 49, "implement": 64, "impli": 37, "import": [31, 32, 36, 44, 45, 62, 69], "incent": 13, "includ": [0, 1, 10, 17, 22, 31, 32, 35, 36, 42, 45, 46, 51, 52, 56, 61, 62, 66, 71], "inclus": [13, 71], "incongru": [8, 34], "incorpor": [1, 42, 45, 46], "increas": [1, 62], "incredibli": 42, "increment": 71, "independ": 1, "index": [1, 2, 4, 13, 25, 37, 39, 55, 61, 65], "indic": [1, 2, 16, 21, 22, 30, 32, 34, 35, 36, 40, 41, 43, 44, 48, 49, 50, 52, 55, 60, 63, 71], "indirect": 50, "indirect_btw": 50, "indirect_greet": 50, "indirectli": 69, "individu": [0, 1, 5, 11, 31, 34, 37, 45, 50, 59, 60, 62, 72], "inequ": 37, "infer": [1, 51, 67], "influenc": 1, "info": [13, 18, 64, 71], "info_divers": 13, "info_exchang": 64, "info_exchange_wordcount": [41, 64], "info_exchange_zscor": 11, "inform": [0, 6, 11, 12, 13, 24, 32, 34, 39, 48, 62, 64, 65], "informal_titl": 49, "information_divers": 11, "initi": [2, 62, 63, 64, 65, 66], "input": [0, 2, 4, 6, 12, 13, 14, 15, 16, 19, 20, 22, 28, 42, 50, 55, 60, 62, 63, 64, 65, 66, 67, 71, 72], "input_data": [25, 68, 72], "input_df": [1, 2, 42, 61, 71], "inquiri": [0, 30, 39, 52], "insid": 1, "insight": 1, "inspir": 15, "instal": [1, 61, 62], "instanc": [1, 22, 50, 59, 66], "instanti": 2, "insteac": 1, "instead": [1, 62], "instruct": [1, 61], "int": [2, 3, 10, 13, 15, 16, 19, 20, 22, 28, 63, 64, 67, 71], "intact": 71, "integ": [0, 13, 40, 47], "intend": 59, "interact": [1, 11, 43, 44, 62, 69], "interconnect": 62, "interest": [1, 61, 62], "interfac": 62, "intermedi": [59, 64], "intern": 29, "interpret": [0, 1], "interrupt": 59, "interv": [58, 65], "introduc": [42, 62], "introduct": [11, 61], "invalid": [2, 67], "invers": 64, "involv": [41, 62, 65], "io": [1, 24, 47, 61], "ipynb": [0, 1], "is_hedged_sentence_1": 10, "is_valid_term": 67, "isn": [1, 31, 36], "issu": [1, 31, 36, 37, 42, 61], "ital": 64, "italic": 22, "item": [0, 71], "its": [0, 1, 15, 31, 35, 36, 40, 41, 47, 54, 55, 64, 69], "itself": [31, 
36, 44], "jami": [0, 42], "john": 1, "jonson": 62, "journal": [5, 64], "json": [1, 61], "jurafski": 70, "juri": 1, "juries_df": 1, "jury_conversations_with_outcome_var": 1, "jury_feature_build": 1, "jury_output": 1, "jury_output_chat_level": [1, 61], "jury_output_turn_level": 1, "just": [1, 2, 31, 36, 46, 50, 59, 61, 62], "katharina": 34, "keep": [1, 2, 71], "keep_one_column_per_group": 2, "kei": [1, 2, 4, 19, 28, 30, 54, 61, 71], "kept": 1, "keyword": [19, 49], "kind": [10, 62], "kitchen": 42, "knob": 0, "know": 30, "knowledg": 29, "known": [1, 32, 61], "kumar": 62, "kw": 19, "l714": 67, "l81": 67, "lab": [1, 2, 62, 71], "label": [1, 15, 21, 51], "lack": [31, 38, 45, 46], "languag": [15, 31, 34, 42, 50, 62], "larg": [1, 31, 69], "larger": [0, 31, 61], "last": [1, 31], "late": 32, "later": [0, 1, 42, 61], "latest": [1, 61], "latter": [31, 36], "lda": [13, 40], "learn": [1, 61, 62], "least": [10, 32, 42, 63, 67], "led": 62, "legal": 49, "lemmat": [13, 40], "len": 28, "length": [35, 39, 41, 42, 44, 67], "less": [1, 13, 32, 50, 52, 55, 62, 63], "let": [41, 49, 53], "let_me_know": 49, "letter": [49, 71], "level": [0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 14, 16, 23, 61, 64, 65, 66, 71, 72], "lexic": [1, 10, 12, 14, 16, 31, 32, 36, 42, 60, 62, 64], "lexical_featur": [14, 64], "lexical_features_v2": [10, 11], "lexicon": [0, 5, 10, 14, 30, 39, 50, 52, 67, 69], "lexicons_dict": 67, "librari": [34, 51, 56, 57], "lift": 62, "light": 61, "like": [1, 22, 31, 34, 36, 41, 50, 61, 62], "limiat": 32, "limit": [11, 32, 37, 42, 54], "line": [0, 1, 19, 22, 48, 61, 62, 64, 67], "linear": 64, "linguist": [0, 18, 19, 30, 39, 50, 52], "link": [22, 29, 48, 50, 64], "list": [1, 2, 6, 7, 10, 11, 12, 13, 15, 19, 20, 21, 22, 28, 31, 33, 36, 37, 42, 48, 49, 50, 53, 54, 61, 64, 65, 66, 67, 68, 70, 71], "liter": 42, "literatur": 62, "littl": 38, "littlehors": 1, "liu": [42, 52], "live": [1, 54], "liwc": [0, 2, 14, 30, 39, 51, 52, 56, 62, 64, 67], "liwc2015": 42, "liwc_featur": [10, 14], 
"liwc_test_output": 42, "lix": 34, "ll": [1, 31, 36, 61], "load": [2, 19, 67, 69], "load_custem_liwc_dict": 2, "load_liwc_dict": 67, "load_saved_data": 19, "load_to_dict": 19, "load_to_list": 19, "loc": 15, "local": [1, 42, 51, 61], "locat": [1, 62], "log": [0, 1, 2, 67, 71], "log_column_group": 2, "log_file_path": 71, "logger": [2, 64, 65, 66, 67, 71], "long": 4, "longer": [30, 41, 43, 48, 61, 62], "look": [2, 34, 61, 65, 66], "loos": 36, "lot": [31, 36], "loud": 60, "love": [31, 56], "low": [1, 2, 29, 55, 60, 71], "lower": [0, 1, 21, 31, 33, 36, 41, 44, 55, 60], "lowercas": [2, 13, 40, 48, 49, 71], "lowest": 71, "lpearl": 16, "lst": 6, "m": [0, 2, 23, 30, 31, 36], "made": [1, 23, 35, 59, 61, 62], "magnitud": 55, "mai": [1, 11, 28, 31, 32, 35, 36, 37, 41, 42, 43, 44, 54, 61, 62, 71], "main": [1, 2, 5, 19, 62, 64, 65, 66, 67], "make": [1, 5, 31, 34, 55, 56, 62, 66, 69, 71], "man": 62, "mani": [0, 1, 4, 11, 32, 37, 41, 60, 62, 66], "manner": [55, 62], "manual": [1, 61], "map": [13, 34, 67], "mark": [19, 20, 22, 43, 54, 64, 71], "marker": [18, 32, 39, 42, 50, 51, 52, 54, 56], "marlow": 44, "matarazzo": 62, "match": [1, 5, 16, 19, 30, 67], "math": 34, "matter": [28, 47], "max": [0, 1, 2, 11, 66, 72], "max_cols_per_group": 2, "max_group": 2, "max_num_chunk": 63, "max_user_mean_num_word": 1, "maxim": [34, 35, 37, 72], "maximum": [1, 2, 63, 65, 72], "mayb": [38, 47], "mcfarland": 70, "me": [31, 32, 36, 41, 50, 53], "mean": [0, 1, 2, 4, 6, 11, 13, 21, 29, 31, 34, 36, 40, 41, 42, 47, 55, 56, 58, 61, 62, 65, 66, 72, 73], "mean_num_word": 1, "meaning": [31, 41, 55], "meaningless": 41, "meant": 39, "measur": [0, 1, 7, 12, 13, 20, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 51, 52, 54, 55, 56, 57, 58, 59, 60, 62, 64, 68], "mechan": 32, "median": [0, 1, 72], "medium": 21, "meet": [1, 48], "member": [13, 34, 37, 55], "merg": [2, 8, 65, 66], "merge_conv_data_with_origin": 2, "messag": [0, 1, 2, 3, 4, 5, 8, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 28, 
30, 31, 34, 35, 36, 37, 39, 41, 45, 46, 47, 48, 50, 51, 52, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 71, 73], "messaga": 61, "message_col": [0, 1, 2, 12, 13, 14, 61, 64, 65, 67, 71], "message_col_origin": 14, "message_embed": [6, 7, 8], "message_lower_with_punc": 71, "metadata": [0, 1], "method": [1, 2, 5, 31, 41, 50, 62, 65], "metric": [0, 1, 2, 8, 30, 34, 35, 46, 47, 48, 55, 66], "michael": 1, "mid": [1, 2, 71], "middl": [21, 34, 63], "might": [0, 1, 29, 43, 48, 53], "mikeyeoman": [18, 64], "mileston": 34, "millisecond": [0, 2], "mimic": [28, 31, 36, 45], "mimic_word": 28, "mimick": [28, 31, 64], "mimicri": [0, 1, 28, 31, 35, 36, 39, 61, 64], "mimicry_bert": [45, 46], "min": [1, 2, 11, 72], "min_group_s": [0, 1, 2], "min_na_ratio": [0, 1, 2], "min_zero_ratio": [0, 1, 2], "mind": [1, 35, 50], "mine": [31, 36, 53, 59], "minim": [0, 41, 60], "minimum": [1, 2, 65, 72], "minmiz": 72, "minu": [12, 41, 64], "minut": [55, 58], "mirror": 1, "miss": [0, 1, 2, 32, 61], "missing": 2, "mitig": [31, 36], "mizil": [49, 50], "mm": [31, 36], "mnsc": 6, "modal": 50, "mode": 60, "model": [1, 2, 13, 15, 31, 34, 35, 36, 40, 45, 46, 47, 51, 62, 67], "model_bert": 67, "modif": 35, "modifi": [1, 9, 19, 32, 64], "modul": [0, 1, 11, 34, 49, 50, 61, 69], "monologu": 59, "more": [0, 1, 11, 12, 22, 23, 24, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 45, 46, 50, 52, 55, 59, 61, 62, 71], "morn": 1, "most": [1, 24, 31, 55, 62, 69], "mostli": 1, "motiv": 61, "move": [0, 1, 28, 31, 36, 39, 45, 59, 61], "movi": 31, "much": [1, 28, 31, 34, 35, 36, 45, 62], "multi": [1, 2, 71], "multidimension": [45, 46], "multipl": [0, 1, 2, 19, 62, 71], "must": [1, 6, 62, 71], "my": [30, 31, 35, 36, 45, 46, 50, 53], "my_chat_featur": 1, "my_feature_build": 61, "my_fil": 1, "my_output": 61, "my_output_chat_level": 61, "my_output_conv_level": 61, "my_output_user_level": 61, "my_pandas_datafram": 61, "myself": [31, 36, 53], "n": [0, 2, 35, 45, 46, 47, 57, 59, 60], "n_chat": 59, "na": [0, 1, 5, 33, 43, 44, 48, 49, 
50, 53, 58], "naiv": [20, 32, 34, 38, 39, 53, 56, 57, 64], "name": [0, 2, 4, 7, 8, 9, 12, 13, 14, 15, 17, 19, 23, 25, 28, 30, 32, 35, 39, 42, 45, 46, 50, 51, 56, 63, 64, 66, 67, 68, 71, 72, 73], "name_to_train": 47, "named_ent": [15, 47], "named_entity_recognition_featur": 11, "nan": [0, 34, 67], "nate": [35, 45, 46], "nathaniel": [35, 45, 46], "nativ": 50, "natur": [43, 55], "ndarrai": 68, "nearest": [13, 40], "nearli": 62, "necessari": [63, 67], "need": [0, 1, 2, 21, 62, 66, 67], "need_sent": 67, "need_senti": 67, "neg": [1, 24, 29, 31, 34, 35, 36, 42, 50, 51, 52, 54, 56, 61, 62, 67], "negat": [19, 49], "negative_bert": [0, 1, 51, 61], "negative_emot": [49, 51, 52, 56], "negoti": 62, "neighborhood": 54, "neither": 30, "ner": 15, "ner_cutoff": [0, 1, 2, 47, 64], "ner_train": 64, "ner_training_df": [0, 1, 2, 47], "nest": [0, 1, 2, 22, 71], "net": [45, 46], "network": 11, "neutral": [1, 5, 24, 30, 51, 55, 61, 67], "neutral_bert": [1, 51, 61], "never": 1, "new": [0, 1, 4, 13, 34, 61, 64, 65, 66, 72], "new_column_nam": 72, "next": [1, 32, 47, 58], "nice": [1, 50, 54, 61], "nicknam": 1, "niculescu": [49, 50], "night": 31, "nikhil": [59, 62], "nltk": [1, 42, 61], "nobodi": [31, 36], "nois": 32, "non": [1, 2, 28, 31, 37, 42, 48, 61, 62, 67, 71], "none": [1, 2, 19, 37, 55, 61, 64, 65, 66, 67], "nor": 30, "normal": [19, 28, 31], "notabl": 62, "note": [0, 2, 12, 16, 20, 42, 61, 67, 71], "notebook": [0, 1], "noth": [31, 36, 56], "notset": 71, "noun": 1, "novel": [45, 46], "now": [0, 1], "nowher": [31, 36], "np": [67, 68], "ntri": 32, "null": 34, "num": 48, "num_char": 65, "num_chunk": [27, 63], "num_hedge_word": 10, "num_messag": 65, "num_named_ent": [15, 47], "num_row": 63, "num_top": 13, "num_word": [12, 16, 65], "number": [0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 25, 28, 31, 32, 34, 36, 37, 40, 41, 42, 43, 44, 47, 48, 49, 54, 56, 58, 59, 60, 62, 63, 64, 66, 67, 69, 71, 72], "numer": [0, 1, 2, 11, 13, 33, 68, 72, 73], "numpi": [1, 61, 68], "o": 35, "object": [1, 
2, 11, 19, 44, 50, 57, 58, 61, 62, 64, 65, 66], "obtain": [0, 1, 13, 17, 23, 24, 34, 42, 61], "occur": [0, 4, 31, 42, 71], "occurr": 19, "off": [0, 1, 31, 36], "offer": 0, "offici": [61, 67], "often": [28, 36, 47, 48, 62], "oh": [31, 36, 48], "okai": [31, 36], "older": [1, 49, 61], "on_column": [18, 23, 28, 68, 72, 73], "onc": [1, 2, 11, 58, 61, 62, 67], "one": [0, 1, 2, 4, 10, 12, 19, 23, 25, 28, 29, 31, 32, 36, 37, 47, 51, 56, 59, 61, 62, 67, 68, 71, 73], "ones": [31, 36], "onli": [0, 1, 2, 5, 11, 23, 29, 31, 32, 34, 36, 37, 45, 53, 58, 59, 61, 62, 67, 71], "onlin": [1, 32, 39, 64], "onward": 0, "open": [0, 62, 66], "operation": [39, 50, 59], "opinion": [24, 31], "oppos": [2, 31, 34, 35, 55], "opposit": 34, "opt": 1, "option": [1, 2, 37, 62, 63, 67, 71], "order": [0, 1, 35, 37, 42, 67, 71], "org": [2, 6, 15, 21, 24, 41, 70], "organ": 1, "origin": [1, 2, 5, 12, 21, 31, 32, 35, 36, 37, 45, 46, 49, 50, 59], "orthogon": 34, "other": [0, 1, 2, 9, 11, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 45, 46, 48, 51, 52, 54, 56, 58, 59, 61, 62, 64, 66, 71], "other_lexical_featur": [11, 64], "otherwis": [2, 10, 21, 32, 38, 63, 67, 71], "our": [0, 1, 2, 11, 13, 29, 31, 32, 36, 37, 39, 53, 59, 61, 71], "ourselv": 53, "out": [1, 16, 19, 31, 36, 42, 55, 60, 62], "outcom": [1, 44, 62], "output": [0, 2, 10, 17, 19, 40, 42, 61, 62, 64, 67, 71], "output_file_bas": [0, 1, 2, 42, 61], "output_file_path_chat_level": [1, 2], "output_file_path_conv_level": [1, 2], "output_file_path_user_level": [1, 2], "output_path": 67, "outsid": [1, 2, 12], "over": [1, 16, 29, 31, 34, 35, 36, 37, 53, 55, 60, 62, 67, 71], "overal": [30, 31, 34, 36, 45, 46], "overrid": [0, 1, 2], "overview": [0, 61, 62], "overwhelmingli": 1, "overwritten": 1, "own": [0, 1, 2, 9, 35, 62, 64], "p": 55, "pacakg": 24, "pace": [43, 62], "packag": [17, 18, 40, 62], "pad": 19, "page": [1, 11, 29, 39, 61, 62, 69], "pair": [6, 19, 34, 49, 71], "pairwis": [6, 34], "panda": [0, 1, 2, 12, 14, 16, 23, 47, 64, 65, 66, 71, 72, 73], 
"paper": [4, 5, 12, 18, 29, 40, 49, 50, 64], "paragraph": 22, "param": 19, "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 47, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73], "paramt": 1, "pardon": 32, "parenthes": [22, 42, 48, 64], "parenthet": [22, 48], "pars": [2, 16, 50, 60], "parser": 67, "part": [1, 10, 13, 29, 36, 42, 52, 67, 71], "particip": [1, 9, 37, 62], "particl": [31, 36], "particular": [1, 11, 31, 32, 34, 41, 45, 47, 51, 59, 62], "particularli": 42, "partner": 32, "pass": [1, 13, 21, 47, 71], "path": [1, 2, 19, 61, 67, 71], "path_in": 19, "pattern": [4, 11, 19, 42, 55, 62, 67], "paus": 4, "pd": [1, 2, 4, 6, 7, 8, 9, 12, 13, 14, 15, 16, 18, 19, 23, 25, 63, 64, 65, 66, 67, 68, 71], "pdf": [5, 12, 13, 16, 18, 21, 24, 64, 70], "penalti": 1, "pennebak": [0, 12, 37, 41, 42, 52], "pennyslvania": 62, "peopl": [1, 32, 59, 62], "per": [0, 1, 2, 6, 9, 19, 42, 63, 66, 72], "percentag": [2, 21], "perfect": [37, 59], "perform": [0, 1, 16, 50], "perhap": 1, "period": [4, 34, 55], "person": [1, 8, 12, 15, 16, 32, 34, 39, 41, 42, 50, 59, 62, 64, 70], "perspect": 1, "petrocelli": 5, "phrase": [19, 30, 38, 54], "phrase_split": 19, "pickl": [19, 67], "piec": [36, 42, 59, 63], "pl": 50, "place": [55, 61, 62], "plan": [34, 35, 45, 46], "player": 59, "pleas": [0, 1, 38, 49, 50, 61, 62], "please_start": 50, "plu": 2, "point": [22, 24, 34, 35, 42, 45, 46, 48, 52, 64, 66], "poisson": 55, "polar": [24, 39, 51, 52, 64], "polit": [1, 17, 18, 30, 32, 38, 39, 42, 51, 52, 54, 56, 64], "politeness_featur": 11, "politeness_v2": 11, "politeness_v2_help": 11, "politenesspi": 19, "politenessstrategi": [17, 50], "popul": 1, "portion": 1, "posit": [0, 1, 11, 15, 24, 29, 31, 39, 42, 50, 51, 54, 56, 61, 62, 64, 67], "positive_affect_lexical_per_100": [51, 52, 56], "positive_bert": [0, 1, 51, 61], "positive_emot": [49, 51, 52, 56], "positivity_bert": [1, 61], "positivity_zscor": 64, "positivity_zscore_chat": 52, "positivity_zscore_convers": 52, 
"possess": 31, "possibl": [1, 34, 62, 66], "possibli": [38, 62], "ppron": 67, "practic": [34, 35], "pre": [1, 4, 21, 37, 49, 64], "preced": [31, 35, 71], "precend": 35, "precis": 47, "precomput": 51, "predefin": 19, "predetermin": [31, 36], "predict": [2, 47, 51, 64], "prefer": [0, 1], "preload_word_list": 69, "prep_simpl": 19, "prep_whol": 19, "preposit": [31, 36], "preproces": 48, "preprocess": [0, 1, 2, 13, 19, 40, 43, 49, 51, 61, 69], "preprocess_chat_data": 2, "preprocess_conversation_column": 71, "preprocess_naive_turn": 71, "preprocess_text": 71, "preprocess_text_lowercase_but_retain_punctu": 71, "presenc": [2, 32, 67], "present": [1, 2, 14, 30, 31, 38, 42, 55, 62], "preserv": [1, 2, 42], "prespecifi": 19, "prevent": 51, "previou": [1, 7, 28, 31, 36, 45, 46, 58, 64, 71], "previous": [], "primari": 34, "print": [2, 71], "prior": [2, 64, 71], "priya": [47, 62], "probabl": [15, 47], "problem": 62, "procedur": 62, "proceed": 46, "process": [0, 1, 2, 4, 10, 21, 37, 42, 55, 62, 64, 65, 67, 69, 71], "prodi": 15, "produc": [1, 2, 34], "product": 15, "professor": 62, "progress": [1, 2], "project": [54, 62], "pronoun": [12, 16, 31, 36, 39, 41, 42, 64, 67, 70], "proper": 1, "properli": 42, "properti": [1, 11, 61], "proport": [16, 39, 42, 64], "propos": 37, "provid": [0, 1, 2, 15, 29, 30, 33, 36, 39, 44, 47, 54, 62], "proxi": 42, "pseudonym": 1, "psycholog": 42, "pub": 70, "publish": [5, 30, 64], "pubsonlin": 6, "punctuat": [0, 2, 16, 19, 20, 21, 28, 43, 54, 60, 67, 71], "punctuation_seper": 19, "puncut": 48, "pure": [24, 36], "purpos": 1, "put": [34, 42, 50, 62, 66], "py": [0, 1, 14, 19, 49, 61, 64, 67], "pydata": 2, "pypi": [1, 61], "python": [1, 32, 41, 56, 57, 61, 62, 68], "qtd": 62, "qualiti": 41, "quantifi": [31, 36, 62], "quantiti": [37, 39, 41, 47], "quartil": 50, "question": [16, 19, 20, 29, 32, 39, 49, 50, 64, 66, 68, 70], "question_num": 11, "question_word": 20, "quick": [1, 43], "quickli": 0, "quit": 40, "quot": [22, 48, 64], "quotat": [22, 48], "rabbit": 
62, "rain": 41, "rais": [2, 67, 71], "random": 55, "rang": [5, 8, 24, 30, 33, 34, 35, 40, 51, 53, 55, 56, 57], "ranganath": [16, 31, 32, 36, 38, 43, 54, 70], "ranganath2013": 70, "ranganathetal2013_detectingflirt": 16, "rapid": [1, 4], "rare": [34, 35], "rate": [2, 42, 51], "rather": [1, 31, 34, 35, 36, 37, 45, 46, 63], "ratio": [2, 16, 39, 64], "raw": [0, 12, 16, 21, 31, 33, 42, 50, 64], "re": [1, 31, 36, 42, 50, 61], "reach": 42, "read": [0, 1, 2, 16, 21, 29, 33, 61, 62, 64, 65, 66, 67], "read_csv": 1, "read_in_lexicon": 67, "readabl": [11, 33, 64, 70], "reader": 33, "readi": 1, "readili": 62, "readthedoc": [1, 24, 61], "real": [1, 55], "realit": 13, "realli": [31, 36, 50], "reason": [31, 36, 45, 46, 49], "reassur": 49, "reattach": 2, "recal": [1, 47], "recent": [0, 50], "recept": [18, 32, 39, 42, 50, 51, 52, 54, 56, 62, 64], "recogn": [1, 42, 43, 47], "recognit": [0, 2, 39, 64], "recommend": [0, 42, 62], "reddit": [48, 64], "reddit_tag": 11, "redditus": 48, "reduc": [0, 2, 63], "reduce_chunk": 63, "reduct": 2, "redund": [0, 2, 42, 62], "refer": [0, 1, 2, 11, 19, 22, 24, 28, 31, 42, 48, 52, 61, 62, 64, 70], "reflect": [37, 43], "regardless": 1, "regener": [0, 2, 51, 67], "regenerate_vector": [0, 1, 2, 67], "regex": [14, 16, 42, 49, 67], "regist": 37, "regress": 1, "regular": [5, 14, 30, 32, 42, 55, 58, 67], "reichel": [53, 58, 60], "reidl": [4, 13], "reinvent": 62, "rel": [41, 51, 52, 55, 60, 64], "relat": [1, 61, 62, 64], "relationship": 36, "relev": [1, 29, 42, 44, 49, 51, 56, 61, 64, 65], "reli": [31, 34, 35, 36, 69], "reliabl": [33, 42], "remain": [1, 30, 71], "rememb": 1, "remov": [0, 1, 2, 9, 13, 19, 28, 40, 43, 48, 49, 50, 71], "remove_active_us": 9, "remove_unhashable_col": 71, "renam": 1, "repair": [16, 39], "repeat": [60, 71], "repetit": 60, "replac": 19, "report": [1, 61], "repres": [0, 1, 2, 4, 6, 7, 11, 13, 23, 31, 34, 36, 42, 45, 46, 64, 66, 67, 68, 71, 72, 73], "represent": [34, 38, 67], "reproduc": [36, 62], "republican": 1, "request": [32, 50, 
51], "requir": [0, 1, 20, 21, 31, 55, 61, 62, 64, 65, 66, 67, 71], "research": [1, 62], "reserv": 0, "resolv": 62, "resourc": [1, 39, 48, 61, 62], "respect": [1, 2, 12, 31, 36, 37, 69], "respons": [22, 48, 55, 58, 64], "restaur": [34, 56], "restor": 0, "restrict": 71, "result": [40, 55, 65, 72], "retain": [0, 1, 2, 16, 20, 21, 60, 71], "retriev": 50, "retunr": 3, "return": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 32, 43, 49, 50, 51, 55, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "reveal": 62, "revert": 50, "review": 62, "rewrit": 50, "rich": 62, "riedl": [13, 40, 55], "right": [31, 36, 61, 62], "roberta": [0, 1, 39, 42, 52, 56, 61, 64, 67], "robust": 13, "rocklag": [5, 30, 64], "room": 59, "root": [13, 40], "rough": [12, 54], "roughli": 31, "round": [13, 40, 59, 71], "round_num": 1, "row": [0, 1, 2, 9, 13, 25, 37, 40, 59, 63, 68, 71, 72, 73], "rowbotham": 62, "rucker": 5, "rule": [1, 69], "run": [0, 10, 12, 16, 35, 46, 47, 48, 51, 61, 69], "runtim": [1, 35], "ryan": [0, 42], "ryanboyd": 67, "sagepub": [5, 64], "sai": [1, 32, 50, 59], "said": [1, 36, 62], "same": [0, 1, 2, 31, 34, 37, 45, 48, 52, 59, 60, 62, 71], "sampl": [61, 62], "sarcast": 48, "save": [0, 1, 2, 19, 64, 67], "save_featur": 2, "sbert": [0, 1, 28, 31, 34, 35, 36, 45, 46, 64, 65, 67], "scale": [42, 51], "schema": 1, "scheme": 0, "school": [21, 62], "scienc": [29, 39, 62], "scientist": [61, 62], "score": [1, 2, 4, 5, 11, 12, 13, 15, 21, 24, 28, 29, 30, 31, 34, 35, 36, 38, 39, 40, 45, 46, 47, 50, 51, 53, 56, 57, 61, 64, 65, 67, 73], "script": [1, 61], "sea": 1, "seamless": 62, "search": [19, 61], "second": [0, 1, 4, 34, 42, 58, 59], "second_person": 49, "secr": [18, 49, 50, 64], "section": [1, 29, 61], "see": [0, 1, 2, 11, 30, 34, 38, 41, 45, 46, 47, 55, 62, 71], "seek": [5, 62], "seen": 67, "segment": [0, 19], "select": [1, 2, 4, 23, 28, 36, 45, 64, 66, 67, 68, 71, 72, 73], "self": [1, 2, 61], "semant": [31, 34, 35, 41], "semantic_group": [1, 61], 
"send": [1, 37, 55], "sens": [1, 5, 31, 54, 66], "sensibl": 1, "sent": [1, 37, 64], "sentenc": [0, 1, 10, 15, 19, 20, 21, 33, 34, 35, 36, 42, 45, 46, 47, 48, 54, 56, 61, 67], "sentence_pad": 19, "sentence_split": 19, "sentence_to_train": 47, "sentencis": 19, "sentiment": [0, 1, 24, 31, 39, 42, 52, 56, 61, 62, 64, 67], "sentimet": 1, "separ": [1, 2, 19, 34, 42, 51, 67], "septemb": 40, "sequenc": [1, 59], "sequenti": 1, "seri": [12, 16, 23, 28, 42, 71, 73], "serv": 12, "set": [0, 1, 2, 13, 34, 48, 59, 71], "set_self_conv_data": 2, "setup_logg": 71, "sever": [1, 30, 41, 42, 48, 51, 56, 61], "shall": 54, "share": [31, 36, 37], "she": [30, 31, 36], "shift": 34, "shop": 62, "short": [55, 58], "shorter": [13, 40, 41, 42, 43], "should": [0, 1, 2, 4, 14, 23, 28, 29, 31, 36, 47, 48, 54, 61, 62, 64, 66, 67, 68, 69, 71, 72, 73], "shouldn": [31, 36], "show": [1, 37, 61], "showeth": 62, "shown": 2, "shruti": [35, 45, 46, 47, 62], "side": 31, "signal": [45, 55], "signifi": 42, "signific": [1, 61], "silent": 37, "similar": [1, 2, 6, 7, 13, 28, 29, 31, 34, 35, 36, 40, 45, 46, 49, 50, 62, 65], "similarli": [1, 35], "simpl": [0, 1, 16, 19, 42, 61, 62], "simpli": [1, 5, 11, 28, 42, 56, 62], "simplifi": 1, "simplist": 41, "sinc": [1, 32, 41, 71], "singh": 62, "singl": [0, 1, 2, 11, 12, 19, 23, 31, 34, 35, 36, 37, 41, 45, 46, 59, 62, 71, 72], "singular": [12, 41, 64], "site": 16, "situat": 37, "size": [1, 2, 13, 63, 67], "skip": 1, "slightli": [32, 62, 63], "slow": 1, "small": 40, "so": [0, 1, 2, 10, 30, 31, 36, 37, 42, 50, 61, 62, 66, 67], "social": [29, 39, 61, 62], "socsci": 16, "softwar": 62, "sohi": 62, "sol3": 4, "solut": [1, 59], "solv": 62, "some": [0, 1, 11, 17, 29, 32, 34, 35, 37, 41, 61, 63], "somebodi": [31, 36], "someon": [22, 29, 31, 36, 47, 48, 61, 64], "someplac": [31, 36], "someth": 47, "sometim": 1, "somewhat": 35, "soon": 62, "sorri": [16, 32, 50], "sort": [10, 42, 67], "sort_word": 67, "sound": [47, 51], "sourc": [4, 5, 6, 12, 13, 16, 17, 21, 34, 35, 50, 64, 68], 
"space": [34, 40, 42, 67, 71], "spaci": [1, 19, 47, 49, 50, 61], "span": 63, "spars": [1, 32], "speak": [1, 31, 36, 37, 59, 60, 62], "speaker": [0, 1, 2, 6, 8, 9, 25, 31, 34, 35, 37, 38, 42, 45, 46, 61, 66, 71, 72], "speaker_id": [61, 72], "speaker_id_col": [0, 1, 2, 6, 8, 9, 25, 26, 27, 61, 65, 66, 71, 72], "speaker_nicknam": [0, 1, 2, 6, 9, 59, 66], "spearman": [0, 1, 2], "special": [0, 1, 2, 48, 71], "specif": [1, 2, 12, 32, 41, 48, 55, 61, 62, 69, 71], "specifi": [1, 2, 19, 47, 49, 65, 66, 67, 68, 71, 72, 73], "speciifc": 63, "speed": 1, "spend": [51, 62], "spike": 55, "split": [19, 21, 43, 63], "spoke": 59, "spoken": [11, 37], "spread": 55, "squar": [13, 40], "src": 67, "ssrn": 4, "stabl": 40, "stack": 14, "stackoverflow": 68, "stage": [1, 2, 34, 71], "stamp": 55, "standard": [1, 2, 4, 37, 40, 41, 42, 49, 55, 58, 60, 65, 72, 73], "stanford": 70, "start": [15, 19, 20, 22, 23, 50], "start_timestamp_col": 2, "statement": [1, 38, 42, 47, 48, 61, 62, 64], "statist": [1, 2, 65, 66, 68], "statologi": 41, "stdev": [1, 2, 11, 65, 66], "stem": 42, "step": [1, 4, 28, 41, 45, 46, 51], "still": [1, 41, 45, 46], "stochast": 40, "stop": [40, 62], "stopword": [13, 19], "store": [1, 12, 16, 41, 49, 51, 61, 65, 67], "stoword": 42, "str": [2, 3, 4, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 63, 64, 65, 66, 67, 68, 71, 72, 73], "str_to_vec": 67, "str_vec": 67, "straightforward": 29, "strategi": [17, 30, 32, 38, 39, 42, 49, 54, 64], "strategy_extractor": 19, "stream": 35, "strictli": 1, "string": [0, 1, 2, 4, 8, 12, 13, 14, 19, 23, 24, 50, 64, 66, 67, 68, 71, 72, 73], "strongli": [1, 41, 61], "structur": [0, 36, 49], "student": [21, 33], "studi": [1, 34, 62], "style": [1, 31, 36, 59], "sub": [0, 1, 71], "subfold": 1, "subject": [5, 24, 28, 39, 49, 64], "subjunct": 50, "sublist": 28, "submiss": 55, "subpart": [1, 71], "subsequ": [1, 30, 51, 58], "subset": [1, 62], "substanc": 36, "substant": 31, "substanti": 1, "substr": 30, "subtask": 1, "subtract": 
[41, 58], "succe": 62, "success": [0, 1, 4, 31, 36, 43, 55, 58], "suggest": [1, 13, 34, 42, 44, 50], "suit": [62, 64], "suitabl": 2, "sum": [1, 28, 34, 61, 64, 65, 66, 72], "summar": [0, 1, 2, 69], "summari": [2, 65, 66, 72], "summariz": [0, 65], "summarize_featur": 69, "suppl": 6, "support": [1, 15, 42, 61], "suppos": 1, "sure": 30, "swear": 49, "switch": 1, "symbol": 67, "syntax": [1, 32, 61], "system": [2, 59, 64], "t": [0, 1, 15, 29, 31, 36, 42, 45, 49, 54, 61, 62, 67], "tabl": [1, 62], "tag": 39, "take": [1, 4, 5, 9, 14, 25, 29, 31, 34, 37, 39, 42, 55, 61, 65, 67, 71], "taken": [59, 71], "talk": [1, 37, 47, 59, 62], "tandem": [1, 61], "target": 15, "task": [1, 2, 59, 71], "tausczik": [12, 37, 41, 52], "tausczikpennebaker2013": 12, "team": [0, 1, 4, 11, 12, 13, 34, 39, 40, 42, 59, 65], "team_bursti": 4, "team_comm_tool": [1, 61], "teamcommtool": 1, "technic": [29, 39, 61, 62], "teghxgbqdhgaaaaa": 5, "tempor": [0, 2, 55, 58, 64, 71], "temporal_featur": 11, "tend": [1, 34, 60], "term": [1, 28, 59, 67], "termin": [1, 2, 61], "terribl": 51, "test": [13, 33, 47], "text": [0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 28, 32, 33, 36, 42, 48, 55, 62, 64, 67, 71], "text_based_featur": 64, "textblob": [24, 39, 51, 52, 64], "textblob_sentiment_analysi": 11, "than": [0, 1, 11, 13, 31, 34, 35, 36, 37, 40, 41, 45, 46, 54, 60, 62, 63], "thee": 62, "thei": [0, 1, 11, 28, 29, 31, 34, 36, 37, 39, 42, 47, 58, 59, 61, 62, 67], "them": [0, 1, 2, 19, 28, 29, 31, 36, 50, 51, 55, 59, 61, 62, 64, 65, 66, 67], "themselv": [31, 36, 60], "theoret": 35, "theori": [34, 50], "therebi": 0, "therefor": [0, 1, 11, 28, 37, 45, 59, 62, 69], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 21, 23, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 71, 72, 73], "thing": [48, 61], "think": [1, 38, 47], "thorough": [43, 62], "those": [1, 21, 31, 36, 61, 71], "though": 
[34, 42, 50], "thought": [1, 35, 45], "thread": [1, 61], "three": [0, 1, 2, 22, 34, 37, 40, 51, 61, 62, 69, 71], "threshold": [1, 2, 15, 47], "through": [1, 45, 46, 50, 61, 62], "throughout": [31, 35, 36, 40, 45, 46, 55, 63], "tht": 35, "thu": [1, 34, 35, 36, 37, 46, 55, 71], "time": [0, 1, 2, 4, 23, 34, 35, 39, 42, 48, 51, 55, 59, 61, 62, 63, 64, 65, 66, 71], "time_diff": 55, "timediff": 4, "timestamp": [0, 1, 2, 8, 23, 58, 61, 62, 63, 64, 71], "timestamp_col": [0, 1, 2, 8, 61, 63, 64, 65, 71], "timestamp_end": [1, 23, 61], "timestamp_start": [1, 23, 61], "timestamp_unit": [0, 2, 23, 64], "to_datetim": [0, 2], "todai": [34, 35, 41, 43, 45, 46, 47], "todo": 66, "togeth": [0, 62, 66], "token": [16, 19, 39, 49, 54, 64, 67], "token_count": [19, 49], "too": [0, 30, 31, 36, 62], "took": [1, 59], "tool": [1, 61, 62], "toolkit": [0, 1, 11, 42, 45, 46, 55, 62, 65, 66], "top": [1, 50, 59], "topic": [1, 13, 31, 34, 40, 42, 43, 65], "tormala": 5, "total": [0, 1, 3, 12, 16, 25, 31, 34, 36, 37, 41, 44, 53, 59, 60, 61, 62, 63, 64, 66, 72], "touch": [1, 61], "toward": [31, 36, 38, 42, 45, 46], "track": [65, 66], "tradit": 49, "train": [1, 2, 15, 64], "train_spacy_n": 15, "transcript": 0, "transfom": [45, 46], "transform": [1, 31, 34, 35, 36, 51], "transform_utter": 50, "treat": [0, 1, 2, 42, 59, 61], "treat_zero_as_na": [0, 1, 2], "tri": 50, "trivial": [3, 44, 62], "troubl": [1, 61], "true": [0, 1, 2, 37, 61, 63, 65, 66, 67, 71], "truncat": 2, "truth_intensifi": 49, "ttr": 64, "tupl": [0, 1, 2, 15, 19, 64], "turn": [0, 2, 25, 28, 31, 32, 37, 39, 61, 64, 65, 71], "turn_count": 59, "turn_df": 71, "turn_id": 71, "turn_taking_featur": 11, "twice": 63, "twitter": [1, 51, 61], "two": [0, 1, 2, 23, 31, 34, 36, 41, 45, 46, 52, 62, 63, 67], "txt": 19, "type": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 37, 39, 52, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "typic": [1, 34, 40, 41, 42, 52, 60], "u": [0, 1, 2, 22, 31, 36, 48, 49, 58], "uci": 16, 
"uh": [31, 36], "ulrich": 55, "um": [31, 36, 60], "umbrella": [8, 29, 34], "uncertain": [5, 30], "uncertainti": 30, "under": [0, 1, 10, 11, 12, 28, 40], "underli": [1, 61], "underscor": [1, 42, 61], "understand": [0, 33, 39, 43, 48, 58, 61, 62], "understood": 33, "unhash": 71, "uninterrupt": 59, "uniqu": [0, 1, 2, 6, 9, 13, 16, 23, 25, 41, 47, 52, 60, 61, 63, 71], "unit": [0, 2, 23], "univers": 62, "unix": 58, "unless": [31, 36], "unpack": 62, "unpreprocess": 0, "until": [31, 36, 45, 46], "untouch": 1, "unzip": [1, 61], "up": [1, 17, 21, 28, 31, 35, 36, 37, 42, 45, 46, 51, 59, 61, 67, 71], "updat": [1, 9, 40, 54, 61], "upenn": 1, "upgrad": 50, "upload": 13, "upon": 33, "us": [0, 2, 3, 5, 11, 12, 13, 17, 19, 24, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 60, 62, 64, 65, 66, 67, 71], "usag": [0, 21, 24], "use_gpu": [0, 1, 2, 67], "use_time_if_poss": 63, "user": [0, 1, 2, 9, 14, 15, 22, 37, 42, 47, 48, 51, 61, 62, 63, 64, 65, 66, 69, 72], "user_aggreg": [0, 1, 2, 65, 66], "user_column": [0, 1, 2, 65, 66], "user_data": [2, 65, 66], "user_df": 9, "user_level_featur": 2, "user_list": 9, "user_method": [0, 1, 2, 65, 66], "userlevelfeaturescalcul": [2, 66, 69], "usernam": [22, 48], "utf": 1, "util": [1, 12, 21, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "utilti": 62, "utter": [0, 1, 2, 3, 4, 5, 13, 14, 15, 16, 17, 20, 21, 23, 24, 30, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 50, 51, 52, 54, 58, 60, 61, 67], "v": [0, 1, 13, 61], "v0": 0, "valenc": 51, "valid": [1, 2, 55, 67, 71], "valu": [0, 1, 2, 5, 6, 10, 12, 13, 18, 19, 28, 30, 31, 34, 36, 37, 40, 41, 42, 45, 46, 47, 55, 59, 61, 64, 67, 68, 72, 73], "valueerror": [2, 71], "vari": [13, 31, 34, 35], "variabl": [1, 56, 57, 64, 65, 66], "varianc": [1, 2, 8, 34], "variance_in_dd": 11, "variat": [4, 32], "varieti": [42, 62], "variou": [19, 42, 64, 65, 66], "vast": 62, "ve": [0, 31, 36, 50, 61], "vec": 6, "vect_data": [1, 7, 8, 28, 61, 64, 65, 66], 
"vect_path": 67, "vector": [0, 2, 6, 7, 8, 13, 28, 34, 35, 40, 55, 61, 64, 65, 67], "vector_data": [0, 1, 2, 61], "vector_directori": [0, 1, 2, 61, 65], "vein": 45, "verb": [19, 31, 36], "verbal": 32, "veri": [5, 28, 30, 31, 34, 35, 36, 42, 49, 54], "verifi": 2, "verify_timestamp_format": 2, "verit": 62, "version": [0, 1, 12, 14, 21, 28, 31, 40, 42, 50, 51, 61], "versu": [4, 29, 47, 55, 59], "vert": 2, "via": [3, 44], "view": 50, "visit": 41, "voila": 62, "w": [31, 42], "wa": [0, 1, 2, 5, 12, 31, 32, 35, 36, 47, 51, 56, 59, 62, 71], "wai": [0, 1, 2, 29, 30, 31, 32, 34, 49, 50, 54, 56, 57, 61, 62, 66], "waiai": 62, "wait": [4, 55], "walk": 1, "walkthrough": [0, 61, 62], "want": [1, 28, 34, 59, 61, 62, 65, 66, 67], "warn": [1, 50, 71], "watt": [1, 2, 62, 71], "we": [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 15, 16, 18, 24, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 61, 62, 66, 67, 71], "web": 70, "websit": [1, 61], "week": 47, "weight": 66, "weigt": 31, "welcom": 61, "well": [0, 1, 11, 29, 31, 36, 55, 62], "went": 41, "were": [1, 2, 12, 31, 36, 42, 71], "western": 1, "wh": [19, 31, 36], "wh_question": [32, 49, 54], "wharton": 62, "what": [1, 2, 12, 16, 20, 29, 31, 32, 34, 35, 36, 39, 41, 45, 46, 47, 50, 54, 62, 63], "whatev": [1, 31, 36], "wheel": 62, "when": [1, 2, 16, 20, 31, 33, 36, 42, 47, 54, 55, 59, 60, 61, 62, 67, 69, 71], "whenev": 71, "where": [1, 2, 19, 20, 28, 31, 32, 36, 37, 40, 41, 42, 48, 50, 51, 54, 59, 61, 65, 68, 73], "wherea": [31, 34, 35, 36, 43], "wherev": [31, 36], "whether": [1, 2, 10, 16, 19, 32, 37, 38, 41, 43, 47, 57, 58, 62, 63, 64, 67, 71], "which": [0, 1, 2, 3, 4, 5, 7, 9, 12, 13, 15, 16, 18, 25, 28, 31, 34, 35, 36, 37, 38, 40, 41, 42, 51, 53, 54, 55, 56, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 71, 72, 73], "while": [1, 31, 32, 34, 36, 37, 44, 45, 46, 55, 62, 71], "whitespac": 43, "who": [1, 20, 31, 32, 36, 47, 51, 54, 59, 60, 62], "whole": [28, 42, 59, 62, 71], "whom": [31, 36, 54], 
"whose": [1, 31, 36, 54], "why": [20, 29, 31, 36, 54], "wide": 31, "wien": 62, "wiki": [21, 29, 70], "wiki_link": [1, 61], "wikipedia": [21, 33, 37, 70], "williamson": 60, "wish": [1, 2, 18, 28], "within": [0, 1, 2, 8, 11, 16, 28, 30, 31, 34, 35, 36, 41, 45, 46, 52, 55, 59, 60, 62, 63, 64, 68, 71, 73], "within_person_discursive_rang": 11, "within_task": [0, 1, 2, 71], "without": [1, 19, 31, 36, 42, 47, 54, 62, 69], "won": [0, 31, 36, 45], "wonder": 56, "woolei": 4, "woollei": [13, 40, 55], "wooten": 55, "word": [0, 1, 3, 10, 11, 12, 13, 14, 16, 19, 20, 21, 22, 28, 30, 32, 33, 37, 38, 39, 40, 41, 43, 45, 46, 48, 49, 52, 53, 54, 56, 57, 62, 64, 65, 66, 67, 69, 70], "word_mimicri": 11, "word_start": [19, 49], "wordcount": [1, 42], "wordnet": [1, 61], "words_in_lin": 19, "work": [0, 11, 42, 47, 50, 55, 61, 62], "world": 55, "worri": 62, "would": [1, 29, 31, 34, 35, 36, 37, 42, 50, 54, 62], "wouldn": [31, 36], "wow": 50, "wp": 13, "wrap": 42, "write": [2, 29, 60], "www": [12, 13, 18, 41, 64], "x": [4, 46, 68], "xinlan": 62, "yashveer": 62, "ye": 19, "yeah": [31, 36], "yeoman": [18, 49, 50], "yesno_quest": [32, 49, 54], "yet": 48, "ylatau": 12, "you": [0, 1, 2, 11, 24, 29, 31, 36, 37, 42, 43, 47, 50, 59, 61, 62, 69], "your": [0, 29, 31, 32, 36, 37, 50, 59, 61, 62], "your_data": 42, "yourself": [31, 36, 50], "yuluan": 62, "yup": [31, 36], "yuxuan": 62, "z": [12, 39, 49, 51, 64, 73], "z0": 67, "za": 67, "zero": [0, 1, 2, 13, 52], "zhang": 62, "zheng": 62, "zhong": 62, "zhou": 62, "zscore": 41, "zscore_chat": 41, "zscore_chats_and_convers": 69, "zscore_convers": 41, "\u00bc": 47, "\u03c4": 55}, "titles": ["The Basics (Get Started Here!)", "Worked Example", "feature_builder module", "basic_features module", "burstiness module", "certainty module", "discursive_diversity module", "fflow module", "get_all_DD_features module", "get_user_network module", "hedge module", "Features: Technical Documentation", "info_exchange_zscore module", "information_diversity module", 
"lexical_features_v2 module", "named_entity_recognition_features module", "other_lexical_features module", "politeness_features module", "politeness_v2 module", "politeness_v2_helper module", "question_num module", "readability module", "reddit_tags module", "temporal_features module", "textblob_sentiment_analysis module", "turn_taking_features module", "variance_in_DD module", "within_person_discursive_range module", "word_mimicry module", "FEATURE NAME", "Certainty", "Content Word Accommodation", "Conversational Repair", "Dale-Chall Score", "Discursive Diversity", "Forward Flow", "Function Word Accommodation", "Gini Coefficient", "Hedge", "Features: Conceptual Documentation", "Information Diversity", "Information Exchange", "Linguistic Inquiry and Word Count (LIWC) and Other Lexicons", "Message Length", "Message Quantity", "Mimicry (BERT)", "Moving Mimicry", "Named Entity Recognition", "Online Discussion Tags", "Politeness/Receptiveness Markers", "Politeness Strategies", "Sentiment (RoBERTa)", "Positivity Z-Score", "Proportion of First Person Pronouns", "Question (Naive)", "Team Burstiness", "Textblob Polarity", "Textblob Subjectivity", "Time Difference", "Turn Taking Index", "Word Type-Token Ratio", "The Team Communication Toolkit", "Introduction", "assign_chunk_nums module", "calculate_chat_level_features module", "calculate_conversation_level_features module", "calculate_user_level_features module", "check_embeddings module", "gini_coefficient module", "Utilities", "preload_word_lists module", "preprocess module", "summarize_features module", "zscore_chats_and_conversation module"], "titleterms": {"0": 42, "1": 42, "5": 42, "A": 0, "One": 0, "The": [0, 61, 62], "accommod": [31, 36], "addit": 1, "advanc": 1, "aggreg": [1, 11], "analyz": 1, "assign_chunk_num": 63, "assumpt": 0, "base": 11, "basic": [0, 1, 29, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59, 60], "basic_featur": 3, "bert": 45, "bring": 42, 
"bursti": [4, 55], "cach": 1, "calculate_chat_level_featur": 64, "calculate_conversation_level_featur": 65, "calculate_user_level_featur": 66, "caveat": [1, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "certainti": [5, 30], "chall": 33, "chat": [11, 39], "check_embed": 67, "citat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "class": 69, "code": [0, 1], "coeffici": 37, "column": [1, 61], "commun": 61, "conceptu": 39, "configur": 1, "consider": 1, "content": [31, 61], "convers": [1, 11, 32, 39, 62, 69], "count": [42, 59], "cumul": 1, "custom": [1, 42], "customiz": 0, "dale": 33, "data": 1, "declar": 61, "demo": [0, 1], "deprec": 1, "detail": 1, "differ": 58, "directori": 1, "discurs": 34, "discursive_divers": 6, "discuss": 48, "divers": [34, 40], "document": [11, 39, 62], "driver": 69, "entiti": [1, 47], "environ": [1, 61], "exampl": [1, 41, 47], "exchang": 41, "featur": [1, 11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 69], "feature_build": 2, "featurebuild": [1, 61, 62], "fflow": 7, "file": [1, 30, 34, 35, 45, 46, 47, 51], "first": [1, 53], "flow": 35, "forward": 35, "function": [0, 36], "gener": [1, 61, 62], "get": [0, 1, 61, 62], "get_all_dd_featur": 8, "get_user_network": 9, "gini": 37, "gini_coeffici": 68, "gpu": 1, "group": 1, "hedg": [10, 38], "here": 0, "high": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "implement": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "import": [1, 61], "index": 59, "indic": 61, "info_exchange_zscor": 12, "inform": [1, 40, 41, 61], "information_divers": 13, "input": [1, 34], "inquiri": 42, "inspect": [1, 61], "interpret": [29, 30, 31, 32, 33, 
34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "introduct": 62, "intuit": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "kei": 0, "length": 43, "level": [11, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 69], "lexical_features_v2": 14, "lexicon": 42, "light": 0, "linguist": 42, "liwc": 42, "marker": 49, "messag": [43, 44], "mimicri": [45, 46], "modul": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73], "motiv": 62, "move": 46, "naiv": 54, "name": [1, 29, 47, 61], "named_entity_recognition_featur": 15, "new": 42, "note": [1, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 55, 56, 57, 58, 59], "onlin": 48, "other": [42, 69], "other_lexical_featur": 16, "ouput": 34, "our": 62, "output": [1, 30, 35, 45, 46, 47, 51], "overview": 1, "own": 42, "packag": [0, 1, 61], "paramet": [0, 1], "percentag": 1, "person": 53, "pip": [1, 61], "polar": 56, "polit": [49, 50], "politeness_featur": 17, "politeness_v2": 18, "politeness_v2_help": 19, "posit": 52, "preload_word_list": 70, "preprocess": 71, "pronoun": 53, "proport": 53, "quantiti": 44, "question": 54, "question_num": 20, "ratio": 60, "readabl": 21, "recept": 49, "recognit": [1, 47], "recommend": [1, 61], "reddit_tag": 22, "reduc": 1, "redund": 1, "regener": 1, "relat": [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], "repair": 32, "roberta": 51, "run": 1, "sampl": [0, 1], "score": [33, 41, 52], "sentiment": 51, "speaker": [11, 59, 62, 69], "start": [0, 1, 61, 62], "strategi": 50, "subject": 57, "summarize_featur": 72, "tabl": 61, "tag": 48, "take": 59, "team": [55, 61, 62], "technic": 11, "temporal_featur": 23, 
"textblob": [56, 57], "textblob_sentiment_analysi": 24, "time": 58, "token": 60, "toolkit": 61, "touch": 0, "train": 47, "troubleshoot": [1, 61], "turn": [1, 59], "turn_taking_featur": 25, "type": 60, "us": [1, 61], "usag": 1, "user": 11, "util": 69, "utter": [11, 39, 62, 69], "v": 42, "variance_in_dd": 26, "vector": 1, "virtual": [1, 61], "walkthrough": 1, "within_person_discursive_rang": 27, "word": [31, 36, 42, 60], "word_mimicri": 28, "work": 1, "your": [1, 42], "z": [41, 52], "zscore_chats_and_convers": 73}}) \ No newline at end of file diff --git a/docs/build/html/utils/assign_chunk_nums.html b/docs/build/html/utils/assign_chunk_nums.html index c530ec89..6793e966 100644 --- a/docs/build/html/utils/assign_chunk_nums.html +++ b/docs/build/html/utils/assign_chunk_nums.html @@ -5,7 +5,7 @@ assign_chunk_nums module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/utils/calculate_chat_level_features.html b/docs/build/html/utils/calculate_chat_level_features.html index 1874a432..feca6800 100644 --- a/docs/build/html/utils/calculate_chat_level_features.html +++ b/docs/build/html/utils/calculate_chat_level_features.html @@ -5,7 +5,7 @@ calculate_chat_level_features module — Team Communication Toolkit 0.1.1 documentation - + @@ -96,7 +96,7 @@

      calculate_chat_level_features module

      -class utils.calculate_chat_level_features.ChatLevelFeaturesCalculator(chat_data: DataFrame, vect_data: DataFrame, bert_sentiment_data: DataFrame, ner_training: DataFrame, ner_cutoff: int, conversation_id_col: str, message_col: str, timestamp_col: str | tuple[str, str], timestamp_unit: str, custom_liwc_dictionary: dict)
      +class utils.calculate_chat_level_features.ChatLevelFeaturesCalculator(chat_data: DataFrame, vect_data: DataFrame, bert_sentiment_data: DataFrame, ner_training: DataFrame, ner_cutoff: int, conversation_id_col: str, message_col: str, timestamp_col: str | tuple[str, str], timestamp_unit: str, custom_liwc_dictionary: dict, logger: Logger)

      Bases: object

      Initialize variables and objects used by the ChatLevelFeaturesCalculator class.

      This class uses various feature modules to define chat-level features. It reads input data and diff --git a/docs/build/html/utils/calculate_conversation_level_features.html b/docs/build/html/utils/calculate_conversation_level_features.html index b377c9a3..e2a1adca 100644 --- a/docs/build/html/utils/calculate_conversation_level_features.html +++ b/docs/build/html/utils/calculate_conversation_level_features.html @@ -5,7 +5,7 @@ calculate_conversation_level_features module — Team Communication Toolkit 0.1.1 documentation - + @@ -96,7 +96,7 @@

      calculate_conversation_level_features module

      -class utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, conv_data: DataFrame, vect_data: DataFrame, vector_directory: str, conversation_id_col: str, speaker_id_col: str, message_col: str, timestamp_col: str, convo_aggregation: bool, convo_methods: list, convo_columns: list, user_aggregation: bool, user_methods: list, user_columns: list, chat_features: list)
      +class utils.calculate_conversation_level_features.ConversationLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, conv_data: DataFrame, vect_data: DataFrame, vector_directory: str, conversation_id_col: str, speaker_id_col: str, message_col: str, timestamp_col: str, convo_aggregation: bool, convo_methods: list, convo_columns: list, user_aggregation: bool, user_methods: list, user_columns: list, chat_features: list, logger)

      Bases: object

      Initialize variables and objects used by the ConversationLevelFeaturesCalculator class.

      This class uses various feature modules to define conversation-level features. It reads input data and diff --git a/docs/build/html/utils/calculate_user_level_features.html b/docs/build/html/utils/calculate_user_level_features.html index a8a17750..8f91cb2a 100644 --- a/docs/build/html/utils/calculate_user_level_features.html +++ b/docs/build/html/utils/calculate_user_level_features.html @@ -5,7 +5,7 @@ calculate_user_level_features module — Team Communication Toolkit 0.1.1 documentation - + @@ -96,7 +96,7 @@

      calculate_user_level_features module

      -class utils.calculate_user_level_features.UserLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, vect_data: DataFrame, conversation_id_col: str, speaker_id_col: str, user_aggregation: bool, user_methods: list, user_columns: list, chat_features: list)
      +class utils.calculate_user_level_features.UserLevelFeaturesCalculator(chat_data: DataFrame, user_data: DataFrame, vect_data: DataFrame, conversation_id_col: str, speaker_id_col: str, user_aggregation: bool, user_methods: list, user_columns: list, chat_features: list, logger)

      Bases: object

      Initialize variables and objects used by the UserLevelFeaturesCalculator class.

      This class uses various feature modules to define user- (speaker) level features. It reads input data and diff --git a/docs/build/html/utils/check_embeddings.html b/docs/build/html/utils/check_embeddings.html index 76302496..b0809f52 100644 --- a/docs/build/html/utils/check_embeddings.html +++ b/docs/build/html/utils/check_embeddings.html @@ -5,7 +5,7 @@ check_embeddings module — Team Communication Toolkit 0.1.1 documentation - + @@ -111,7 +111,7 @@

      check_embeddings module

      -utils.check_embeddings.check_embeddings(chat_data: DataFrame, vect_path: str, bert_path: str, need_sentence: bool, need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str = 'message')
      +utils.check_embeddings.check_embeddings(chat_data: DataFrame, vect_path: str, bert_path: str, need_sentence: bool, need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str, logger)

      Check if embeddings and required lexicons exist, and generate them if they don’t.

      This function ensures the necessary vector and BERT embeddings are available. It also checks for the presence of certainty and lexicon files, generating them if needed.

      @@ -126,6 +126,7 @@
    • regenerate_vectors (bool, optional) – If true, regenerates the vector data even if it already exists.

    • use_gpu (bool) – If true, will use GPU for embeddings if available; otherwise, will use CPU.

    • message_col (str) – A string representing the column name that should be selected as the message.

    • +
    • logger (logging.Logger) – Logger used to record progress and error messages.

    Returns:
    diff --git a/docs/build/html/utils/gini_coefficient.html b/docs/build/html/utils/gini_coefficient.html index 53487b1b..92d29500 100644 --- a/docs/build/html/utils/gini_coefficient.html +++ b/docs/build/html/utils/gini_coefficient.html @@ -5,7 +5,7 @@ gini_coefficient module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/utils/index.html b/docs/build/html/utils/index.html index 0cecb5d2..9a8ffc51 100644 --- a/docs/build/html/utils/index.html +++ b/docs/build/html/utils/index.html @@ -5,7 +5,7 @@ Utilities — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/utils/preload_word_lists.html b/docs/build/html/utils/preload_word_lists.html index 7b5787a5..d112e22b 100644 --- a/docs/build/html/utils/preload_word_lists.html +++ b/docs/build/html/utils/preload_word_lists.html @@ -5,7 +5,7 @@ preload_word_lists module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/utils/preprocess.html b/docs/build/html/utils/preprocess.html index 1d9afed0..35f3fb4d 100644 --- a/docs/build/html/utils/preprocess.html +++ b/docs/build/html/utils/preprocess.html @@ -5,7 +5,7 @@ preprocess module — Team Communication Toolkit 0.1.1 documentation - + @@ -65,6 +65,7 @@
  • preprocess_text()
  • preprocess_text_lowercase_but_retain_punctuation()
  • remove_unhashable_cols()
  • +
  • setup_logger()
  • summarize_features module
  • @@ -296,6 +297,27 @@ +
    +
    +utils.preprocess.setup_logger(name: str, log_file_path: str, level: int = 20)
    +

    Set up a logger that writes to the given log file.

    +
    +
    Parameters:
    +
      +
    • name (str) – The name of the logger.

    • +
    • log_file_path (str) – Path to the log file, such as ‘./output/logs/feature_builder.log’.

    • +
    • level (int, optional) – Logging level, defaults to logging.INFO. All levels: 0: NOTSET, 10: DEBUG, 20: INFO, 30: WARNING, 40: ERROR, 50: CRITICAL.

    • +
    +
    +
    Returns:
    +

    Configured logger.

    +
    +
    Return type:
    +

    logging.Logger

    +
    +
    +
    +
    diff --git a/docs/build/html/utils/summarize_features.html b/docs/build/html/utils/summarize_features.html index 11d56eec..1469c785 100644 --- a/docs/build/html/utils/summarize_features.html +++ b/docs/build/html/utils/summarize_features.html @@ -5,7 +5,7 @@ summarize_features module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/build/html/utils/zscore_chats_and_conversation.html b/docs/build/html/utils/zscore_chats_and_conversation.html index 5d0691ef..92594553 100644 --- a/docs/build/html/utils/zscore_chats_and_conversation.html +++ b/docs/build/html/utils/zscore_chats_and_conversation.html @@ -5,7 +5,7 @@ zscore_chats_and_conversation module — Team Communication Toolkit 0.1.1 documentation - + diff --git a/docs/source/basics.rst b/docs/source/basics.rst index 1fe30963..5035fa6a 100644 --- a/docs/source/basics.rst +++ b/docs/source/basics.rst @@ -82,25 +82,23 @@ Customizable Parameters Here are some parameters that can be customized. For more details, refer to the :ref:`examples` and :ref:`feature_builder`. -1. ``analyze_first_pct``: Analyze only the first portion (X% of utterances) of a conversation. +1. ``turns``: Combine successive messages by the same individual into a single "turn." -2. ``turns``: Combine successive messages by the same individual into a single "turn." +2. ``cumulative_grouping`` and ``within_task``: Perform nested grouping, analyzing "sub-conversations" within a larger conversation together. -3. ``cumulative_grouping`` and ``within_task``: Perform nested grouping, analyzing "sub-conversations" within a larger conversation together. +3. ``ner_training_df`` and ``ner_cutoff``: Measure the number of named entities in each utterance (see :ref:`named_entity_recognition`). -4. ``ner_training_df`` and ``ner_cutoff``: Measure the number of named entities in each utterance (see :ref:`named_entity_recognition`). +4. ``regenerate_vectors``: Force-regenerate vector data even if it already exists. -5. 
``regenerate_vectors``: Force-regenerate vector data even if it already exists. +5. ``use_gpu``: If set to True and a GPU is available, the package will generate sentence vectors (SBERT) and RoBERTa sentiments using the GPU. Defaults to False (which means the package will only use the CPU). -6. ``use_gpu``: If set to True and a GPU is available, the package will generate sentence vectors (SBERT) and RoBERTa sentiments using the GPU. Defaults to False (which means the package will only use the CPU). +6. ``compute_vectors_from_preprocessed``: Computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation), and this parameter now defaults to False. -7. ``compute_vectors_from_preprocessed``: Computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation), and this parameter now defaults to False. +7. ``custom_liwc_dictionary_path``: Allows the user to "bring their own" LIWC dictionary, and thereby access more recent versions of the LIWC features. Our default version of LIWC is 2007, but users can obtain more recent versions of the lexicon by contacting `Ryan Boyd `_ and `Jamie Pennebaker `_. For more information on using the custom LIWC dictionary, please see :ref:`liwc`. -8. ``custom_liwc_dictionary_path``: Allows the user to "bring their own" LIWC dictionary, and thereby access more recent versions of the LIWC features. Our default version of LIWC is 2007, but users can obtain more recent versions of the lexicon by contacting `Ryan Boyd `_ and `Jamie Pennebaker `_. For more information on using the custom LIWC dictionary, please see :ref:`liwc`. +8. 
**Custom Aggregation of Utterance (Chat)-Level Attributes** (``convo_aggregation``, ``convo_methods``, ``convo_columns``, ``user_aggregation``, ``user_methods``, and ``user_columns``): Customize the ways in which attributes at a lower level of analysis (for example, the number of words in a given message) get aggregated to a higher level of analysis (for example, the total number of words in an entire conversation.) See the Worked Example (:ref:`custom_aggregation`) for details. -9. **Custom Aggregation of Utterance (Chat)-Level Attributes** (``convo_aggregation``, ``convo_methods``, ``convo_columns``, ``user_aggregation``, ``user_methods``, and ``user_columns``): Customize the ways in which attributes at a lower level of analysis (for example, the number of words in a given message) get aggregated to a higher level of analysis (for example, the total number of words in an entire conversation.) See the Worked Example (:ref:`custom_aggregation`) for details. - -Example Usage: +Custom Aggregation Example Usage: .. code-block:: python @@ -115,3 +113,14 @@ To turn off aggregation, set the following parameters to ``False``. By default, convo_aggregation = False user_aggregation = False + +9. **Reducing Redundant Features** (``drop_redundant_columns``, ``corr_thresh``, ``min_na_ratio``, ``min_zero_ratio``, ``min_group_size``, and ``treat_zero_as_na``): **New in v.0.1.8.** The FeatureBuilder can automatically detect groups of highly correlated features and retain only one representative per group, as well as drop columns with too many missing (NA) or zero values. See the Worked Example (:ref:`reducing_redundant_features`) for details. + +Reducing Redundant Features Example Usage: + +.. code-block:: python + + # By default, drop_redundant_columns is False, so redundant columns are only logged, not removed. + # Set it to True to actually drop them from the output. 
+ drop_redundant_columns = True + corr_thresh = 0.9 # Treat features correlated at >= 0.9 (absolute Spearman) as redundant. diff --git a/docs/source/examples.rst b/docs/source/examples.rst index bab74995..14a7d8e6 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -94,7 +94,8 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de grouping_keys = ["batch_num", "round_num"], # NOTE: This example demonstrates grouping. Use conversation_id_col if you have a single conversation identifier. vector_directory = "./vector_data/", output_file_base = "jury_output", - turns = True # NOTE: This defaults to False. Decide whether you want to combine successive 'utterances' by the same person as a 'turn.' + turns = True, # NOTE: This defaults to False. Decide whether you want to combine successive 'utterances' by the same person as a 'turn.' + drop_redundant_columns = True # NOTE: This defaults to False. When True, highly correlated and sparse feature columns are dropped from the output. ) jury_feature_builder.featurize() @@ -241,18 +242,12 @@ Custom Features * You can chose to add any of these features depending on your preference. -Analyzing First Percentage (%) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Analyzing First Percentage (%) [Deprecated] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* The **analyze_first_pct** parameter allows you to "cut off" and separately analyze the first X% of a conversation, in case you wish to separately study different sections of a conversation as it progresses. For example, you may be interested in knowing how the attributes of the first 50% of a conversation differ from the attributes of the entire conversation. Then you can sepcify the following: +.. warning:: - .. code-block:: python - - analyze_first_pct: [0.5, 1.0] - - * This will first analyze the first 50% of each conversation, and then analyze the full conversation. - - * By default, we will simply analyze 100% of each conversation. 
+ **Deprecated as of v.0.1.8.** The **analyze_first_pct** parameter (and its underlying ``get_first_pct_of_chat`` method) has been removed. To analyze only a portion of a conversation, subset your input dataframe before passing it to the FeatureBuilder. Named Entity Recognition ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -350,6 +345,42 @@ Important Notes and Caveats - **[NOTE 2]** Be careful when choosing the "sum" aggregation method, as it is not always appropriate to use the "sum" as an aggregation function. While it is a sensible choice for utterance-level attributes that are *countable* (for example, the total number of words, or other lexical wordcounts), it is a less sensible choice for others (for example, it does not make sense to sum sentiment scores for each utterance in a conversation). Consequently, using the "sum" feature will come with an associated warning. - **[NOTE 3]** In addition to aggregating from the utterance (chat) level to the conversation level, we also aggregate from the speaker (user) level to the conversation level, using the same methods specified in ``convo_methods`` to do so. +.. _reducing_redundant_features: + +Reducing Redundant Features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* **New in v.0.1.8.** The FeatureBuilder generates a large number of features, and many of them are highly correlated with one another or are sparsely populated (containing mostly missing or zero values). The following parameters allow the FeatureBuilder to automatically detect groups of redundant features and retain only one representative per group, as well as drop columns that are dominated by missing (NA) or zero values. + +* The **drop_redundant_columns** parameter is the main "switch" controlling this behavior. It defaults to **False**, meaning that no columns are dropped; instead, the FeatureBuilder simply *logs* which columns it identified as redundant. When set to **True**, the redundant columns are actually removed from the chat-, user-, and conversation-level outputs. 
+ + * Note that only features generated by the FeatureBuilder are considered. Your original input columns (metadata, outcome variables, and other non-numeric columns) are always preserved untouched. + +* The remaining parameters control *how* redundancy is detected: + + * **corr_thresh** (default ``0.9``): the minimum absolute Spearman correlation at which two numeric features are treated as redundant. Features whose correlations meet this threshold are clustered into groups, and one representative (the column with the most valid data and highest variance) is kept from each group. + + * **min_group_size** (default ``2``): the minimum number of correlated columns required to form a redundancy group. + + * **min_na_ratio** (default ``0.3``): numeric columns whose fraction of missing (NA) values exceeds this threshold are flagged (and dropped, if **drop_redundant_columns** is **True**). + + * **min_zero_ratio** (default ``0.9``): numeric columns whose fraction of zero values exceeds this threshold are flagged (and dropped, if **drop_redundant_columns** is **True**). + + * **treat_zero_as_na** (default ``True``): if **True**, zeros are treated as missing values when computing redundancy metrics and selecting the representative column for each group. + +* **Example: dropping correlated features.** To actually drop redundant columns (rather than just logging them), set **drop_redundant_columns = True**. Recall that this parameter **defaults to False**, so you must opt in. The example below keeps only one representative from each group of features correlated at an absolute Spearman correlation of 0.9 or higher: + + .. code-block:: python + + jury_feature_builder = FeatureBuilder( + input_df = juries_df, + grouping_keys = ["batch_num", "round_num"], + output_file_base = "jury_output", + drop_redundant_columns = True, # Defaults to False; set to True to drop redundant columns rather than only logging them. 
+ corr_thresh = 0.9 # Cluster features correlated at >= 0.9 (absolute Spearman) and keep one representative per group. + ) + jury_feature_builder.featurize() + Cumulative Grouping ~~~~~~~~~~~~~~~~~~~~ diff --git a/src/team_comm_tools/feature_builder.py b/src/team_comm_tools/feature_builder.py index fe433b08..05ca5732 100644 --- a/src/team_comm_tools/feature_builder.py +++ b/src/team_comm_tools/feature_builder.py @@ -1,12 +1,11 @@ -# feature_builder.py - # 3rd Party Imports import pandas as pd pd.options.mode.chained_assignment = None import re import numpy as np from pathlib import Path -import time +from datetime import datetime +from time import perf_counter import itertools import warnings @@ -47,9 +46,6 @@ class FeatureBuilder: be calculated. Defaults to an empty list (i.e., no additional features beyond the defaults will be computed). :type custom_features: list, optional - :param analyze_first_pct: Analyze the first X% of the data. This parameter is useful because the - earlier stages of the conversation may be more predictive than the later stages. Defaults to [1.0]. - :type analyze_first_pct: list(float), optional :param turns: If true, collapses multiple "chats"/messages by the same speaker in a row into a single "turn." Defaults to False. :type turns: bool, optional @@ -62,9 +58,9 @@ class FeatureBuilder: :param message_col: A string representing the column name that should be selected as the message. Defaults to "message". :type message_col: str, optional - :param timestamp_col: A string representing the column name that should be selected as the message. + :param timestamp_col: A timestamp column name, or a tuple of (start_timestamp_col, end_timestamp_col). Defaults to "timestamp". - :type timestamp_col: str, optional + :type timestamp_col: str | tuple[str, str], optional :param timestamp_unit: A string representing the unit of the timestamp (if the timestamp is numeric). Defaults to 'ms' (milliseconds). 
Other options (D, s, ms, us, ns) can be found on the Pandas reference: https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html @@ -75,7 +71,7 @@ class FeatureBuilder: :type grouping_keys: list, optional :param cumulative_grouping: If true, uses a cumulative way of grouping chats (looking not just within a single ID, but also at what happened before). NOTE: This parameter and the following one - (`within_grouping`) were created in the context of a multi-stage Empirica game (see: + (`within_task`) were created in the context of a multi-stage Empirica game (see: https://github.com/Watts-Lab/multi-task-empirica). Assumes exactly 3 nested columns at different levels: a High, Mid, and Low level; that are temporally nested. Defaults to False. :type cumulative_grouping: bool, optional @@ -87,7 +83,7 @@ class FeatureBuilder: :type ner_training_df: pd.DataFrame, optional :param ner_cutoff: The cutoff value for the confidence of prediction for each named entity. Defaults to 0.9. - :type ner_cutoff: int + :type ner_cutoff: float :param regenerate_vectors: If true, regenerates vector data even if it already exists. Defaults to False. :type regenerate_vectors: bool, optional :param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (with @@ -113,6 +109,24 @@ class FeatureBuilder: :type user_columns: list, optional :param use_gpu: Specifies whether to use GPU for vert/bert model. Defaults to False. :type use_gpu: bool, optional + :param corr_thresh: Minimum absolute Spearman correlation used to treat two numeric + columns as redundant during summary reduction. Defaults to 0.9. + :type corr_thresh: float, optional + :param min_na_ratio: Threshold for dropping numeric columns with high missing-value + ratio during summary reduction. Defaults to 0.3. + :type min_na_ratio: float, optional + :param min_zero_ratio: Threshold for dropping numeric columns with high zero ratio + during summary reduction. Defaults to 0.9. 
+ :type min_zero_ratio: float, optional + :param min_group_size: Minimum connected-component size to treat a correlated set + of columns as a redundancy group. Defaults to 2. + :type min_group_size: int, optional + :param treat_zero_as_na: If true, zeros are treated as missing values when computing + redundancy metrics and selecting representative columns. Defaults to True. + :type treat_zero_as_na: bool, optional + :param drop_redundant_columns: If true, chat/user/conversation outputs are reduced to + representative numeric columns based on summary statistics. Defaults to False. + :type drop_redundant_columns: bool, optional :return: The FeatureBuilder writes the generated features to files in the specified paths. The progress will be printed in the terminal, indicating completion with "All Done!". :rtype: None @@ -126,7 +140,7 @@ def __init__( output_file_path_user_level: str = None, output_file_path_conv_level: str = None, custom_features: list = [], - analyze_first_pct: list = [1.0], + # analyze_first_pct: list = [1.0], turns: bool = False, conversation_id_col: str = "conversation_num", speaker_id_col: str = "speaker_nickname", @@ -137,7 +151,7 @@ def __init__( cumulative_grouping = False, within_task = False, ner_training_df: pd.DataFrame = None, - ner_cutoff: int = 0.9, + ner_cutoff: float = 0.9, regenerate_vectors: bool = False, compute_vectors_from_preprocessed: bool = False, custom_liwc_dictionary_path: str = '', @@ -147,19 +161,41 @@ def __init__( user_aggregation = True, user_methods: list = ['mean', 'max', 'min', 'stdev'], user_columns: list = None, - use_gpu: bool = False + use_gpu: bool = False, + corr_thresh: float = 0.9, + min_na_ratio: float = 0.3, + min_zero_ratio: float = 0.9, + min_group_size: int = 2, + treat_zero_as_na: bool = True, + drop_redundant_columns: bool = False ) -> None: - # Some error catching + ###### Initialization ###### + # Ensure output_file_base only contains alphanumeric characters and underscores. 
+ self.output_file_base = re.sub('[^A-Za-z0-9_]', '', output_file_base) + if self.output_file_base != output_file_base: + output_file_base = re.sub('[^A-Za-z0-9_]', '', output_file_base) + warnings.warn("WARNING: Special characters detected in output_file_base. These characters have been automatically removed.") + # Determine a human-readable identifier for this run (used in log headers). + # Prefer the distinct output file name; fall back to output_file_base if no path is given. + if output_file_path_chat_level: + self.file_base_name = re.sub(r'(_chat_level|_level_chat|_turn_level|_level_turn|\.csv)', '', output_file_path_chat_level.split("/")[-1]) + else: + self.file_base_name = self.output_file_base + # Set up logging + self.logger = setup_logger(name="feature_builder_logger", log_file_path=f"./{self.output_file_base}/logs/feature_builder.log") + self.summ_logger = setup_logger(name="summary_details_logger", log_file_path=f"./{self.output_file_base}/logs/summary_details.log") + # Check that input is a dataframe if not isinstance(input_df, pd.DataFrame): - raise TypeError(f"Expected a Pandas DataFrame as input_df, but got {type(df).__name__})") - - print("Initializing Featurization...") + self.logger.error(f"Expected a Pandas DataFrame as input_df, but got {type(input_df).__name__}") + raise TypeError(f"Expected a Pandas DataFrame as input_df, but got {type(input_df).__name__}") input_df = input_df.reset_index(drop=True) # reset index to avoid issues with indexing later on - ###### Set all parameters ###### + print("Initializing Featurization...") + self.logger.info(f"=== Start Initializing FeatureBuilder for {self.file_base_name}.csv ===") - assert(all(0 <= x <= 1 for x in analyze_first_pct)) # first, type check that this is a list of numbers between 0 and 1 - self.first_pct = analyze_first_pct # Set first pct of conversation you want to analyze + ###### Set all parameters ###### + # assert(all(0 <= x <= 1 for x in analyze_first_pct)) # first, type check that 
this is a list of numbers between 0 and 1 + # self.first_pct = analyze_first_pct # Set first pct of conversation you want to analyze self.turns = turns self.conversation_id_col = conversation_id_col self.speaker_id_col = speaker_id_col @@ -184,6 +220,12 @@ def __init__( self.user_methods = user_methods self.user_columns = user_columns self.use_gpu = use_gpu + self.corr_thresh = corr_thresh + self.min_na_ratio = min_na_ratio + self.min_zero_ratio = min_zero_ratio + self.min_group_size = min_group_size + self.treat_zero_as_na = treat_zero_as_na + self.drop_redundant_columns = drop_redundant_columns # Defining input and output paths. self.chat_data = input_df.copy() self.orig_data = input_df.copy() @@ -236,7 +278,8 @@ def __init__( invalid_features.add(feat) if invalid_features: invalid_features_str = ', '.join(invalid_features) - warnings.warn(f"WARNING: Invalid custom features provided. Ignoring `{invalid_features_str}`.") + print(f"WARNING: Invalid custom features provided. Ignoring `{invalid_features_str}`.") + self.logger.warning(f"WARNING: Invalid custom features provided. Ignoring `{invalid_features_str}`.") # remove named entities if we didn't pass in the column if self.ner_training is None: self.feature_names.remove("Named Entity Recognition") @@ -320,29 +363,35 @@ def __init__( self.output_file_path_conv_level = output_file_path_conv_level self.output_file_path_user_level = output_file_path_user_level - # Ensure output_file_base is alphanumeric + hyphens - if(re.sub('[^A-Za-z0-9_]', '', output_file_base) != output_file_base): - print('here1') - output_file_base = re.sub('[^A-Za-z0-9_]', '', output_file_base) - warnings.warn("WARNING: Special characters detected in output_file_base. 
These characters have been automatically removed.") + if self.output_file_path_chat_level is None: - self.output_file_path_chat_level = "./" + output_file_base + "_chat_level.csv" + self.output_file_path_chat_level = "./" + self.output_file_base + "_chat_level.csv" if self.output_file_path_conv_level is None: - self.output_file_path_conv_level = "./" + output_file_base + "_conv_level.csv" + self.output_file_path_conv_level = "./" + self.output_file_base + "_conv_level.csv" if self.output_file_path_user_level is None: - self.output_file_path_user_level = "./" + output_file_base + "_user_level.csv" + self.output_file_path_user_level = "./" + self.output_file_base + "_user_level.csv" # Basic error detetection if not bool(self.output_file_path_conv_level) or not bool(re.sub('[^A-Za-z0-9_]', '', self.output_file_path_conv_level)): + self.logger.error("ERROR: Improper conversation-level output file name detected.") raise ValueError("ERROR: Improper conversation-level output file name detected.") if not bool(self.output_file_path_user_level) or not bool(re.sub('[^A-Za-z0-9_]', '', self.output_file_path_user_level)): + self.logger.error("ERROR: Improper user (speaker)-level output file name detected.") raise ValueError("ERROR: Improper user (speaker)-level output file name detected.") # We assume that the base file name is the last item in the output path; we will use this to name the stored vectors. if ('/' not in self.output_file_path_chat_level or '/' not in self.output_file_path_conv_level or '/' not in self.output_file_path_user_level): + self.logger.error( + "We expect you to pass a path in for your output files " + "(output_file_path_chat_level, output_file_path_user_level, and " + "output_file_path_conv_level). If you would like the output to be " + "the current directory, please append './' to the beginning of your " + "filename(s). Your filename should be in the format: " + "path/to/output_name.csv or ./output_name.csv for the current working directory." 
+ ) raise ValueError( "We expect you to pass a path in for your output files " "(output_file_path_chat_level, output_file_path_user_level, and " @@ -355,9 +404,11 @@ def __init__( try: base_file_name = self.output_file_path_chat_level.split("/")[-1] except: + self.logger.error("ERROR: Improper chat-level output file name detected.") raise ValueError("ERROR: Improper chat-level output file name detected.") if not bool(base_file_name) or not bool(re.sub('[^A-Za-z0-9_]', '', base_file_name)): # user didn't specify a file name, or specified one with only nonalphanumeric chars + self.logger.error("ERROR: Improper chat-level output file name detected.") raise ValueError("ERROR: Improper chat-level output file name detected.") try: @@ -396,7 +447,7 @@ def __init__( self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name - check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, self.use_gpu, message_col = self.vector_colname) + check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, self.use_gpu, self.vector_colname, self.logger) if(need_sentence): self.vect_data = pd.read_csv(self.vect_path, encoding='mac_roman') @@ -410,7 +461,9 @@ def __init__( # Deriving the base conversation level dataframe. 
self.conv_data = self.chat_data[[self.conversation_id_col]].drop_duplicates() - + print("Initialization Complete.") + self.logger.info(f"=== Initialization Complete for {self.file_base_name}.csv ===") + self.logger.info("") def set_self_conv_data(self) -> None: @@ -473,9 +526,18 @@ def featurize(self) -> None: :return: None :rtype: None """ - + # Log start of run + start_time = perf_counter() + self.logger.info(f"=== Team Communication Toolkit FeatureBuilder Run initiated for {self.file_base_name}.csv ===") + self.logger.info(f"Featurize started at {datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %Z')}") + num_lines = self.chat_data.shape[0] + num_speakers = self.chat_data[self.speaker_id_col].nunique() + num_conversations = self.chat_data[self.conversation_id_col].nunique() + self.logger.info(f"Data file has {num_lines} lines (chats), {num_speakers} unique speakers, {num_conversations} unique conversations.") + # Step 1. Create chat level features. print("Chat Level Features ...") + self.logger.info("--- Chat Level Features ---") self.chat_level_features() # Things to store before we loop through truncations @@ -487,63 +549,76 @@ def featurize(self) -> None: # Step 2. # Run the chat-level features once, then produce different summaries based on # user specification. 
- for percentage in self.first_pct: + # for percentage in self.first_pct: # Reset chat, conv, and user objects - self.chat_data = self.chat_data_complete - self.user_data = self.chat_data[[self.conversation_id_col, self.speaker_id_col]].drop_duplicates() - self.set_self_conv_data() + self.chat_data = self.chat_data_complete + self.user_data = self.chat_data[[self.conversation_id_col, self.speaker_id_col]].drop_duplicates() + self.set_self_conv_data() - print("Generating features for the first " + str(percentage*100) + "% of messages...") - self.get_first_pct_of_chat(percentage) + # print("Generating features for the first " + str(percentage*100) + "% of messages...") + # self.logger.info("Generating features for the first " + str(percentage*100) + "% of messages...") + # self.get_first_pct_of_chat(percentage) # update output paths based on truncation percentage to save in a designated folder - if percentage != 1: # special folders for when the percentage is partial - self.output_file_path_user_level = re.sub('/output/', '/output/first_' + str(int(percentage*100)) + "/", self.output_file_path_user_level_original) - self.output_file_path_chat_level = re.sub('/output/', '/output/first_' + str(int(percentage*100)) + "/", self.output_file_path_chat_level_original) - self.output_file_path_conv_level = re.sub('/output/', '/output/first_' + str(int(percentage*100)) + "/", self.output_file_path_conv_level_original) - else: - self.output_file_path_user_level = self.output_file_path_user_level_original - self.output_file_path_chat_level = self.output_file_path_chat_level_original - self.output_file_path_conv_level = self.output_file_path_conv_level_original - - # Make it possible to create folders if they don't exist - Path(self.output_file_path_user_level).parent.mkdir(parents=True, exist_ok=True) - Path(self.output_file_path_chat_level).parent.mkdir(parents=True, exist_ok=True) - Path(self.output_file_path_conv_level).parent.mkdir(parents=True, exist_ok=True) + # if 
percentage != 1: # special folders for when the percentage is partial + # self.output_file_path_user_level = re.sub('/output/', '/output/first_' + str(int(percentage*100)) + "/", self.output_file_path_user_level_original) + # self.output_file_path_chat_level = re.sub('/output/', '/output/first_' + str(int(percentage*100)) + "/", self.output_file_path_chat_level_original) + # self.output_file_path_conv_level = re.sub('/output/', '/output/first_' + str(int(percentage*100)) + "/", self.output_file_path_conv_level_original) + # else: + self.output_file_path_user_level = self.output_file_path_user_level_original + self.output_file_path_chat_level = self.output_file_path_chat_level_original + self.output_file_path_conv_level = self.output_file_path_conv_level_original - # Store column names of what we generated, so that the user can easily access them - self.chat_features = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Chat"])) - if self.custom_liwc_dictionary: - self.chat_features += [lexicon_type + "_lexical_wordcount_custom" for lexicon_type in self.custom_liwc_dictionary.keys()] - self.conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Conversation"])) + # Make it possible to create folders if they don't exist + Path(self.output_file_path_user_level).parent.mkdir(parents=True, exist_ok=True) + Path(self.output_file_path_chat_level).parent.mkdir(parents=True, exist_ok=True) + Path(self.output_file_path_conv_level).parent.mkdir(parents=True, exist_ok=True) - # Step 3a. Create user level features. - print("Generating User Level Features ...") - self.user_level_features() - - # Step 3b. Create conversation level features. - print("Generating Conversation Level Features ...") - self.conv_level_features() - self.merge_conv_data_with_original() - - # Step 4. 
Write the features into the files defined in the output paths. - self.conv_features_all = [col for col in self.conv_data if col not in list(self.orig_data.columns) + ["conversation_num", self.message_col + "_original", "message_lower_with_punc"]] # save the column names that we generated! - print("All Done!") + # Store column names of what we generated, so that the user can easily access them + self.chat_features = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Chat"])) + if self.custom_liwc_dictionary: + self.chat_features += [lexicon_type + "_lexical_wordcount_custom" for lexicon_type in self.custom_liwc_dictionary.keys()] + self.conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Conversation"])) - self.save_features() + # Step 3a. Create user level features. + print("Generating User Level Features ...") + self.logger.info("--- User Level Features ---") + self.user_level_features() + + # Step 3b. Create conversation level features. + print("Generating Conversation Level Features ...") + self.logger.info("--- Conversation Level Features ---") + self.conv_level_features() + self.merge_conv_data_with_original() + + # Step 4. Write the features into the files defined in the output paths. + self.conv_features_all = [col for col in self.conv_data if col not in list(self.orig_data.columns) + ["conversation_num", self.message_col + "_original", "message_lower_with_punc"]] # save the column names that we generated! + end_time = perf_counter() + print("All Done!") + self.logger.info(f"=== Featurization Completed for {self.file_base_name}.csv in {end_time - start_time:.2f} seconds! 
===") + self.logger.info("") + + self.logger.info(f"=== Feature Output Summary for {self.file_base_name}.csv (Please see summary_details.log for all the details) ===") + self.logger.info("--- Chat Level ---") + chat_data_reduced = self.generate_summary_stats(self.chat_data) + if self.drop_redundant_columns: + self.chat_data = chat_data_reduced + self.logger.info("--- Conversation Level ---") + conv_data_reduced = self.generate_summary_stats(self.conv_data) + if self.drop_redundant_columns: + self.conv_data = conv_data_reduced + self.logger.info("--- User Level ---") + user_data_reduced = self.generate_summary_stats(self.user_data) + if self.drop_redundant_columns: + self.user_data = user_data_reduced + + self.save_features() def preprocess_chat_data(self) -> None: """ Call all preprocessing modules needed to clean the chat text. This function groups the chat data as specified, verifies column presence, creates original and lowercased columns, preprocesses text, and optionally processes chat turns. - - :param turns: Whether to preprocess naive turns, defaults to False - :type turns: bool, optional - :param col: Columns to preprocess, including conversation_id, speaker_id and message, defaults to None - :type cumulative_grouping: bool, optional - :param within_task: Whether to group within tasks, defaults to False - :type within_task: bool, optional :return: None :rtype: None @@ -626,31 +701,32 @@ def chat_level_features(self) -> None: message_col = self.message_col, timestamp_col = self.timestamp_col, timestamp_unit = self.timestamp_unit, - custom_liwc_dictionary = self.custom_liwc_dictionary + custom_liwc_dictionary = self.custom_liwc_dictionary, + logger = self.logger ) # Calling the driver inside this class to create the features. 
self.chat_data = chat_feature_builder.calculate_chat_level_features(self.feature_methods_chat) # Remove special characters in column names self.chat_data.columns = ["".join(c for c in col if c.isalnum() or c == '_') for col in self.chat_data.columns] - def get_first_pct_of_chat(self, percentage) -> None: - """ - Truncate each conversation to the first X% of rows. + # def get_first_pct_of_chat(self, percentage) -> None: + # """ + # Truncate each conversation to the first X% of rows. - This function groups the chat data by `conversation_num` and retains only - the first X% of rows for each conversation. + # This function groups the chat data by `conversation_num` and retains only + # the first X% of rows for each conversation. - :param percentage: Percentage of rows to retain in each conversation - :type percentage: float + # :param percentage: Percentage of rows to retain in each conversation + # :type percentage: float - :return: None - :rtype: None - """ - chat_grouped = self.chat_data.groupby(self.conversation_id_col) - num_rows_to_retain = pd.DataFrame(np.ceil(chat_grouped.size() * percentage)).reset_index() - chat_truncated = pd.DataFrame() - for conversation_num, num_rows in num_rows_to_retain.itertuples(index=False): - chat_truncated = pd.concat([chat_truncated,chat_grouped.get_group(conversation_num).head(int(num_rows))], ignore_index = True) + # :return: None + # :rtype: None + # """ + # chat_grouped = self.chat_data.groupby(self.conversation_id_col) + # num_rows_to_retain = pd.DataFrame(np.ceil(chat_grouped.size() * percentage)).reset_index() + # chat_truncated = pd.DataFrame() + # for conversation_num, num_rows in num_rows_to_retain.itertuples(index=False): + # chat_truncated = pd.concat([chat_truncated,chat_grouped.get_group(conversation_num).head(int(num_rows))], ignore_index = True) def user_level_features(self) -> None: """ @@ -672,7 +748,8 @@ def user_level_features(self) -> None: user_aggregation = self.user_aggregation, user_methods = 
self.user_methods, user_columns = self.user_columns, - chat_features = self.chat_features + chat_features = self.chat_features, + logger=self.logger ) self.user_data = user_feature_builder.calculate_user_level_features() # Remove special characters in column names @@ -705,6 +782,7 @@ def conv_level_features(self) -> None: user_methods = self.user_methods, user_columns = self.user_columns, chat_features = self.chat_features, + logger=self.logger ) # Calling the driver inside this class to create the features. self.conv_data = conv_feature_builder.calculate_conversation_level_features(self.feature_methods_conv) @@ -727,9 +805,10 @@ def load_custem_liwc_dict(self, custom_liwc_dictionary_path: str) -> dict: """ Load the custom LIWC dictionary from the provided path. - This function reads the custom LIWC dictionary from the provided path and returns the dictionary. + This function reads the custom LIWC dictionary from the provided path and returns + the parsed dictionary. If the path is empty/invalid, returns an empty dict. - :param custom_liwc_dictionary_path: Path to the custom LIWC dictionary file + :param custom_liwc_dictionary_path: Path to the custom LIWC dictionary file. :type custom_liwc_dictionary_path: str :return: Custom LIWC dictionary @@ -760,7 +839,7 @@ def verify_timestamp_format(self, timestamp_col) -> None: Verifies that a column in a DataFrame is composed of values that can be parsed either as datetime or as numeric values suitable for time difference calculations. - :param timestamp_col: The name of the column to verify + :param timestamp_col: The name of the column to verify. :type timestamp_col: str :return: None @@ -786,4 +865,171 @@ def verify_timestamp_format(self, timestamp_col) -> None: raise ValueError( f"Column '{timestamp_col}' contains values that are neither parseable as datetime " f"nor convertible to numeric format." 
- ) \ No newline at end of file + ) + + def log_column_groups(self, groups, max_groups, max_cols_per_group): + """ + Log correlated feature groups to standard and detailed loggers. + + :param groups: Correlated column groups. + :type groups: list[list[str]] + :param max_groups: Maximum number of groups to print to the standard logger. + :type max_groups: int + :param max_cols_per_group: Maximum number of columns shown per group in + the standard logger. + :type max_cols_per_group: int + + :return: None + :rtype: None + """ + total_groups = len(groups) + self.logger.info("Found %s correlated feature groups", total_groups) + for i, group in enumerate(groups[:max_groups], 1): + size = len(group) + if size > max_cols_per_group: + shown = ", ".join(group[:max_cols_per_group]) + self.logger.info( + "[Group %02d | size=%d] %s ... (+%d more)", + i, size, shown, size - max_cols_per_group + ) + else: + self.logger.info( + "[Group %02d | size=%d] %s", + i, size, ", ".join(group) + ) + if total_groups > max_groups: + self.logger.info( + "... (%d more groups not shown)", + total_groups - max_groups + ) + self.summ_logger.info("Full correlated feature groups output:") + for i, group in enumerate(groups, 1): + self.summ_logger.info( + "[Group %02d | size=%d] %s", + i, len(group), ", ".join(group) + ) + + def keep_one_column_per_group(self, df, groups): + """ + Select one representative column from each correlated group. + + Non-grouped columns are preserved, and grouped columns are reduced to the + best-scoring representative based on valid-count and variance. + + :param df: Original dataframe. + :type df: pd.DataFrame + :param groups: Groups of similar columns. + :type groups: list[list[str]] + + :return: Final list of columns to keep. 
+ :rtype: list[str] + """ + grouped_cols = set() + representative_map = {} + kept_group_cols = [] + + for group in groups: + grouped_cols.update(group) + + def score(col): + s = df[col] + if self.treat_zero_as_na: + valid_count = ((~s.isna()) & (s != 0)).sum() + else: + valid_count = s.notna().sum() + + variance = s.replace(0, pd.NA).dropna().var() if self.treat_zero_as_na else s.dropna().var() + variance = 0 if pd.isna(variance) else variance + + return (valid_count, variance) + + best_col = max(group, key=score) + kept_group_cols.append(best_col) + representative_map[best_col] = [c for c in group if c != best_col] + + ungrouped_cols = [c for c in df.columns if c not in grouped_cols] + + kept_columns = ungrouped_cols + kept_group_cols + return kept_columns + + def generate_summary_stats(self, df) -> pd.DataFrame: + """ + Log and optionally reduce redundant numeric feature columns. + + The method identifies numeric columns with high missingness and zero rates, + discovers highly correlated feature groups, and retains one representative + per group. Non-numeric columns are preserved and reattached before return. + + :param df: Input dataframe to summarize and optionally reduce. + :type df: pd.DataFrame + + :return: Dataframe with non-numeric columns plus filtered numeric columns. + :rtype: pd.DataFrame + """ + # only analyze columns generated by the FeatureBuilder (i.e., not in the original input data); + # original columns and non-numeric columns are preserved untouched. + original_cols = [c for c in self.orig_data.columns if c in df.columns] + df_reduced = df.drop(columns=original_cols).select_dtypes(include=[np.number]) + df_other = df.drop(columns=df_reduced.columns) + + # 1. 
list columns with lots of NAs + na_ratio = df_reduced.isna().mean() + cols_with_many_nas = na_ratio[na_ratio > self.min_na_ratio].index.tolist() + drop_str = " were dropped" if self.drop_redundant_columns else "" + self.logger.info( + f"{len(cols_with_many_nas)} columns with more than {self.min_na_ratio * 100}% NA's{drop_str}" + ) + self.summ_logger.info( + f"Columns with more than {self.min_na_ratio * 100}% NA's{drop_str}:\n"\ + + "\n".join(" "*30 + f"- {str(col)}" for col in cols_with_many_nas)) + df_reduced = df_reduced.drop(columns=cols_with_many_nas) + + # 2. list columns with lots of zeros + zero_ratio = (df_reduced == 0).mean() + cols_with_many_zeros = zero_ratio[zero_ratio > self.min_zero_ratio].index.tolist() + self.logger.info( + f"{len(cols_with_many_zeros)} columns with more than {self.min_zero_ratio * 100}% zeros{drop_str}" + ) + self.summ_logger.info( + f"Columns with more than {self.min_zero_ratio * 100}% zeros{drop_str}:\n"\ + + "\n".join(" "*30 + f"- {str(col)}" for col in cols_with_many_zeros)) + df_reduced = df_reduced.drop(columns=cols_with_many_zeros) + + # 3. 
cluster similar columns + if self.treat_zero_as_na: + df_reduced = df_reduced.replace(0, np.nan) + corr = df_reduced.corr(method="spearman", min_periods=max(10, int(0.05 * len(df_reduced)))).abs() + cols = corr.columns.tolist() + graph = {col: set() for col in cols} + for i in range(len(cols)): + for j in range(i + 1, len(cols)): + r = corr.iloc[i, j] + if pd.notna(r) and r >= self.corr_thresh: + a, b = cols[i], cols[j] + graph[a].add(b) + graph[b].add(a) + visited = set() + groups = [] + for col in cols: + if col in visited: + continue + stack = [col] + group = [] + while stack: + node = stack.pop() + if node in visited: + continue + visited.add(node) + group.append(node) + stack.extend(graph[node] - visited) + if len(group) >= self.min_group_size: + groups.append(sorted(group)) + groups.sort(key=lambda g: (-len(g), g)) + self.log_column_groups(groups, max_groups=10, max_cols_per_group=8) + + kept_columns = self.keep_one_column_per_group(df_reduced, groups) + df_reduced = df_reduced[kept_columns] + if self.drop_redundant_columns: + self.logger.info("For each group of similar columns, one representative with the most valid data and highest variance was retained") + df_final = pd.concat([df_other, df_reduced], axis=1) + return df_final \ No newline at end of file diff --git a/src/team_comm_tools/utils/calculate_chat_level_features.py b/src/team_comm_tools/utils/calculate_chat_level_features.py index 051544b7..5709fcc6 100644 --- a/src/team_comm_tools/utils/calculate_chat_level_features.py +++ b/src/team_comm_tools/utils/calculate_chat_level_features.py @@ -19,6 +19,8 @@ # Importing utils from .preload_word_lists import * from .zscore_chats_and_conversation import get_zscore_across_all_chats, get_zscore_across_all_conversations +from time import perf_counter +import logging # Loading bar from tqdm import tqdm @@ -69,7 +71,8 @@ def __init__( message_col: str, timestamp_col: str | tuple[str, str], timestamp_unit: str, - custom_liwc_dictionary: dict + 
custom_liwc_dictionary: dict, + logger: logging.Logger ) -> None: self.chat_data = chat_data @@ -86,6 +89,7 @@ def __init__( self.function_words = get_function_words() # load function words exactly once self.question_words = get_question_words() # load question words exactly once self.first_person = get_first_person_words() # load first person words exactly once + self.logger = logger def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame: """ @@ -99,7 +103,10 @@ def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame: """ for method in tqdm(feature_methods): + start_time = perf_counter() method(self) + end_time = perf_counter() + self.logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.") # Return the input dataset with the chat level features appended (as columns) return self.chat_data diff --git a/src/team_comm_tools/utils/calculate_conversation_level_features.py b/src/team_comm_tools/utils/calculate_conversation_level_features.py index 4e959559..f6ab7e1d 100644 --- a/src/team_comm_tools/utils/calculate_conversation_level_features.py +++ b/src/team_comm_tools/utils/calculate_conversation_level_features.py @@ -8,6 +8,7 @@ from team_comm_tools.utils.gini_coefficient import * from team_comm_tools.utils.preprocess import * from fuzzywuzzy import process +from time import perf_counter class ConversationLevelFeaturesCalculator: """ @@ -57,6 +58,7 @@ def __init__(self, chat_data: pd.DataFrame, user_methods: list, user_columns: list, chat_features: list, + logger ) -> None: # Initializing variables @@ -75,6 +77,7 @@ def __init__(self, chat_data: pd.DataFrame, self.user_methods = user_methods self.user_columns = user_columns self.chat_features = chat_features + self.logger = logger def clean_up_aggregation_method_names(aggregation_method_names:list, method_param:str) -> list: """ @@ -234,7 +237,10 @@ def calculate_conversation_level_features(self, feature_methods: list) -> pd.Dat """ for method in 
feature_methods: + start_time = perf_counter() method(self) + end_time = perf_counter() + self.logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.") return self.conv_data diff --git a/src/team_comm_tools/utils/calculate_user_level_features.py b/src/team_comm_tools/utils/calculate_user_level_features.py index 0d9043c9..cf85b47f 100644 --- a/src/team_comm_tools/utils/calculate_user_level_features.py +++ b/src/team_comm_tools/utils/calculate_user_level_features.py @@ -3,6 +3,7 @@ from team_comm_tools.features.get_user_network import * from team_comm_tools.features.user_centroids import * from fuzzywuzzy import process +from time import perf_counter class UserLevelFeaturesCalculator: """ @@ -38,7 +39,8 @@ def __init__(self, chat_data: pd.DataFrame, user_aggregation: bool, user_methods: list, user_columns: list, - chat_features: list) -> None: + chat_features: list, + logger) -> None: # Initializing variables self.chat_data = chat_data @@ -49,6 +51,7 @@ def __init__(self, chat_data: pd.DataFrame, self.user_aggregation = user_aggregation self.user_methods = user_methods self.chat_features = chat_features + self.logger = logger def clean_up_aggregation_method_names(aggregation_method_names:list) -> list: """ @@ -152,16 +155,28 @@ def calculate_user_level_features(self) -> pd.DataFrame: """ # Get total counts for features that need to be summed, regardless of what the user specified + start_time = perf_counter() self.get_user_level_summed_features() - + end_time = perf_counter() + self.logger.info(f" - user_level_summed_features: {end_time - start_time:.2f} seconds.") + # Get user summary statistics for all features (e.g. 
mean, min, max, stdev) + start_time = perf_counter() self.get_user_level_summary_statistics_features() - + end_time = perf_counter() + self.logger.info(f" - user_level_summary_statistics_features: {end_time - start_time:.2f} seconds.") + # Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range) + start_time = perf_counter() self.get_centroids() + end_time = perf_counter() + self.logger.info(f" - user_centroids: {end_time - start_time:.2f} seconds.") # Get list of other users in a given conversation + start_time = perf_counter() self.get_user_network() + end_time = perf_counter() + self.logger.info(f" - user_network: {end_time - start_time:.2f} seconds.") return self.user_data diff --git a/src/team_comm_tools/utils/check_embeddings.py b/src/team_comm_tools/utils/check_embeddings.py index e5c6eba7..b94631d4 100644 --- a/src/team_comm_tools/utils/check_embeddings.py +++ b/src/team_comm_tools/utils/check_embeddings.py @@ -6,16 +6,16 @@ import warnings from tqdm import tqdm from pathlib import Path +from time import perf_counter -import torch -from sentence_transformers import SentenceTransformer, util +from torch import cuda, no_grad +from sentence_transformers import SentenceTransformer -from transformers import AutoTokenizer -from transformers import AutoModelForSequenceClassification +from transformers import AutoTokenizer, AutoModelForSequenceClassification, logging as hf_logging from scipy.special import softmax -from transformers import logging +import logging -logging.set_verbosity(40) # only log errors +hf_logging.set_verbosity(40) # only log errors MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest" tokenizer = AutoTokenizer.from_pretrained(MODEL) @@ -26,7 +26,7 @@ # Check if embeddings exist def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, need_sentence: bool, - need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str = "message"): + 
need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str, logger): """ Check if embeddings and required lexicons exist, and generate them if they don't. @@ -47,45 +47,73 @@ def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, ne :type regenerate_vectors: bool, optional :param use_gpu: If true, will use GPU for embeddings if available; otherwise, will use CPU. :type use_gpu: bool - :param message_col: A string representing the column name that should be selected as the message. Defaults to "message". - :type message_col: str, optional + :param message_col: A string representing the column name that should be selected as the message. + :type message_col: str + :param logger: Logger for logging messages + :type logger: logging.Logger :return: None :rtype: None """ device = "cpu" if use_gpu: - if torch.cuda.is_available(): + if cuda.is_available(): print("Using GPU for embeddings.") + logger.info("Using GPU for embeddings.") device = "cuda" else: print("GPU not available, using CPU for embeddings.") + logger.info("GPU not available, using CPU for embeddings.") if (regenerate_vectors or (not os.path.isfile(vect_path))) and need_sentence: + logger.info("Generating sentence vectors cache...") + start_time = perf_counter() generate_vect(chat_data, vect_path, message_col, device) + end_time = perf_counter() + logger.info(f"Sentence vectors generation completed in {end_time - start_time:.2f} seconds.") if (regenerate_vectors or (not os.path.isfile(bert_path))) and need_sentiment: + logger.info("Generating BERT vectors cache...") + start_time = perf_counter() generate_bert(chat_data, bert_path, message_col, device) + end_time = perf_counter() + logger.info(f"BERT vectors generation completed in {end_time - start_time:.2f} seconds.") try: vector_df = pd.read_csv(vect_path) # check whether the given vector and bert data matches length of chat data if len(vector_df) != len(chat_data): print("ERROR: The length of the vector data 
does not match the length of the chat data. Regenerating...") + logger.error("The length of the vector data does not match the length of the chat data. Regenerating...") + start_time = perf_counter() generate_vect(chat_data, vect_path, message_col, device) + end_time = perf_counter() + logger.info(f"Sentence vectors regeneration completed in {end_time - start_time:.2f} seconds.") except FileNotFoundError: # It's OK if we don't have the path, if the sentence vectors are not necessary if need_sentence: + logger.error("Vector embeddings file not found. Generating new vector embeddings.") + start_time = perf_counter() generate_vect(chat_data, vect_path, message_col, device) + end_time = perf_counter() + logger.info(f"Sentence vectors generation completed in {end_time - start_time:.2f} seconds.") try: bert_df = pd.read_csv(bert_path) if len(bert_df) != len(chat_data): print("ERROR: The length of the sentiment data does not match the length of the chat data. Regenerating...") + logger.error("The length of the sentiment data does not match the length of the chat data. Regenerating...") # delete the file + start_time = perf_counter() generate_bert(chat_data, bert_path, message_col, device) + end_time = perf_counter() + logger.info(f"BERT vectors regeneration completed in {end_time - start_time:.2f} seconds.") except FileNotFoundError: if need_sentiment: # It's OK if we don't have the path, if the sentiment features are not necessary + logger.error("BERT sentiment file not found. 
Generating new BERT sentiments.") + start_time = perf_counter() generate_bert(chat_data, bert_path, message_col, device) - + end_time = perf_counter() + logger.info(f"BERT vectors generation completed in {end_time - start_time:.2f} seconds.") + # Get the lexicon pickle(s) if they don't exist current_script_directory = Path(__file__).resolve().parent LEXICON_PATH_STATIC = current_script_directory.parent/"features/assets/lexicons_dict.pkl" @@ -365,8 +393,8 @@ def generate_vect(chat_data, output_path, message_col, device, batch_size=64): :type chat_data: pd.DataFrame :param output_path: Path to save the CSV file containing message embeddings. :type output_path: str - :param message_col: A string representing the column name that should be selected as the message. Defaults to "message". - :type message_col: str, optional + :param message_col: A string representing the column name that should be selected as the message. + :type message_col: str :param device: A string representing the device to use for computation, either "cpu" or "cuda". :type device: str :param batch_size: The size of each batch for processing sentiment analysis. Defaults to 64. @@ -403,8 +431,8 @@ def generate_bert(chat_data, output_path, message_col, device, batch_size=64): :type chat_data: pd.DataFrame :param output_path: Path to save the CSV file containing sentiment scores. :type output_path: str - :param message_col: A string representing the column name that should be selected as the message. Defaults to "message". - :type message_col: str, optional + :param message_col: A string representing the column name that should be selected as the message. + :type message_col: str :param batch_size: The size of each batch for processing sentiment analysis. Defaults to 64. :type batch_size: int :raises FileNotFoundError: If the output path is invalid. 
@@ -448,7 +476,7 @@ def get_sentiment(texts, model_bert, device): encoded = tokenizer(non_null_non_empty_texts, padding=True, truncation=True, max_length=512, return_tensors='pt') encoded = {k: v.to(device) for k, v in encoded.items()} - with torch.no_grad(): + with no_grad(): output = model_bert(**encoded) scores = output[0].detach().cpu().numpy() diff --git a/src/team_comm_tools/utils/preprocess.py b/src/team_comm_tools/utils/preprocess.py index 0234eba8..0f04d2f6 100644 --- a/src/team_comm_tools/utils/preprocess.py +++ b/src/team_comm_tools/utils/preprocess.py @@ -1,6 +1,7 @@ import re +import logging import pandas as pd -# import warnings +import os EMOJIS = { "(:", "(;", "):", "/:", ":(", ":)", ":/", ";)", # 8 emojis from LIWC 2017 @@ -296,3 +297,34 @@ def create_cumulative_rows(input_df, conversation_id, timestamp_col, grouping_ke ) return result_df + +def setup_logger(name: str, log_file_path: str, level: int=logging.INFO): + """Set up a logger + + :param name: The name of the logger. + :type name: str + :param log_file_path: Path to the log file, such as './output/logs/feature_builder.log'. + :type log_file_path: str + :param level: Logging level, defaults to logging.INFO. All levels: 0: NOTSET, 10: DEBUG, 20: INFO, 30: WARNING, 40: ERROR, 50: CRITICAL. + :type level: int, optional + :return: Configured logger. 
+ :rtype: logging.Logger + """ + formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s") + log_dir = os.path.dirname(log_file_path) + if log_dir: + os.makedirs(log_dir, exist_ok=True) + logger = logging.getLogger(name) + logger.setLevel(level) + # Prevent “double logging” via parent/root handlers + logger.propagate = False + abs_path = os.path.abspath(log_file_path) + # If a FileHandler for this same file already exists, don’t add another + for h in logger.handlers: + if isinstance(h, logging.FileHandler) and os.path.abspath(getattr(h, "baseFilename", "")) == abs_path: + return logger + handler = logging.FileHandler(log_file_path) + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger \ No newline at end of file diff --git a/tests/data/cleaned_data/test_redundant_columns.csv b/tests/data/cleaned_data/test_redundant_columns.csv new file mode 100644 index 00000000..60034bdf --- /dev/null +++ b/tests/data/cleaned_data/test_redundant_columns.csv @@ -0,0 +1,21 @@ +conversation_num,speaker_nickname,message,timestamp,feat_base,feat_corr,feat_independent,sparse_zeros,mostly_na +1,s0,message number 0,0,1,,1,0,1.0 +1,s1,message number 1,1,2,,20,0,2.0 +1,s2,message number 2,2,3,6.0,2,0,3.0 +1,s0,message number 3,3,4,8.0,19,0,4.0 +1,s1,message number 4,4,5,10.0,3,0,5.0 +1,s2,message number 5,5,6,12.0,18,0,6.0 +1,s0,message number 6,6,7,14.0,4,0,7.0 +1,s1,message number 7,7,8,16.0,17,0,8.0 +1,s2,message number 8,8,9,18.0,5,0,9.0 +1,s0,message number 9,9,10,20.0,16,0,10.0 +1,s1,message number 10,10,11,22.0,6,0, +1,s2,message number 11,11,12,24.0,15,0, +1,s0,message number 12,12,13,26.0,7,0, +1,s1,message number 13,13,14,28.0,14,0, +1,s2,message number 14,14,15,30.0,8,0, +1,s0,message number 15,15,16,32.0,13,0, +1,s1,message number 16,16,17,34.0,9,0, +1,s2,message number 17,17,18,36.0,12,0, +1,s0,message number 18,18,19,38.0,10,0, +1,s1,message number 19,19,20,40.0,11,5, diff --git a/tests/test_drop_redundant_columns.py 
b/tests/test_drop_redundant_columns.py new file mode 100644 index 00000000..05fe7abc --- /dev/null +++ b/tests/test_drop_redundant_columns.py @@ -0,0 +1,153 @@ +""" +file: test_drop_redundant_columns.py +--- +Tests for the redundancy-reduction behavior introduced by the +``drop_redundant_columns`` option on the FeatureBuilder. + +The reducer (``FeatureBuilder.generate_summary_stats``) only inspects the +*generated* numeric feature columns -- columns present in the original input +(``self.orig_data``) are always preserved untouched. The ``drop_redundant_columns`` +flag itself only gates whether ``featurize`` *applies* the reduced frame: + + df_reduced = self.generate_summary_stats(self.chat_data) + if self.drop_redundant_columns: + self.chat_data = df_reduced + +These tests use a fixture dataset (data/cleaned_data/test_redundant_columns.csv) +with deliberately redundant generated-feature columns: + + feat_base monotonic 1..20 + feat_corr = 2 * feat_base (Spearman corr 1.0 with feat_base; has 2 NaNs) + feat_independent uncorrelated zigzag (Spearman ~0.08 with feat_base) + sparse_zeros 95% zeros -> exceeds min_zero_ratio (0.9) + mostly_na 50% NaN -> exceeds min_na_ratio (0.3) +""" + +import logging + +import pandas as pd +import pytest + +from team_comm_tools import FeatureBuilder + +# The four columns that stand in for the original/input data. Everything else in +# the CSV is treated as a generated feature column eligible for reduction. +ORIGINAL_COLS = ["conversation_num", "speaker_nickname", "message", "timestamp"] + +redundant_df = pd.read_csv("data/cleaned_data/test_redundant_columns.csv") + + +def make_reducer(drop_redundant_columns): + """ + Build a FeatureBuilder with only the attributes generate_summary_stats needs, + bypassing the (heavy) __init__/featurize pipeline so the reduction logic can + be exercised in isolation. 
+ """ + fb = FeatureBuilder.__new__(FeatureBuilder) + fb.orig_data = redundant_df[ORIGINAL_COLS].copy() + fb.min_na_ratio = 0.3 + fb.min_zero_ratio = 0.9 + fb.corr_thresh = 0.9 + fb.min_group_size = 2 + fb.treat_zero_as_na = True + fb.drop_redundant_columns = drop_redundant_columns + fb.logger = logging.getLogger("test_drop_redundant") + fb.summ_logger = logging.getLogger("test_drop_redundant_summary") + return fb + + +def test_no_drop_default_preserves_all_columns(): + """ + (a) Default no-drop behavior: with drop_redundant_columns=False, featurize keeps + the original (un-reduced) frame, so every column -- including the redundant ones -- + survives. We mirror the exact gating used in FeatureBuilder.featurize. + """ + fb = make_reducer(drop_redundant_columns=False) + input_df = redundant_df.copy() + + df_reduced = fb.generate_summary_stats(input_df) + kept = df_reduced if fb.drop_redundant_columns else input_df + + # Nothing is dropped when the flag is off. + assert list(kept.columns) == list(redundant_df.columns) + for col in ["feat_base", "feat_corr", "feat_independent", "sparse_zeros", "mostly_na"]: + assert col in kept.columns + + +def test_generate_summary_stats_does_not_mutate_input(): + """ + The reducer must be non-mutating: it returns a new frame and leaves the input + untouched. This is what makes the no-drop path above safe. + """ + fb = make_reducer(drop_redundant_columns=False) + input_df = redundant_df.copy() + cols_before = list(input_df.columns) + + fb.generate_summary_stats(input_df) + + assert list(input_df.columns) == cols_before + + +def test_drop_enabled_reduces_redundant_columns(): + """ + (b) Opt-in drop behavior: with drop_redundant_columns=True, redundant generated + columns are removed -- correlated duplicates, high-zero columns, and high-NA + columns -- while the representative and uncorrelated features are retained. 
+ """ + fb = make_reducer(drop_redundant_columns=True) + + reduced = fb.generate_summary_stats(redundant_df.copy()) + + # Correlated duplicate, sparse, and high-NA columns are dropped. + assert "feat_corr" not in reduced.columns # correlated with feat_base + assert "sparse_zeros" not in reduced.columns # >90% zeros + assert "mostly_na" not in reduced.columns # >30% NA + + # The group representative and the uncorrelated feature are kept. + assert "feat_base" in reduced.columns + assert "feat_independent" in reduced.columns + + +def test_original_columns_always_preserved_when_dropping(): + """ + Even with dropping enabled, original/input columns are never analyzed or removed. + """ + fb = make_reducer(drop_redundant_columns=True) + + reduced = fb.generate_summary_stats(redundant_df.copy()) + + for col in ORIGINAL_COLS: + assert col in reduced.columns + + +def test_correlated_group_keeps_best_representative(): + """ + Within a correlated group, the representative with the most valid data (and + highest variance as a tiebreak) is kept. feat_base has 20 valid values; feat_corr + has 18 (two NaNs), so feat_base must be the survivor and feat_corr dropped. + """ + fb = make_reducer(drop_redundant_columns=True) + + reduced = fb.generate_summary_stats(redundant_df.copy()) + + assert "feat_base" in reduced.columns + assert "feat_corr" not in reduced.columns + + +@pytest.mark.parametrize("sparse_col", ["sparse_zeros", "mostly_na"]) +def test_sparse_columns_removed_only_when_enabled(sparse_col): + """ + Sparse columns (high-zero / high-NA) are removed when dropping is enabled, but + preserved (via the no-drop gating) when it is disabled. 
+ """ + # enabled -> removed + fb_on = make_reducer(drop_redundant_columns=True) + reduced = fb_on.generate_summary_stats(redundant_df.copy()) + assert sparse_col not in reduced.columns + + # disabled -> preserved (featurize keeps the original frame) + fb_off = make_reducer(drop_redundant_columns=False) + input_df = redundant_df.copy() + df_reduced = fb_off.generate_summary_stats(input_df) + kept = df_reduced if fb_off.drop_redundant_columns else input_df + assert sparse_col in kept.columns