Will Silver Wagman
Final Data Science Project Tutorial
CMPS-6790 | Professor Mattei
For this project, I will be working independently to analyze the sentiment of characters' lines throughout the entire series of The Office. I aim to explore how the tone and sentiment evolved over its nine seasons. By examining these changes, I hope to uncover whether shifts in sentiment were connected to the show's popularity or influenced viewer demographics. This analysis will offer insights into how the mood of the series transformed over time and how those shifts may have impacted its audience.
Datasets
I have selected the following datasets to use for this project:
1. The Office Transcripts Dataset (Primary Dataset)
This dataset contains the complete dialogue from every episode of The Office, broken down by season, episode, scene, and character. It includes a total of 59,909 lines, with metadata such as the character speaking the line, and whether the scene is deleted or not.
Question to Answer: How does the sentiment of characters’ lines change from season 1 to the final season? I aim to analyze the progression of sentiment across seasons to determine if the show became more humorous, emotional, or serious over time.
Link to source: https://www.reddit.com/r/datasets/comments/b30288/every_line_from_every_episode_of_the_office/
Direct link to google sheet dataset: https://docs.google.com/spreadsheets/d/18wS5AAwOh8QO95RwHLS95POmSNKA2jjzdt0phrxeAE0/edit?gid=747974534#gid=747974534
2. IMDB Ratings Dataset
I plan on curating my own dataset by scraping the episode ratings for every episode of The Office from IMDB. The dataset I create will include the episode number, season, and the average rating given by viewers.
Question to Answer: Do episodes with higher positive sentiment scores correspond to higher IMDB ratings? I plan on investigating whether episodes with a more positive or negative tone tend to have higher or lower viewer ratings.
3. Viewer Demographic Data
I hope to find viewer demographic data, which would ideally include information about The Office's viewers such as age, gender, location, and other metadata.
Question to Answer: Did changes in sentiment throughout the series affect the viewer demographics? For example, if the sentiment became more positive or emotional, did the show attract a younger audience?
I'm going to start with the first step of my initial ETL process: Extract
I'm going to load the dataset and verify the structure and integrity of the data. This will include checking for missing/null values and ensuring the relevant columns are present and formatted correctly.
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
df = pd.read_csv('/content/drive/MyDrive/the-office-lines-csv.csv')
df.head()
 | id | season | episode | scene | line_text | speaker | deleted |
---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | All right Jim. Your quarterlies look very good... | Michael | False |
1 | 2 | 1 | 1 | 1 | Oh, I told you. I couldn't close it. So... | Jim | False |
2 | 3 | 1 | 1 | 1 | So you've come to the master for guidance? Is ... | Michael | False |
3 | 4 | 1 | 1 | 1 | Actually, you called me in here, but yeah. | Jim | False |
4 | 5 | 1 | 1 | 1 | All right. Well, let me show you how it's done. | Michael | False |
The transcript dataset contains the following columns:
id: Unique identifier for each line.
season: The season of The Office in which the line occurs.
episode: The episode number within that season.
scene: The scene number within the episode.
line_text: The actual line spoken in the episode.
speaker: The character who spoke the line.
deleted: Whether the scene is a deleted scene or not (True/False).
# Check for missing values
missing_values = df.isnull().sum()
missing_values
 | 0 |
---|---|
id | 0 |
season | 0 |
episode | 0 |
scene | 0 |
line_text | 0 |
speaker | 0 |
deleted | 0 |
# Check the data types of each column
data_types = df.dtypes
data_types
 | 0 |
---|---|
id | int64 |
season | int64 |
episode | int64 |
scene | int64 |
line_text | object |
speaker | object |
deleted | bool |
The dataset appears to be clean and shows no missing values in any of the columns. The relevant columns are also present and correctly formatted:
season, episode, and scene are all integers. line_text (the dialogue) and speaker (the character speaking) are stored as strings (object type). deleted is a boolean indicating whether a scene is deleted or not.
# The dataset includes an 'id' column which I'm going to drop because Pandas, by default, creates an index for me.
df.drop(columns=['id'], inplace=True)
df.head()
 | season | episode | scene | line_text | speaker | deleted |
---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | All right Jim. Your quarterlies look very good... | Michael | False |
1 | 1 | 1 | 1 | Oh, I told you. I couldn't close it. So... | Jim | False |
2 | 1 | 1 | 1 | So you've come to the master for guidance? Is ... | Michael | False |
3 | 1 | 1 | 1 | Actually, you called me in here, but yeah. | Jim | False |
4 | 1 | 1 | 1 | All right. Well, let me show you how it's done. | Michael | False |
I'll move on to the next two steps of my initial ETL process: Transform & Load
Since I'm working with text for the transcripts dataset, I'll need to extract some numerical value from the lines. This will involve applying sentiment analysis to the dialogue lines in The Office dataset. I'm going to use the VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analysis tool, which is specifically designed to analyze the sentiment of text data.
# Installing the vaderSentiment library
!pip install vaderSentiment
Requirement already satisfied: vaderSentiment in /usr/local/lib/python3.10/dist-packages (3.3.2)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from vaderSentiment) (2.32.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->vaderSentiment) (2024.8.30)
# Now that the VADER library is installed, I'm going to import the sentiment analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
Now, I will apply sentiment analysis to the line_text column in the dataset. For each line, VADER will assign a sentiment score (ranging from -1 for very negative to +1 for very positive). I will add a new column to the dataset called sentiment to store these scores.
# Initializing the VADER sentiment analyzer.
analyzer = SentimentIntensityAnalyzer()
# Now to apply the sentiment analysis to each line of dialogue
df['sentiment'] = df['line_text'].apply(lambda line: analyzer.polarity_scores(line)["compound"])
df[["line_text", "sentiment"]].head(20)
 | line_text | sentiment |
---|---|---|
0 | All right Jim. Your quarterlies look very good... | 0.4927 |
1 | Oh, I told you. I couldn't close it. So... | 0.0000 |
2 | So you've come to the master for guidance? Is ... | 0.0000 |
3 | Actually, you called me in here, but yeah. | 0.4215 |
4 | All right. Well, let me show you how it's done. | 0.2732 |
5 | [on the phone] Yes, I'd like to speak to your ... | 0.6712 |
6 | I've, uh, I've been at Dunder Mifflin for 12 y... | 0.2225 |
7 | Well. I don't know. | 0.2732 |
8 | If you think she's cute now, you should have s... | 0.4588 |
9 | What? | 0.0000 |
10 | Any messages? | 0.0000 |
11 | Uh, yeah. Just a fax. | 0.2960 |
12 | Oh! Pam, this is from Corporate. How many time... | 0.4574 |
13 | You haven't told me. | 0.0000 |
14 | It's called the wastepaper basket! Look at tha... | 0.0000 |
15 | People say I am the best boss. They go, 'God w... | 0.9745 |
16 | [singing] Shall I play for you? Pa rum pump um... | 0.0516 |
17 | My job is to speak to clients on the phone abo... | -0.4019 |
18 | Whassup! | 0.0000 |
19 | Whassup! I still love that after seven years. | 0.6696 |
Now that each line has a sentiment score, I will group the data by the season column and calculate the average sentiment score for each season. This will help me understand how the sentiment of The Office evolves over time.
# Grouping by season and calculate the average sentiment score for each season.
sentiment_by_season = df.groupby('season')['sentiment'].mean()
print(sentiment_by_season)
season
1    0.164965
2    0.142028
3    0.136380
4    0.140098
5    0.127326
6    0.129451
7    0.137139
8    0.138774
9    0.141170
Name: sentiment, dtype: float64
To better visualize the changes in sentiment across the seasons, I'm going to plot the average sentiment score for each season using a line chart.
import matplotlib.pyplot as plt
plt.plot(sentiment_by_season.index, sentiment_by_season.values, marker='o')
plt.xlabel('Season')
plt.ylabel('Average Sentiment Score')
plt.title('Average Sentiment Across Seasons of The Office')
plt.grid(True)
plt.show()
Interesting Stat: The season with the highest average sentiment is season 1, indicating that this season had the most consistently positive tone in dialogue. Season 5, on the other hand, had the lowest sentiment score, possibly corresponding with a more serious or negative tone.
However, as someone who has watched the show multiple times over, my initial reaction is that these outcomes may not necessarily be reflective of the actual sentiment of the show. For example, The Office generally maintains a balance between humor and more serious moments across the seasons, and the sentiment scores from VADER don't fully capture the nuances of these tones. I have a strong feeling that the VADER sentiment analysis tool may not be the most effective method for my specific purposes in analyzing the sentiment of the dialogue throughout the show. I plan on exploring alternative sentiment analysis tools to achieve more accurate results.
I'm thinking that I will create a Python script that sends the transcribed lines from the 'line_text' column to OpenAI's API along with a prompt asking it to assign a sentiment score to each line. Since there are 59,909 lines in the dataset, I plan on batch processing these API calls.
For the purpose of completing this first milestone, I will proceed with some light EDA using the values from the 'sentiment' column that I created even though I plan on changing the method in which I apply a sentiment score to each line.
# 1. Number of lines per season
lines_per_season = df.groupby('season')['line_text'].count()
# 2. Total number of unique characters
unique_characters = df['speaker'].nunique()
# 3. Top 5 characters with the most lines
top_characters = df['speaker'].value_counts().head(5)
# 4. Average sentiment score by character (top 5)
top_characters_sentiment = df[df['speaker'].isin(top_characters.index)].groupby('speaker')['sentiment'].mean()
# 5. Distribution of sentiment scores (histogram).
import matplotlib.pyplot as plt
plt.hist(df['sentiment'], bins=20, edgecolor='black')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Scores for Lines in The Office')
plt.grid(True)
plt.show()
lines_per_season, unique_characters, top_characters, top_characters_sentiment
(season
1    1996
2    7492
3    7483
4    5642
5    8170
6    7630
7    7302
8    7083
9    7111
Name: line_text, dtype: int64,
797,
speaker
Michael    12137
Dwight      7529
Jim         6814
Pam         5375
Andy        3968
Name: count, dtype: int64,
speaker
Andy       0.154255
Dwight     0.098032
Jim        0.151209
Michael    0.174460
Pam        0.149948
Name: sentiment, dtype: float64)
Light EDA Findings: Here are 4 summary statistics that relate to the questions I am asking of the dataset:
Number of Lines per Season:
Relevance: This indicates the overall length and density of each season, which may correlate with character development, sentiment analysis, and audience engagement.
Total Number of Unique Characters:
Relevance: The diversity in characters is important because it can show how different perspectives and character arcs influence the tone and sentiment throughout the show.
Top 5 Characters by Number of Lines:
Relevance: These are the key characters that contribute most to the dialogue. As such, it would be logical to say that their sentiment scores are particularly relevant to understanding the overall tone of the show.
Average Sentiment Score by Character (in the Top 5):
Relevance: This is interesting as it may reflect character traits.
Michael's generally positive sentiment could align with his role as the comedic center of the show, while Dwight's more neutral sentiment aligns with his intense and often serious personality.
Visual Analysis: Distribution of Sentiment Scores
The histogram (above) shows the distribution of sentiment scores (according to the VADER index) across all lines in The Office.
Relevance: The distribution is centered around neutral sentiment, but we can see that positive sentiment (greater than 0) occurs more frequently than negative sentiment (less than 0). This would align with the comedic nature of the show but also indicates that a wide range of sentiments are present, which could be key to understanding its emotional dynamics. However, I plan on changing the way the sentiment score is applied to the values in the 'line_text' column as I have determined that the VADER index may not be accurate for my use case. Before making my decision, I will first take a look under the hood of VADER to see how it works and what it's doing when it applies a sentiment analysis score to text.
Before starting Milestone 2, I wanted to address my concerns about using the VADER model for sentiment scoring. During Milestone 1, I used this model to get hands-on experience applying a pre-built scoring mechanism; however, I did not conduct a thorough examination of what it does 'under the hood'.
Thus, since the sentiment scoring outcomes I found during my EDA in Milestone 1 didn't seem to be accurate, I decided to conduct an investigation into VADER and its accuracy in this project's use case.
The first thing I did was read through the line_text column alongside my newly created "sentiment" column to judge whether VADER's score for each line seemed accurate.
Note: An instance's "line_text" value may not be a single sentence; it may be a few sentences of text.
I discovered that some of the sentiment scores VADER assigned to text in the line_text column seemed wildly inaccurate. In many cases, VADER assigned positive scores to text that was obviously neutral or even negative (and vice versa). For example, for the text
"Oh! Pam, this is from Corporate. How many times have I told you? There's a special filing cabinet for things from corporate."
VADER assigned a score of 0.46, which is clearly too positive. For reference, here are the typical threshold values used with the VADER model (https://github.com/cjhutto/vaderSentiment); a small sketch of applying these thresholds follows the list:
• positive sentiment: compound score >= 0.05
• neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
• negative sentiment: compound score <= -0.05
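To make these thresholds concrete, here is a minimal sketch (my own helper function, not part of the VADER library) that maps a compound score to one of the three labels:
# Applying the standard VADER thresholds to turn a compound score into a label
def vader_label(compound):
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"
# The "special filing cabinet" line above scored ~0.46, so VADER would label it:
print(vader_label(0.46))  # -> 'positive', even though the line reads as exasperated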
Because I found many instances of sentiment-scoring inaccuracy, I decided against using VADER. Instead, I am going to send each instance's line_text value to OpenAI's API for sentiment scoring. I will detail this in the coming cells, but first, a quick thought:
The VADER model was created to score short snippets of text, specifically from social media. Under the hood, VADER analyzes sentiment using a lexicon-based approach, where each word in the text is matched to a dictionary of sentiment-labeled words with predefined scores. It then combines these scores, adjusting for factors like punctuation, capitalization, and modifiers to capture the overall sentiment intensity of the text. This is probably a decent way to capture sentiment of short text, but since I'm feeding it longer snippets of text (some line_text values are 3+ sentences in length), accuracy seems to decrease. This makes sense because more nuance gets introduced as word count increases and so VADER's dictionary-based approach simply won't do a good job of capturing sentiment in longer sections of text.
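To see this behavior concretely, here is a small sketch comparing VADER's full score breakdown (neg/neu/pos/compound) on a short, emphatic snippet versus a longer, mixed passage; both example sentences are made up for illustration and are not from the transcripts.
# Comparing VADER's full output on short vs. longer, mixed text
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
demo_analyzer = SentimentIntensityAnalyzer()
short_text = "This is GREAT!!!"  # capitalization and exclamation marks boost intensity
long_text = ("Well, the party was fine, I guess. The cake was great, "
             "but things got weird and everyone looked miserable by the end.")
print(demo_analyzer.polarity_scores(short_text))
print(demo_analyzer.polarity_scores(long_text))
# Positive and negative words in the longer passage tend to cancel into one muted
# compound number, which is part of why these scores felt off for multi-sentence dialogue.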
Thus, before I start Milestone 2, I'm going to re-score sentiment for each instance in the transcripts dataset.
For this I will be sending all line_text values to OpenAI's API for sentiment scoring.
Here are the details I'm using:
Model: gpt-4o-2024-08-06
Meta: Structured Outputs functionality — all 59,909 instances of speech need processing, and I need consistently formatted sentiment scores. With structured outputs, I receive the response as JSON, so I don't have to parse through the API's response text and can quickly extract the sentiment scores. Here is the documentation: https://platform.openai.com/docs/guides/structured-outputs/introduction
Meta: I am using batch processing because of API rate limits and because time is a factor. After some experimentation, I found that a batch size of 500 line_text values works well for my purposes.
Meta: I am also using parallel processing since batch processing by itself was too slow. I implemented a progress bar with metadata including a time-to-completion estimate and brought down the estimated time to finish the entire 59,909 lines from 12-ish hours to ~35 minutes.
Meta Meta: I chose to use 4 parallel workers (this number was sort of arbitrary).
I originally wrote the OpenAI API line processing script as a .py file in VSCode. Since I've already saved the output CSV locally and uploaded it to Google Drive, I'll share the code here in the following code cell instead of re-running it in this notebook (as this would take a while...).
Note: I've commented out the code so that I can run the notebook without executing it (it would fail anyway, because the API key was exported locally and isn't available within this environment).
#import pandas as pd
#import time
#from pathlib import Path
#from pydantic import BaseModel
#from openai import OpenAI
#from typing import Optional, List
#import json
#from tqdm import tqdm
#import concurrent.futures
#import numpy as np
#class LineScore(BaseModel):
# line_index: int
# sentiment: float
#class BatchSentimentScore(BaseModel):
# scores: List[LineScore]
#class DialogueProcessor:
# def __init__(self, input_file: str, output_file: str, batch_size: int = 500, n_workers: int = 4):
# self.client = OpenAI()
# self.input_file = input_file
# self.output_file = output_file
# self.batch_size = batch_size
# self.n_workers = n_workers
# def get_batch_sentiment(self, batch_data: tuple) -> dict:
# texts, indices = batch_data
# try:
# # Create a numbered list of texts for the model
# numbered_texts = [f"Text {i}: {text}" for i, text in zip(indices, texts)]
# all_texts = "\n\n".join(numbered_texts)
# completion = self.client.beta.chat.completions.parse(
# model="gpt-4o-2024-08-06",
# messages=[
# {"role": "system", "content": """You are a sentiment analysis expert. Analyze each text and return a sentiment score between -1.0 (most negative) and 1.0 (most positive).
# Use 0.0 for neutral sentiment. Consider context, tone, and subtle emotional nuances in the dialogue.
# You must return a score for every text provided."""},
# {"role": "user", "content": all_texts},
# ],
# response_format=BatchSentimentScore,
# )
# return {score.line_index: score.sentiment for score in completion.choices[0].message.parsed.scores}
# except Exception as e:
# print(f"Error processing batch: {str(e)}")
# return {}
# def process_file(self):
# try:
# print(f"Reading input file: {self.input_file}")
# df = pd.read_csv(self.input_file)
# print(f"Found {len(df)} lines to process")
# # split data into batches.
# indices = list(range(len(df)))
# texts = df["line_text"].tolist()
# # Creating batches
# batch_indices = [indices[i:i + self.batch_size] for i in range(0, len(indices), self.batch_size)]
# batch_texts = [texts[i:i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
# batches = list(zip(batch_texts, batch_indices))
# results = {}
# processed_count = 0
# print(f"Processing {len(batches)} batches with {self.n_workers} workers")
# with concurrent.futures.ThreadPoolExecutor(max_workers=self.n_workers) as executor:
# # Submit all batches to the thread pool
# future_to_batch = {executor.submit(self.get_batch_sentiment, batch): i
# for i, batch in enumerate(batches)}
# # Process completed batches as they finish
# for future in tqdm(concurrent.futures.as_completed(future_to_batch),
# total=len(batches),
# desc="Processing batches"):
# batch_results = future.result()
# results.update(batch_results)
# processed_count += len(batch_results)
# # Save intermediate results every 2000 lines
# if processed_count % 2000 == 0:
# self.save_results(df, results)
# # Now to save the final results
# self.save_results(df, results)
# print("Processing completed successfully")
# except Exception as e:
# print(f"Error during file processing: {str(e)}")
# if 'results' in locals() and 'df' in locals():
# self.save_results(df, results)
# raise
# def save_results(self, df: pd.DataFrame, results: dict):
# try:
# output_df = df.copy()
# output_df["sentiment_score"] = pd.Series(results)
# output_df.to_csv(self.output_file, index=False)
# print(f"Results saved to {self.output_file}")
# except Exception as e:
# print(f"Error saving results: {str(e)}")
#def main():
# # Initializing processor with optimized parameters
# processor = DialogueProcessor(
# input_file="/Users/will5206/Desktop/JUST_TEXT_LINES_THE_OFFICE - the-office-lines-csv.csv",
# output_file="processed-sentiment-lines-59909.csv",
# batch_size=500, # Process 500 lines at once
# n_workers=4 # Use 4 parallel workers
# )
# # Process the file
# processor.process_file()
#if __name__ == "__main__":
# main()
To reiterate, I ran this code locally on my machine and then uploaded the output CSV file to my Google Drive, which I will now load into a pandas dataframe.
The output CSV file contains two columns: line_text (the dialogue) and sentiment_score, the newly created column which now houses the sentiment scores for every line_text value.
# Loading the output file from the openai-api-line-processing script
openai_processed_lines = pd.read_csv("/content/drive/MyDrive/processed-sentiment-lines-59909.csv")
openai_processed_lines.drop("line_text", axis=1, inplace=True)
openai_processed_lines.head()
 | sentiment_score |
---|---|
0 | 0.1 |
1 | -0.8 |
2 | -0.6 |
3 | -0.4 |
4 | -0.2 |
Since I still have the original transcripts dataframe ('df'), I will delete the VADER sentiment score column and replace it with the new sentiment_score column from the newly created openai_processed_lines data frame.
# Dropping the old 'sentiment' column
df.drop("sentiment", axis=1, inplace=True)
# Add 'sentiment_score' from 'openai_processed_lines' to 'df' based on row index
df['sentiment_score'] = openai_processed_lines['sentiment_score']
df.head()
# One thing I forgot to do in Milestone 1 was to remove all rows of scenes that were deleted from the show as indicated
# by a value of 'TRUE' in the 'deleted' column.
df = df[df['deleted'] != True]
Now that we have new sentiment scores, I'll re-run some initial EDA.
# Grouping by season and calculate the average sentiment score for each season.
new_sentiment_by_season = df.groupby('season')['sentiment_score'].mean()
print(new_sentiment_by_season)
season
1    0.044336
2   -0.000909
3    0.012983
4   -0.008380
5   -0.027901
6   -0.021494
7    0.024541
8    0.009646
9    0.024871
Name: sentiment_score, dtype: float64
The mean sentiment values for each season seem to be even more neutral (scores closer to zero) when using OpenAI's API to score sentiment compared to using VADER.
After looking through many lines of dialogue and their respective GPT-4o scores, I found issues similar to those in VADER's scoring.
Ultimately, I think that scoring individual text dialogue values in line_text for sentiment is likely quite difficult to do with high accuracy. I wonder if this is in part because I am scoring dialogue without regard for the context in which it appears. I will save this problem for another day and continue to use the new sentiment scores from OpenAI's API.
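If I do revisit this, one idea would be to score each line together with the few lines that precede it in the same scene. The helper below is purely a hypothetical sketch of how that context string could be built from the existing columns; it is not something I am running for this milestone.
# Hypothetical sketch: build a small context window (previous lines in the same scene)
# that could be sent along with each line for context-aware sentiment scoring.
def build_context(frame, row_idx, n_context=3):
    row = frame.loc[row_idx]
    scene_lines = frame[(frame['season'] == row['season']) &
                        (frame['episode'] == row['episode']) &
                        (frame['scene'] == row['scene'])]
    prior = scene_lines.loc[:row_idx].iloc[:-1].tail(n_context)
    context = " / ".join(f"{s}: {t}" for s, t in zip(prior['speaker'], prior['line_text']))
    return f"[Context: {context}] {row['speaker']}: {row['line_text']}"
# Example usage (index 3 is Jim's "Actually, you called me in here, but yeah."):
# print(build_context(df, 3))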
import matplotlib.pyplot as plt
plt.plot(new_sentiment_by_season.index, new_sentiment_by_season.values, marker='o')
plt.xlabel('Season')
plt.ylabel('Average Sentiment Score')
plt.title('Average Sentiment Across Seasons of The Office')
plt.grid(True)
plt.show()
There seems to be more variation here in comparison to this same graph as produced by VADER's scores.
I'm now going to re-run the rest of my initial EDA, but with the new sentiment_score column values.
# 1. Number of lines per season
lines_per_season = df.groupby('season')['line_text'].count()
# 2. Total number of unique characters
unique_characters = df['speaker'].nunique()
# 3. Top 5 characters with the most lines
top_characters = df['speaker'].value_counts().head(5)
# 4. Average sentiment score by character (top 5)
top_characters_sentiment = df[df['speaker'].isin(top_characters.index)].groupby('speaker')['sentiment_score'].mean()
# 5. Distribution of sentiment scores (histogram).
import matplotlib.pyplot as plt
plt.hist(df['sentiment_score'], bins=20, edgecolor='black')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Scores for Lines in The Office')
plt.grid(True)
plt.show()
lines_per_season, unique_characters, top_characters, top_characters_sentiment
(season
1    1536
2    6051
3    7448
4    5642
5    8170
6    7630
7    7302
8    7083
9    7111
Name: line_text, dtype: int64,
790,
speaker
Michael    11574
Dwight      7167
Jim         6609
Pam         5205
Andy        3968
Name: count, dtype: int64,
speaker
Andy       0.027235
Dwight    -0.048481
Jim        0.045383
Michael   -0.006531
Pam        0.043709
Name: sentiment_score, dtype: float64)
For EDA stats 1-4 from the previous cell, I'm going to create a graph of each for easy visualization.
# 1. Number of lines per season
plt.figure(figsize=(8, 6))
lines_per_season.plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel('Season')
plt.ylabel('Number of Lines')
plt.title('Number of Lines per Season')
plt.grid(axis='y')
plt.show()
# 2. Total number of unique characters (displayed as a simple text output)
print(f"Total number of unique characters: {unique_characters}")
# 3. Top 5 characters with the most lines
plt.figure(figsize=(8, 6))
top_characters.plot(kind='bar', color='salmon', edgecolor='black')
plt.xlabel('Character')
plt.ylabel('Number of Lines')
plt.title('Top 5 Characters with the Most Lines')
plt.grid(axis='y')
plt.show()
# 4. Average sentiment score by character (top 5)
plt.figure(figsize=(8, 6))
top_characters_sentiment.plot(kind='bar', color='lightgreen', edgecolor='black')
plt.xlabel('Character')
plt.ylabel('Average Sentiment Score')
plt.title('Average Sentiment Score by Character (Top 5)')
plt.grid(axis='y')
plt.show()
Total number of unique characters: 790
My comments on these EDA graphs:
Number of lines per season: This looks correct. Season 1 had only a fraction of the number of episodes that the other seasons had.
Total number of unique characters (displayed as a simple text output): This wasn't a graph, rather just an interesting stat.
Top 5 characters with the most lines: As someone who has seen the show, this makes sense. Michael, Dwight, and Jim have the most screen time.
Average sentiment score by character (top 5): Now this is an interesting graph. Dwight's average sentiment score is the lowest (and negative) among the top five characters, and yet he is one of the most liked, if not the most liked, characters in the whole show.
Distribution of Sentiment Scores for Lines in The Office (located in the previous code cell output): It looks like there is a bit more variation in the sentiment score versus frequency graph in comparison to that produced with VADER scores, but the overall trend is very similar.
Since the mean sentiment score by season seems to be pretty neutral throughout all the seasons, I want to take a look at a plot of sentiment score averages by episode.
# Grouping by season and episode, then calculate the mean sentiment score for each episode
sentiment_by_episode = df.groupby(['season', 'episode'])['sentiment_score'].mean().reset_index()
# Plotting the average sentiment score by episode
plt.figure(figsize=(12, 6))
plt.plot(sentiment_by_episode.index, sentiment_by_episode['sentiment_score'], marker='o', linestyle='-', color='blue')
plt.xlabel('Episode Index (by Season)')
plt.ylabel('Average Sentiment Score')
plt.title('Average Sentiment Score by Episode in The Office')
plt.grid(True)
plt.show()
Now that I've finished running EDA on the transcripts dataframe with the new sentiment values assigned to each dialogue instance, I can begin Milestone 2.
# Since we're introducing a new dataset into the mix, I'm going to give 'df', the transcripts dataframe,
# a more specific name to keep the dataframes easy to distinguish
transcripts_df = df
transcripts_df.head()
 | season | episode | scene | line_text | speaker | deleted | sentiment_score |
---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | All right Jim. Your quarterlies look very good... | Michael | False | 0.1 |
1 | 1 | 1 | 1 | Oh, I told you. I couldn't close it. So... | Jim | False | -0.8 |
2 | 1 | 1 | 1 | So you've come to the master for guidance? Is ... | Michael | False | -0.6 |
3 | 1 | 1 | 1 | Actually, you called me in here, but yeah. | Jim | False | -0.4 |
4 | 1 | 1 | 1 | All right. Well, let me show you how it's done. | Michael | False | -0.2 |
For Milestone 2, I'm going to scrape individual episode ratings from IMDB's website.
The IMDb ratings data offers insight into viewer reception and popularity for each episode of The Office across its nine seasons. By analyzing IMDb ratings alongside sentiment data from episode dialogues, we can explore potential relationships between viewer ratings and the emotional tone of the show (note: these relationships would be more accurate if the sentiment scores were more accurate, but alas). For example, I can investigate whether episodes with more positive sentiment scores (based on dialogue) tend to receive higher ratings, suggesting a correlation between upbeat or humorous tones and viewer preference. Additionally, variations in ratings could also reflect audience responses to shifts in character development, storyline focus, or humor style.
In addressing the broader question of how sentiment evolved throughout The Office, IMDb ratings provide a quantitative measure of audience engagement and satisfaction. Understanding if higher-rated episodes coincide with specific emotional tones can provide insights into what resonated most with viewers, potentially explaining why certain seasons or episodes were more impactful or memorable.
After exploring their site, I found a "Ratings" page for the show The Office on the IMDB website at the following link: https://www.imdb.com/title/tt0386676/ratings/
There is a nice table that displays all episodes and their ratings. The rows are the seasons (in order) and the columns are episode numbers (in order).
I will now scrape the site and put the episode ratings into a new pandas dataframe.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# imdb site url containing episode ratings
url = "https://www.imdb.com/title/tt0386676/ratings/"
# Sending request to the website
response = requests.get(url)
response.status_code
403
I'm getting a status code of 403 so I will try specifying an agent header.
# imdb site url containing episode ratings
url = "https://www.imdb.com/title/tt0386676/ratings/"
# headers to mimic a browser request
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"Accept-Language": "en-US,en;q=0.9",
"Referer": "https://www.google.com"
}
# Sending request to the website
response = requests.get(url, headers=headers)
response.status_code
200
This worked! Now to scrape the site.
soup = BeautifulSoup(response.content, 'html.parser')
# Lists to store extracted data
seasons = []
episodes = []
ratings = []
# Find all elements with the `a` tag containing the season, episode, and rating information in `aria-label`
for rating_div in soup.select('td.ratings-heatmap__table-data a[aria-label]'):
# Extract the aria-label attribute
aria_label = rating_div['aria-label']
# Parsing the information: "Season X Episode Y, Rating Z.Z"
parts = aria_label.split(", ")
if len(parts) == 2:
season_episode, rating = parts
season = int(season_episode.split(" ")[1])
episode = int(season_episode.split(" ")[3])
rating_value = float(rating.split(" ")[1])
# Append extracted data to lists
seasons.append(season)
episodes.append(episode)
ratings.append(rating_value)
# Create a DataFrame from the lists
imdb_episode_ratings_df = pd.DataFrame({
'Season': seasons,
'Episode': episodes,
'Rating': ratings
})
# verifying the table - there IS an issue! See next text cell.
imdb_episode_ratings_df.head()
 | Season | Episode | Rating |
---|---|---|---|
0 | 1 | 1 | 8.1 |
1 | 1 | 2 | 7.3 |
2 | 1 | 3 | 7.6 |
3 | 1 | 4 | 7.8 |
4 | 1 | 5 | 8.2 |
After closely examining the scraped table, the structure seems correct, but it did NOT collect all of the data in the table on the IMDB page. I discovered that this is because there is a button (right arrow button) that you have to press in order to see the rest of the ratings since they are cut off. Go to the link and see for yourself (the table is halfway down the page): https://www.imdb.com/title/tt0386676/ratings/
Because of this, I'm going to try to use Selenium to "click" the button for me so I can retrieve the rest of the data.
Update: I tried using Selenium but ran into some issues, so I'm going to manually insert the rest of the data. Yikes.
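For reference, here is a rough sketch of the Selenium approach I attempted (commented out like the API script above, since it isn't run in this notebook). The CSS selector for the right-arrow button is a placeholder rather than IMDb's real class name, and locating that element reliably is where I ran into trouble.
#from selenium import webdriver
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#driver = webdriver.Chrome()
#driver.get("https://www.imdb.com/title/tt0386676/ratings/")
## Keep clicking the right-arrow button until it no longer appears, then grab the full page HTML
#while True:
#    try:
#        next_button = WebDriverWait(driver, 5).until(
#            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.next-arrow"))  # placeholder selector
#        )
#        next_button.click()
#    except Exception:
#        break
#html = driver.page_source
#driver.quit()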
missing_episodes = [
{'Season': 2, 'Episode': 16, 'Rating': 8.1},
{'Season': 2, 'Episode': 17, 'Rating': 8.3},
{'Season': 2, 'Episode': 18, 'Rating': 8.2},
{'Season': 2, 'Episode': 19, 'Rating': 8.0},
{'Season': 2, 'Episode': 20, 'Rating': 8.2},
{'Season': 2, 'Episode': 21, 'Rating': 8.6},
{'Season': 2, 'Episode': 22, 'Rating': 9.3},
{'Season': 3, 'Episode': 16, 'Rating': 8.8},
{'Season': 3, 'Episode': 17, 'Rating': 8.3},
{'Season': 3, 'Episode': 18, 'Rating': 8.9},
{'Season': 3, 'Episode': 19, 'Rating': 8.6},
{'Season': 3, 'Episode': 20, 'Rating': 8.5},
{'Season': 3, 'Episode': 21, 'Rating': 8.6},
{'Season': 3, 'Episode': 22, 'Rating': 9.0},
{'Season': 3, 'Episode': 23, 'Rating': 9.2},
{'Season': 5, 'Episode': 16, 'Rating': 7.8},
{'Season': 5, 'Episode': 17, 'Rating': 8.5},
{'Season': 5, 'Episode': 18, 'Rating': 8.1},
{'Season': 5, 'Episode': 19, 'Rating': 8.2},
{'Season': 5, 'Episode': 20, 'Rating': 8.1},
{'Season': 5, 'Episode': 21, 'Rating': 8.5},
{'Season': 5, 'Episode': 22, 'Rating': 8.5},
{'Season': 5, 'Episode': 23, 'Rating': 9.2},
{'Season': 5, 'Episode': 24, 'Rating': 8.1},
{'Season': 5, 'Episode': 25, 'Rating': 8.7},
{'Season': 5, 'Episode': 26, 'Rating': 8.9},
{'Season': 6, 'Episode': 16, 'Rating': 7.8},
{'Season': 6, 'Episode': 17, 'Rating': 8.3},
{'Season': 6, 'Episode': 18, 'Rating': 8.3},
{'Season': 6, 'Episode': 19, 'Rating': 7.5},
{'Season': 6, 'Episode': 20, 'Rating': 7.6},
{'Season': 6, 'Episode': 21, 'Rating': 8.4},
{'Season': 6, 'Episode': 22, 'Rating': 7.6},
{'Season': 6, 'Episode': 23, 'Rating': 7.7},
{'Season': 6, 'Episode': 24, 'Rating': 8.0},
{'Season': 6, 'Episode': 25, 'Rating': 7.6},
{'Season': 6, 'Episode': 26, 'Rating': 7.8},
{'Season': 7, 'Episode': 16, 'Rating': 9.3},
{'Season': 7, 'Episode': 17, 'Rating': 7.3},
{'Season': 7, 'Episode': 18, 'Rating': 9.3},
{'Season': 7, 'Episode': 19, 'Rating': 7.5},
{'Season': 7, 'Episode': 20, 'Rating': 8.9},
{'Season': 7, 'Episode': 21, 'Rating': 9.8},
{'Season': 7, 'Episode': 22, 'Rating': 7.4},
{'Season': 7, 'Episode': 23, 'Rating': 8.5},
{'Season': 7, 'Episode': 24, 'Rating': 8.5},
{'Season': 8, 'Episode': 16, 'Rating': 7.9},
{'Season': 8, 'Episode': 17, 'Rating': 7.6},
{'Season': 8, 'Episode': 18, 'Rating': 7.6},
{'Season': 8, 'Episode': 19, 'Rating': 6.3},
{'Season': 8, 'Episode': 20, 'Rating': 6.8},
{'Season': 8, 'Episode': 21, 'Rating': 6.7},
{'Season': 8, 'Episode': 22, 'Rating': 6.8},
{'Season': 8, 'Episode': 23, 'Rating': 7.4},
{'Season': 8, 'Episode': 24, 'Rating': 7.5},
{'Season': 9, 'Episode': 16, 'Rating': 7.7},
{'Season': 9, 'Episode': 17, 'Rating': 7.3},
{'Season': 9, 'Episode': 18, 'Rating': 7.7},
{'Season': 9, 'Episode': 19, 'Rating': 7.8},
{'Season': 9, 'Episode': 20, 'Rating': 7.8},
{'Season': 9, 'Episode': 21, 'Rating': 9.0},
{'Season': 9, 'Episode': 22, 'Rating': 9.4},
{'Season': 9, 'Episode': 23, 'Rating': 9.8},
]
# Converting the list of missing episodes to a DataFrame
missing_df = pd.DataFrame(missing_episodes)
# Concatenate the existing and missing data
imdb_episode_ratings_df = pd.concat([imdb_episode_ratings_df, missing_df], ignore_index=True)
# Sort by Season and Episode for a cleaner view.
imdb_episode_ratings_df = imdb_episode_ratings_df.sort_values(by=['Season', 'Episode']).reset_index(drop=True)
imdb_episode_ratings_df.head()
 | Season | Episode | Rating |
---|---|---|---|
0 | 1 | 1 | 8.1 |
1 | 1 | 2 | 7.3 |
2 | 1 | 3 | 7.6 |
3 | 1 | 4 | 7.8 |
4 | 1 | 5 | 8.2 |
# Checking dtypes
imdb_episode_ratings_df.dtypes
 | 0 |
---|---|
Season | int64 |
Episode | int64 |
Rating | float64 |
Now that I have this new episode ratings data table, I'm going to merge it into the main transcripts_df.
# First to make column names match between dataframes for clean merging
transcripts_df.rename(columns={'season': 'Season', 'episode': 'Episode'}, inplace=True)
# Now to merge the transcripts_df with imdb_episode_ratings_df on 'Season' and 'Episode' columns
merged_df = pd.merge(transcripts_df, imdb_episode_ratings_df, on=['Season', 'Episode'], how='left')
# verify the table
merged_df.head()
#merged_df.iloc[644]
#merged_df.iloc[644:680]
 | Season | Episode | scene | line_text | speaker | deleted | sentiment_score | Rating |
---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | All right Jim. Your quarterlies look very good... | Michael | False | 0.1 | 8.1 |
1 | 1 | 1 | 1 | Oh, I told you. I couldn't close it. So... | Jim | False | -0.8 | 8.1 |
2 | 1 | 1 | 1 | So you've come to the master for guidance? Is ... | Michael | False | -0.6 | 8.1 |
3 | 1 | 1 | 1 | Actually, you called me in here, but yeah. | Jim | False | -0.4 | 8.1 |
4 | 1 | 1 | 1 | All right. Well, let me show you how it's done. | Michael | False | -0.2 | 8.1 |
Now that the main transcripts dataframe has both sentiment data as well as IMDb ratings for each episode, I want to start the exploratory data analysis.
I want to start by investigating the relationship between average sentiment score per episode and episode ratings to see if episodes with higher sentiment scores tend to have higher IMDb ratings.
To do this I'm going to calculate the average sentiment score per episode and compare it to the IMDb rating for that episode. This will help to see if there is a correlation between positive sentiment in an episode's lines and its popularity/viewer reception.
# Calculate average sentiment score per episode
average_sentiment_per_episode = merged_df.groupby(['Season', 'Episode']).sentiment_score.mean().reset_index()
average_sentiment_per_episode = average_sentiment_per_episode.rename(columns={'sentiment_score': 'Average_Sentiment_Score'})
# Merge with IMDb ratings
sentiment_rating_df = pd.merge(average_sentiment_per_episode, imdb_episode_ratings_df, on=['Season', 'Episode'])
# Plot Average Sentiment Score vs. IMDb Rating
plt.figure(figsize=(10, 6))
plt.scatter(sentiment_rating_df['Average_Sentiment_Score'], sentiment_rating_df['Rating'], alpha=0.6, color='b')
plt.title('Average Sentiment Score vs. IMDb Episode Rating')
plt.xlabel('Average Sentiment Score')
plt.ylabel('IMDb Rating')
plt.grid(True)
plt.show()
Immediately, I'm not seeing any strong correlation. Let's calculate the correlation.
# Calculate the Pearson correlation coefficient between Average Sentiment Score and IMDb Rating
correlation = sentiment_rating_df['Average_Sentiment_Score'].corr(sentiment_rating_df['Rating'])
correlation
0.0702305298487174
This Pearson correlation of 0.07 suggests only a very weak positive relationship between sentiment scores and IMDb ratings: episodes with higher average sentiment scores tend to have only marginally higher ratings, and the relationship is not strong or particularly predictive.
However, I do want to reiterate that the original sentiment scores are likely not highly accurate — but I'm continuing with the investigation for the purpose of doing this hands-on project.
Given the weak correlation found between sentiment and episode ratings, I want to examine if there might be other patterns or subgroup behaviors within the data that provide context or additional insights into viewer ratings. Rather than abandoning the overall question, I'm going to pivot to consider additional influences on ratings, such as how sentiment and rating patterns may vary across seasons or character-specific contributions to sentiment in high- versus low-rated episodes.
Exploring these might allow us to identify whether particular seasons or key character arcs contribute to notable sentiment shifts or episode ratings, potentially offering deeper context to the relationship between sentiment and viewer ratings.
In this second analysis, I'm going to examine how sentiment and rating patterns vary across seasons. First, I'm going to aggregate the episode-level data to the season level and normalize both the average sentiment score and the average rating to a common scale (between 0 and 1) before plotting them together. This way, we can see relative trends rather than directly comparing raw values.
from sklearn.preprocessing import MinMaxScaler
# Build the season-level table used below: average episode sentiment and average rating per season
seasonal_sentiment_ratings = sentiment_rating_df.groupby('Season').agg(
    avg_sentiment_score=('Average_Sentiment_Score', 'mean'),
    avg_rating=('Rating', 'mean')
)
# Initialize the scaler
scaler = MinMaxScaler()
# Normalize avg_sentiment_score and avg_rating
seasonal_sentiment_ratings[['avg_sentiment_score', 'avg_rating']] = scaler.fit_transform(
    seasonal_sentiment_ratings[['avg_sentiment_score', 'avg_rating']]
)
# Plot the normalized values
plt.figure(figsize=(10, 6))
plt.plot(seasonal_sentiment_ratings.index, seasonal_sentiment_ratings['avg_sentiment_score'], label="Normalized Average Sentiment Score", marker='o')
plt.plot(seasonal_sentiment_ratings.index, seasonal_sentiment_ratings['avg_rating'], label="Normalized Average Rating", marker='o', color="orange")
plt.xlabel("Season")
plt.ylabel("Normalized Score")
plt.title("Normalized Average Sentiment Score and Rating per Season")
plt.legend()
plt.show()
Given that the normalized sentiment and rating scores appear to move similarly across seasons, I'm going to quantify their relationship using correlation on the normalized values. This will provide a measure of the strength and direction of the association between average sentiment and rating trends, even though they were originally on different scales.
# correlation between the normalized avg_sentiment_score and avg_rating
correlation = seasonal_sentiment_ratings['avg_sentiment_score'].corr(seasonal_sentiment_ratings['avg_rating'])
print(f"The correlation between normalized average sentiment scores and ratings is: {correlation}")
The correlation between normalized average sentiment scores and ratings is: -0.23883641950296838
This negative correlation of -0.24 suggests a slight inverse relationship between the sentiment scores and episode ratings across seasons. This implies that as the sentiment score increases, the ratings tend to decrease slightly, though the relationship isn’t strong.
Since these first two explorations revealed only weak correlations between overall sentiment and episode ratings, it's worth examining sentiment at a more granular level. Specifically, I'm going to shift focus to investigate whether the sentiment scores of key characters are more closely related to episode ratings. The rationale here is that individual characters often evoke distinct emotional responses from viewers, and these responses may impact how episodes are rated. By analyzing the average sentiment scores for major characters in each episode and comparing them to ratings, we can explore whether characters' emotional tones might be linked to viewer reception.
For this third analysis, I'll calculate the average sentiment score for a selection of main characters (Michael, Jim, and Dwight) per episode. I'll then plot these averages against episode ratings and compute correlations to assess any potential relationships.
# top 3 characters by line count
top_characters = transcripts_df['speaker'].value_counts().nlargest(3).index
# Filter merged_df to only include these top characters
filtered_df = merged_df[merged_df['speaker'].isin(top_characters)].copy()  # .copy() avoids SettingWithCopyWarning when adding columns below
# normalize sentiment and rating
filtered_df['normalized_sentiment'] = (filtered_df['sentiment_score'] - filtered_df['sentiment_score'].mean()) / filtered_df['sentiment_score'].std()
filtered_df['normalized_rating'] = (filtered_df['Rating'] - filtered_df['Rating'].mean()) / filtered_df['Rating'].std()
# Group by character and season to get average normalized sentiment and rating
character_season_avg = filtered_df.groupby(['speaker', 'Season']).agg({
'normalized_sentiment': 'mean',
'normalized_rating': 'mean'
}).reset_index()
import matplotlib.pyplot as plt
# separate plots for each character
for character in top_characters:
character_data = character_season_avg[character_season_avg['speaker'] == character]
plt.figure(figsize=(10, 6))
plt.plot(character_data['Season'], character_data['normalized_sentiment'], marker='o', label='Sentiment')
plt.plot(character_data['Season'], character_data['normalized_rating'], marker='s', label='Rating', linestyle='--')
plt.xlabel('Season')
plt.ylabel('Normalized Scores')
plt.title(f'Normalized Sentiment and Rating for {character} Over Seasons')
plt.legend()
plt.show()
I'm now going to calculate the correlations between normalized sentiment and ratings for each of the top characters.
character_correlations = {}
# Calculate correlation for each character
for character in top_characters:
character_data = character_season_avg[character_season_avg['speaker'] == character]
# Calculate Pearson correlation between normalized sentiment and normalized rating
correlation = character_data['normalized_sentiment'].corr(character_data['normalized_rating'])
character_correlations[character] = correlation
print(f"Correlation for {character}: {correlation:.2f}")
# results
character_correlations
Correlation for Michael: 0.87
Correlation for Dwight: -0.19
Correlation for Jim: -0.19
{'Michael': 0.8678056678330158, 'Dwight': -0.18761254430475763, 'Jim': -0.19424055657114186}
Michael (0.87): The high positive correlation suggests a strong association between Michael's season-level sentiment and episode ratings. When Michael's sentiment scores are high, episode ratings tend to be higher as well.
Dwight (-0.19) and Jim (-0.19): The weak negative correlations for Dwight and Jim imply little meaningful relationship between their sentiment scores and episode ratings.
Again, I wish I had highly accurate sentiment scores, but alas.
For this fourth analysis, I'm going to focus on understanding the overall distribution and trends within the ratings data to provide insights into what drives episode popularity independently of sentiment.
This includes the following:
Distribution of Ratings Across All Episodes: I'll plot a histogram of episode ratings to see the distribution shape, revealing if ratings are skewed towards higher or lower values.
Seasonal Trends in Ratings: I'll calculate the average rating for each season and plot it over time to observe any overarching trends. This will help to see if ratings generally increased or decreased as the show progressed, independent of character sentiment or dialogue content.
Standard Deviation and Range of Ratings: Summary statistics like the mean, standard deviation, and range of ratings will help quantify the overall consistency or variability in episode popularity.
This will allow us to determine if there are specific patterns in viewer ratings across seasons that could suggest other underlying factors influencing episode reception.
import seaborn as sns
# 1. Distribution of ratings across all episodes
plt.figure(figsize=(10, 6))
sns.histplot(imdb_episode_ratings_df['Rating'], bins=15, kde=True)
plt.title("Distribution of Episode Ratings")
plt.xlabel("Episode Rating")
plt.ylabel("Frequency")
plt.show()
# 2. Seasonal average ratings trend
season_avg_ratings = imdb_episode_ratings_df.groupby('Season')['Rating'].mean()
plt.figure(figsize=(10, 6))
season_avg_ratings.plot(kind='line', marker='o')
plt.title("Average Episode Rating by Season")
plt.xlabel("Season")
plt.ylabel("Average Rating")
plt.show()
# 3. Summary stats for ratings
rating_mean = imdb_episode_ratings_df['Rating'].mean()
rating_std = imdb_episode_ratings_df['Rating'].std()
rating_range = imdb_episode_ratings_df['Rating'].max() - imdb_episode_ratings_df['Rating'].min()
print("Summary Statistics for Episode Ratings:")
print(f"Mean Rating: {rating_mean}")
print(f"Standard Deviation: {rating_std}")
print(f"Range: {rating_range}")
Summary Statistics for Episode Ratings:
Mean Rating: 8.062234042553191
Standard Deviation: 0.6354158253729384
Range: 3.500000000000001
My comments on these summary stats for the ratings data:
Mean Rating: With an average rating of 8.06, we can see that the show consistently held high appeal — this makes sense, as everyone I know loves the show.
Standard Deviation (0.64): This relatively low standard deviation suggests that episode ratings are fairly consistent, without much variability.
Range (3.5): The range shows that the difference between the lowest- and highest-rated episodes is 3.5 points.
Given the strong correlation between Michael's sentiment and episode ratings observed at the seasonal level, a more granular analysis at the episode level could uncover more detailed patterns in how his emotional tone relates to individual episode ratings.
I want to examine the normalized sentiment of Michael's lines for each episode alongside the normalized episode ratings to assess whether fluctuations in his sentiment closely follow or impact the ratings at a finer level. This could reveal whether specific emotional tones or shifts in his character’s sentiment are associated with the most popular episodes, providing insights into his influence on viewer reception episode by episode.
# filtering the data to only include rows where Michael is the speaker
michael_df = merged_df[merged_df['speaker'] == 'Michael']
# grouping by Season and Episode to get the average sentiment score for Michael's lines per episode
michael_episode_sentiment = michael_df.groupby(['Season', 'Episode'])['sentiment_score'].mean().reset_index()
# merging Michael's episode-level sentiment with the overall episode ratings
michael_episode_sentiment = michael_episode_sentiment.merge(imdb_episode_ratings_df, on=['Season', 'Episode'])
# normalize the sentiment and rating scores for comparison.
michael_episode_sentiment['normalized_sentiment'] = (michael_episode_sentiment['sentiment_score'] - michael_episode_sentiment['sentiment_score'].mean()) / michael_episode_sentiment['sentiment_score'].std()
michael_episode_sentiment['normalized_rating'] = (michael_episode_sentiment['Rating'] - michael_episode_sentiment['Rating'].mean()) / michael_episode_sentiment['Rating'].std()
# Michael's normalized sentiment score per episode vs normalized rating plot
plt.figure(figsize=(14, 7))
plt.plot(michael_episode_sentiment['normalized_sentiment'], label='Normalized Sentiment Score (Michael)', color='blue', marker='o')
plt.plot(michael_episode_sentiment['normalized_rating'], label='Normalized Episode Rating', color='green', marker='o')
plt.xlabel('Episode (Chronological)')
plt.ylabel('Normalized Score')
plt.title("Comparison of Michael's Sentiment per Episode and Episode Ratings (Normalized)")
plt.legend()
plt.show()
It's a bit hard to see a correlation from the graph so I'm going to calculate a Pearson correlation between Michael’s normalized sentiment per episode and the normalized episode ratings.
# Pearson correlation between Michael's normalized sentiment and normalized episode ratings
correlation = michael_episode_sentiment['normalized_sentiment'].corr(michael_episode_sentiment['normalized_rating'])
print(f"The Pearson correlation between Michael's normalized sentiment per episode and the normalized episode ratings is: {correlation}")
The Pearson correlation between Michael's normalized sentiment per episode and the normalized episode ratings is: 0.1486110479217692
This positive correlation of 0.15 suggests a slight (and not very strong) relationship between Michael's sentiment per episode and the corresponding episode ratings.
This episode-level correlation shows some consistency with our earlier findings. Ultimately, the relatively low correlation also points to the likelihood that episode ratings are influenced by a broader range of factors beyond just sentiment from Michael’s lines, warranting further multi-character or narrative-focused analyses — Again, I wish I had a more accurate sentiment scoring mechanism because I don't fully trust any of these analyses.
I'm curious to explore whether there is a correlation between the average sentiment score per scene and episode ratings. This could help identify whether certain scenes, perhaps intense or emotionally positive/negative ones, correlate with overall episode popularity. We might also be able to see if episodes whose scenes carry higher sentiment tend to be rated higher overall, which could reveal the impact of specific scene dynamics on viewer reception.
# grouping by Season, Episode, and Scene to calculate the average sentiment score per scene.
scene_sentiment_df = transcripts_df.groupby(['Season', 'Episode', 'scene']).agg({
'sentiment_score': 'mean'
}).reset_index()
# calculating the average scene sentiment per episode.
episode_scene_sentiment_df = scene_sentiment_df.groupby(['Season', 'Episode']).agg({
'sentiment_score': 'mean'
}).rename(columns={'sentiment_score': 'avg_scene_sentiment'}).reset_index()
# merging with the main dataframe containing episode ratings
merged_scene_ratings_df = pd.merge(
imdb_episode_ratings_df,
episode_scene_sentiment_df,
how='inner',
left_on=['Season', 'Episode'],
right_on=['Season', 'Episode']
)
# Calculate the correlation between average scene sentiment and episode rating
scene_rating_correlation = merged_scene_ratings_df['avg_scene_sentiment'].corr(merged_scene_ratings_df['Rating'])
print(f"Correlation between average scene sentiment and episode rating: {scene_rating_correlation}")
Correlation between average scene sentiment and episode rating: 0.1673474050295768
This correlation (0.17) is not strong enough to convince me of any meaningful relationship between average scene sentiment and episode rating.
Another potential correlation I want to explore is average sentiment score by character in each episode in relation to the respective episode ratings. This analysis would help investigate if certain characters, depending on their sentiment in an episode (e.g., Michael, Jim, Dwight, or others), have an impact on the episode’s popularity.
# Filter for main characters (e.g., Michael, Jim, Dwight) to analyze their sentiment individually
key_characters = ['Michael', 'Jim', 'Dwight']
# filter dataframe for only the key characters
character_sentiment_df = transcripts_df[transcripts_df['speaker'].isin(key_characters)]
# finding the average sentiment score for each character per episode
character_avg_sentiment_df = character_sentiment_df.groupby(['Season', 'Episode', 'speaker']).agg({
'sentiment_score': 'mean'
}).reset_index()
# Pivot to create separate columns for each character's sentiment
character_avg_sentiment_pivot = character_avg_sentiment_df.pivot(
index=['Season', 'Episode'],
columns='speaker',
values='sentiment_score'
).reset_index()
# rename columns for clarity
character_avg_sentiment_pivot.columns.name = None # Remove pivoted level name
character_avg_sentiment_pivot = character_avg_sentiment_pivot.rename(
columns={'Michael': 'Michael_sentiment', 'Jim': 'Jim_sentiment', 'Dwight': 'Dwight_sentiment'}
)
# merge with episode ratings dataframe
merged_character_ratings_df = pd.merge(
imdb_episode_ratings_df,
character_avg_sentiment_pivot,
how='inner',
on=['Season', 'Episode']
)
# Calculate correlation for each character's sentiment with episode rating
michael_rating_corr = merged_character_ratings_df['Michael_sentiment'].corr(merged_character_ratings_df['Rating'])
jim_rating_corr = merged_character_ratings_df['Jim_sentiment'].corr(merged_character_ratings_df['Rating'])
dwight_rating_corr = merged_character_ratings_df['Dwight_sentiment'].corr(merged_character_ratings_df['Rating'])
print(f"Correlation between Michael's sentiment and episode rating: {michael_rating_corr}")
print(f"Correlation between Jim's sentiment and episode rating: {jim_rating_corr}")
print(f"Correlation between Dwight's sentiment and episode rating: {dwight_rating_corr}")
Correlation between Michael's sentiment and episode rating: 0.1486110479217692
Correlation between Jim's sentiment and episode rating: 0.029158344566363167
Correlation between Dwight's sentiment and episode rating: 0.07765156230632707
These correlations provide a nuanced view of how each character's sentiment might contribute to episode popularity. All three are weak, but Michael's sentiment (about 0.15) shows a noticeably stronger correlation with episode ratings than Jim's (0.03) or Dwight's (0.08).
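To eyeball these relationships rather than rely on a single number, I could plot each character's per-episode sentiment against the episode rating. This is a sketch using matplotlib and the merged_character_ratings_df built above.
import matplotlib.pyplot as plt
# one scatter panel per character: sentiment on x, episode rating on y
fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
for ax, col in zip(axes, ['Michael_sentiment', 'Jim_sentiment', 'Dwight_sentiment']):
    ax.scatter(merged_character_ratings_df[col], merged_character_ratings_df['Rating'], alpha=0.5)
    ax.set_xlabel(col)
    ax.set_title(col.replace('_sentiment', ''))
axes[0].set_ylabel('Episode rating')
plt.tight_layout()
plt.show()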
Model Idea 1: Ensemble Voting Regressor to Predict Episode Ratings Based on Michael’s Sentiment
This model would aim to assess Michael Scott's influence on episode ratings by using an ensemble approach to predict ratings based on the sentiment of his lines in each episode. Since we’ve observed a moderate correlation between Michael’s sentiment and episode ratings, the ensemble model could help capture potential nonlinear and complex interactions between Michael’s sentiment and ratings.
An Ensemble Voting Regressor could be used to predict episode ratings by combining models like linear regression, decision trees, and support vector regression. By aggregating predictions from these different models, the ensemble method can capture a range of patterns in the data that each model might individually overlook.
Features of the model:
• Michael’s normalized sentiment per episode
• Episode season and episode number (for temporal context)
• Additional features, such as average scene sentiment or interactions with other main characters like Jim and Dwight, to capture more of the episode’s dynamics.
Reason for this model choice: This would allow me to test if Michael's sentiment, especially when combined with other episode features, provides a consistent impact across models and improves predictive accuracy. This model could help confirm if Michael’s influence on sentiment and ratings aligns with audience reception and if his character has a unique effect on episode ratings compared to others.
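A minimal sketch of what this ensemble could look like with scikit-learn's VotingRegressor is shown below. It assumes the merged_character_ratings_df and episode_scene_sentiment_df dataframes built earlier; the hyperparameters (tree depth, SVR settings, number of folds) are placeholders rather than tuned values.
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
# assemble the feature table: Michael's sentiment, average scene sentiment, and temporal context
model_df = pd.merge(
    merged_character_ratings_df,
    episode_scene_sentiment_df,
    how='inner',
    on=['Season', 'Episode']
)
features = ['Michael_sentiment', 'avg_scene_sentiment', 'Season', 'Episode']
model_df = model_df.dropna(subset=features + ['Rating'])
X = model_df[features]
y = model_df['Rating']
# combine a linear model, a shallow decision tree, and a scaled SVR into one voting ensemble
voting_reg = VotingRegressor(estimators=[
    ('lr', LinearRegression()),
    ('tree', DecisionTreeRegressor(max_depth=4, random_state=42)),
    ('svr', make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1.0))),
])
# 5-fold cross-validated R^2 as a first sanity check on predictive power
scores = cross_val_score(voting_reg, X, y, cv=5, scoring='r2')
print(f"Mean cross-validated R^2: {scores.mean():.3f} (std {scores.std():.3f})")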
Model Idea 2: Character Sentiment Influence Model on Episode Popularity
This model would attempt to predict episode popularity (high vs. low rating, as a binary or multi-class label) based on sentiment scores for key characters like Michael, Jim, and Dwight. The model would investigate whether sentiment dynamics (especially Michael’s) are a reliable predictor of episode success by separating high- and low-rated episodes.
A classification model would be appropriate here. It would use the normalized sentiment scores for each character as features, with the episode rating thresholded to define “high” or “low” popularity. This could provide insights into which characters’ sentiments most influence episode ratings and, by extension, episode popularity.
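A sketch of this classification approach could look like the following, using a random forest and a median-rating threshold to define "high" vs. "low" popularity. The threshold choice and the specific classifier are assumptions for illustration, and the feature importances would only be a rough indication of which character's sentiment the model leans on.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
character_cols = ['Michael_sentiment', 'Jim_sentiment', 'Dwight_sentiment']
clf_df = merged_character_ratings_df.dropna(subset=character_cols + ['Rating'])
# label episodes as "high" (1) or "low" (0) popularity using the median rating as the cutoff
threshold = clf_df['Rating'].median()
y = (clf_df['Rating'] >= threshold).astype(int)
X = clf_df[character_cols]
clf = RandomForestClassifier(n_estimators=200, random_state=42)
# 5-fold cross-validated accuracy as a baseline check
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(f"Mean cross-validated accuracy: {scores.mean():.3f}")
# fit on all episodes to inspect which character's sentiment carries the most weight
clf.fit(X, y)
for name, importance in zip(character_cols, clf.feature_importances_):
    print(f"{name}: {importance:.3f}")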
%%shell
jupyter nbconvert --to html '/content/drive/MyDrive/Colab Notebooks/FinalProject2024.ipynb'