Udacity Engagement Analysis

Loading Data from CSVs

# Import csv library
import unicodecsv

# Read a CSV file and keep all of its rows in memory
def read_csv(filename):
    """Return the rows of the CSV file `filename` as a list.

    The file is opened in binary mode and parsed with
    unicodecsv.DictReader; every row yielded by the reader becomes one
    element of the returned list.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows

# Load the three tables; each one becomes a list of rows
enrollments = read_csv('data/enrollments.csv')
daily_engagement = read_csv('data/daily_engagement.csv')
project_submissions = read_csv('data/project_submissions.csv')

# Show the first row of every table on the line below its label
print('Enrollments:', enrollments[0], sep='\n')
print('Daily engagement:', daily_engagement[0], sep='\n')
print('Project submissions:', project_submissions[0], sep='\n')

Enrollments:
{'cancel_date': '2015-01-14', 'join_date': '2014-11-10', 'days_to_cancel': '65', 'account_key': '448', 'is_canceled': 'True', 'status': 'canceled', 'is_udacity': 'True'}
Daily engagement:
{'total_minutes_visited': '11.6793745', 'acct': '0', 'num_courses_visited': '1.0', 'utc_date': '2015-01-09', 'projects_completed': '0.0', 'lessons_completed': '0.0'}
Project submissions:
{'account_key': '256', 'lesson_key': '3176718735', 'creation_date': '2015-01-14', 'assigned_rating': 'UNGRADED', 'processing_state': 'EVALUATED', 'completion_date': '2015-01-16'}

Fixing Data Types

# Import library
from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    An empty string stands for "no date" and yields None.
    """
    if date != '':
        return dt.strptime(date, '%Y-%m-%d')
    return None

def parse_maybe_int(i):
    """Convert a string holding an integer into an int.

    An empty string stands for a missing value and yields None.
    """
    return None if i == '' else int(i)

# Convert the string fields of every enrollment row to proper Python types:
# dates become datetime objects (or None), days_to_cancel becomes an int
# (or None) and the two flags become booleans.
for enrollment in enrollments:
    enrollment.update(
        cancel_date=parse_date(enrollment['cancel_date']),
        days_to_cancel=parse_maybe_int(enrollment['days_to_cancel']),
        is_canceled=(enrollment['is_canceled'] == 'True'),
        is_udacity=(enrollment['is_udacity'] == 'True'),
        join_date=parse_date(enrollment['join_date']),
    )

print('Enrollments:')
enrollments[0]

Enrollments:





{'account_key': '448',
 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 'days_to_cancel': 65,
 'is_canceled': True,
 'is_udacity': True,
 'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 'status': 'canceled'}

# Convert the string fields of every engagement row to proper Python types.
# The count columns are written like '1.0', so they go through float()
# before being truncated to int.
for engagement_record in daily_engagement:
    engagement_record.update(
        lessons_completed=int(float(engagement_record['lessons_completed'])),
        num_courses_visited=int(float(engagement_record['num_courses_visited'])),
        projects_completed=int(float(engagement_record['projects_completed'])),
        total_minutes_visited=float(engagement_record['total_minutes_visited']),
        utc_date=parse_date(engagement_record['utc_date']),
    )

print('Daily engagement:')
daily_engagement[0]

Daily engagement:





{'acct': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

# Turn both date columns of every submission row into datetime objects
# (or None when the column is empty)
for submission in project_submissions:
    submission.update(
        completion_date=parse_date(submission['completion_date']),
        creation_date=parse_date(submission['creation_date']),
    )

print('Project submissions:')
project_submissions[0]

Project submissions:





{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

Investigating the Data

# Rename the "acct" column in the daily_engagement table to "account_key"
# so that all three tables use the same name for the student identifier.
for engagement_record in daily_engagement:
    # pop() removes the old key and hands back its value in one step
    engagement_record['account_key'] = engagement_record.pop('acct')

print('Daily engagement - account key:')
daily_engagement[0]['account_key']

Daily engagement - account key:





'0'

def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in `data`.

    `data` is an iterable of rows that can be indexed by 'account_key'.
    """
    return {row['account_key'] for row in data}

# Report, for each table, the total number of rows and the number of
# distinct students (account keys) that appear in it.

print('Total enrollment records:')
print(len(enrollments))

unique_enrolled_students = get_unique_students(enrollments)
print('Total unique enrollment records:')
print(len(unique_enrolled_students))

print('Total daily engagement records:')
print(len(daily_engagement))

# get_unique_students() already returns a fresh set, so no empty set needs
# to be created beforehand.
unique_engagement_students = get_unique_students(daily_engagement)
print('Total unique daily engagement records:')
print(len(unique_engagement_students))

print('Total project submission records:')
print(len(project_submissions))

unique_project_submitters = get_unique_students(project_submissions)
print('Total unique project submission records:')
print(len(unique_project_submitters))

Total enrollment records:
1640
Total unique enrollment records:
1302
Total daily engagement records:
136240
Total unique daily engagement records:
1237
Total project submission records:
3642
Total unique project submission records:
743

Problems in the Data

Missing Engagement Records

# Find one enrollment whose student is missing from the daily engagement table
# and output that enrollment

# A single example is enough to inspect, so stop at the first enrollment
# whose account key never appears in the engagement table.
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students:
        print('Enrollment:')
        print(enrollment)
        break

Enrollment:
{'cancel_date': datetime.datetime(2014, 11, 12, 0, 0), 'join_date': datetime.datetime(2014, 11, 12, 0, 0), 'days_to_cancel': 0, 'account_key': '1219', 'is_canceled': True, 'status': 'canceled', 'is_udacity': False}

Checking for More Problem Records

# Count (and print) the enrollments that are still surprising: the student
# has no engagement rows even though days_to_cancel is not 0.

counter = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    # Skip anything that is explained: the student does have engagement
    # rows, or the enrollment was cancelled on the day it started.
    if student in unique_engagement_students or enrollment['days_to_cancel'] == 0:
        continue
    counter += 1
    print('Enrollment:', enrollment, sep='\n')

print('Counter:', counter, sep='\n')

Enrollment:
{'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'days_to_cancel': 59, 'account_key': '1304', 'is_canceled': True, 'status': 'canceled', 'is_udacity': True}
Enrollment:
{'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'days_to_cancel': 99, 'account_key': '1304', 'is_canceled': True, 'status': 'canceled', 'is_udacity': True}
Enrollment:
{'cancel_date': None, 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'days_to_cancel': None, 'account_key': '1101', 'is_canceled': False, 'status': 'current', 'is_udacity': True}
Counter:
3

Tracking Down the Remaining Problems

# Collect the account keys of every enrollment flagged as a Udacity test account
udacity_test_accounts = {
    enrollment['account_key']
    for enrollment in enrollments
    if enrollment['is_udacity']
}

print('Number of Udacity test accounts:', len(udacity_test_accounts), sep='\n')

Number of Udacity test accounts:
6

def remove_udacity_accounts(data):
    """Return the rows of `data` that do not belong to a Udacity test account.

    A row is dropped when its 'account_key' is in the module-level
    `udacity_test_accounts` set. The result is a new list; `data` itself
    is not modified.
    """
    return [
        row for row in data
        if row['account_key'] not in udacity_test_accounts
    ]

# Strip the Udacity test accounts out of each of the three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

# Report how many rows are left in every table
print('Non Udacity enrollments:', len(non_udacity_enrollments), sep='\n')
print('Non Udacity engagements:', len(non_udacity_engagement), sep='\n')
print('Non Udacity submissions:', len(non_udacity_submissions), sep='\n')

Non Udacity enrollments:
1622
Non Udacity engagements:
135656
Non Udacity submissions:
3634

Refining the Question

# Creates a dictionary named paid_students containing all students who either
# haven't canceled yet or who remained enrolled for more than 7 days. The keys
# are account keys, and the values are the date the student enrolled. When a
# student has several qualifying enrollments, the most recent join date is kept.

paid_students = {}
for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']

        # Only store the date when the student is new or this enrollment is
        # later than the one already recorded. (An unconditional assignment
        # here would make this check pointless and let whichever row comes
        # last in the file win.)
        # NOTE(review): first-week figures computed downstream from these
        # join dates may differ from previously recorded outputs - re-run.
        if (account_key not in paid_students
                or enrollment_date > paid_students[account_key]):
            paid_students[account_key] = enrollment_date

print('Paid students:')
print(len(paid_students))

Paid students:
995

Getting Data from First Week

def within_one_week(join_date, engagement_date):
    """Tell whether an engagement record falls in a student's first week.

    Returns True when `engagement_date` is on or after `join_date` and
    fewer than 7 whole days later, False otherwise.
    """
    days_since_joining = (engagement_date - join_date).days
    return 0 <= days_since_joining < 7

# Creates a list of rows from the engagement table including only rows where
# the student is one of the paid students we just found, and the date is within
# one week of the student's join date

def remove_free_trial_cancels(data):
    """Return the rows of `data` whose account is one of the paid students.

    A row is kept when its 'account_key' is a key of the module-level
    `paid_students` dictionary. The result is a new list; `data` itself
    is not modified.
    """
    return [row for row in data if row['account_key'] in paid_students]

# Restrict all three tables to the paid students
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)

print('Paid enrollments:', len(paid_enrollments), sep='\n')
print('Paid engagements:', len(paid_engagement), sep='\n')
print('Paid submissions:', len(paid_submissions), sep='\n')

# Keep only the engagement rows dated within one week of the join date
# recorded for that student in paid_students
paid_engagement_in_first_week = [
    engagement_record
    for engagement_record in paid_engagement
    if within_one_week(paid_students[engagement_record['account_key']],
                       engagement_record['utc_date'])
]

print('Paid engagements in first week:', len(paid_engagement_in_first_week), sep='\n')

Paid enrollments:
1293
Paid engagements:
134549
Paid submissions:
3618
Paid engagements in first week:
6920

Exploring Student Engagement

# Import library
from collections import defaultdict

# Builds two dictionaries keyed by account key in a single pass over the
# first-week engagement records:
#   engagement_by_account    -> list of that student's engagement records
#   total_minutes_by_account -> total minutes that student spent in the
#                               classroom during the first week
engagement_by_account = defaultdict(list)
total_minutes_by_account = {}
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)
    # Running total starts from 0 the first time an account is seen
    total_minutes_by_account[account_key] = (
        total_minutes_by_account.get(account_key, 0)
        + engagement_record['total_minutes_visited']
    )

#Import library
import numpy as np

# Summary statistics for the minutes each student spent in the classroom
total_minutes = np.array(list(total_minutes_by_account.values()))
print('Mean:', total_minutes.mean())
print('Standard deviation:', total_minutes.std())
print('Minimum:', total_minutes.min())
print('Maximum:', total_minutes.max())

Mean: 305.414718908
Standard deviation: 405.91261032
Minimum: 0.0
Maximum: 3564.7332645

Debugging Data Analysis Code

# Looks for a problem in the analysis by locating the student with the
# largest first-week total and printing every engagement record that
# contributed to it.
student_with_max_minutes = None
max_minutes = 0

# The loop variable is deliberately not called total_minutes, so that it does
# not overwrite the module-level total_minutes array built for the summary.
for student, minutes_for_student in total_minutes_by_account.items():
    if minutes_for_student > max_minutes:
        max_minutes = minutes_for_student
        student_with_max_minutes = student
print('Max minutes:')
print(max_minutes)

for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print('Engagement record:')
        print(engagement_record)

Max minutes:
3564.7332644989997
Engagement record:
{'total_minutes_visited': 850.519339666, 'num_courses_visited': 4, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 9, 0, 0), 'projects_completed': 0, 'lessons_completed': 4}
Engagement record:
{'total_minutes_visited': 872.633923334, 'num_courses_visited': 6, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 10, 0, 0), 'projects_completed': 0, 'lessons_completed': 6}
Engagement record:
{'total_minutes_visited': 777.018903666, 'num_courses_visited': 2, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 11, 0, 0), 'projects_completed': 0, 'lessons_completed': 6}
Engagement record:
{'total_minutes_visited': 294.568774, 'num_courses_visited': 1, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 12, 0, 0), 'projects_completed': 0, 'lessons_completed': 2}
Engagement record:
{'total_minutes_visited': 471.2139785, 'num_courses_visited': 3, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 13, 0, 0), 'projects_completed': 0, 'lessons_completed': 1}
Engagement record:
{'total_minutes_visited': 298.778345333, 'num_courses_visited': 2, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 14, 0, 0), 'projects_completed': 0, 'lessons_completed': 1}
Engagement record:
{'total_minutes_visited': 0.0, 'num_courses_visited': 0, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 15, 0, 0), 'projects_completed': 0, 'lessons_completed': 0}

Lessons Completed in First Week

# Adapts the code above to find the mean, standard deviation, minimum, and maximum for
# the number of lessons completed by each student during the first week

# Load library
from collections import defaultdict

def group_data(data, key_name):
    """Group the rows of `data` by the value each row holds under `key_name`.

    Returns a defaultdict(list) that maps every distinct value to the list
    of rows carrying it, in the order the rows were encountered.
    """
    buckets = defaultdict(list)
    for row in data:
        buckets[row[key_name]].append(row)
    return buckets

# Group the first-week engagement records of paid students by account key
engagement_by_account = group_data(
    paid_engagement_in_first_week, 'account_key')

def sum_grouped_items(grouped_data, field_name):
    """Add up one field for every group in `grouped_data`.

    `grouped_data` maps a key to a collection of rows; the result maps the
    same keys to the sum of row[field_name] over that group's rows. A group
    with no rows sums to 0.
    """
    totals = {}
    for group_key in grouped_data:
        running_total = 0
        # Accumulate left to right, starting from 0
        for row in grouped_data[group_key]:
            running_total += row[field_name]
        totals[group_key] = running_total
    return totals

# Total minutes visited per account during the first week
total_minutes_by_account = sum_grouped_items(
    engagement_by_account, 'total_minutes_visited')

def describe_data(data):
    """Print the mean, standard deviation, minimum and maximum of `data`.

    Each statistic is computed with the corresponding numpy function and
    printed on its own labelled line. Nothing is returned.
    """
    summaries = (
        ('Mean:', np.mean),
        ('Standard deviation:', np.std),
        ('Minimum:', np.min),
        ('Maximum:', np.max),
    )
    for label, statistic in summaries:
        print(label, statistic(data))

# Lessons completed per account during the first week
lessons_completed_by_account = sum_grouped_items(
    engagement_by_account, 'lessons_completed')

# Summarize both per-account metrics
print('Total minutes by account:')
describe_data(np.array(list(total_minutes_by_account.values())))
print('Lessons completed by account:')
describe_data(np.array(list(lessons_completed_by_account.values())))

Total minutes by account:
Mean: 305.414718908
Standard deviation: 405.91261032
Minimum: 0.0
Maximum: 3564.7332645
Lessons completed by account:
Mean: 1.63216080402
Standard deviation: 3.00140182563
Minimum: 0
Maximum: 36

Number of Visits in First Week

# Summarizes (mean, standard deviation, minimum, maximum) the number of
# days each student visited the classroom during the first week

engagement_by_account = group_data(
    paid_engagement_in_first_week, 'account_key')

# Flag every engagement record with 1 when at least one course was visited
# that day and with 0 otherwise, so the flags can simply be added up.
for engagement_for_student in engagement_by_account.values():
    for data_points in engagement_for_student:
        data_points['has_visited'] = (
            1 if data_points['num_courses_visited'] > 0 else 0
        )

has_visited_by_account = sum_grouped_items(
    engagement_by_account, 'has_visited')

print('Students that visited in first week:')
describe_data(np.array(list(has_visited_by_account.values())))

Students that visited in first week:
Mean: 2.91256281407
Standard deviation: 2.22037005491
Minimum: 0
Maximum: 7

Splitting out Passing Students

# Splits the first-week engagement data of paid students into two lists:
# records of students who eventually pass the subway project, and records
# of students who do not.

subway_project_lesson_keys = ['746169184', '3176718735']

# Accounts with at least one subway-project submission that was rated
# PASSED or DISTINCTION
pass_subway_project = {
    submission['account_key']
    for submission in paid_submissions
    if submission['lesson_key'] in subway_project_lesson_keys
    and submission['assigned_rating'] in ('PASSED', 'DISTINCTION')
}

passing_engagement = [
    engagement_record
    for engagement_record in paid_engagement_in_first_week
    if engagement_record['account_key'] in pass_subway_project
]
non_passing_engagement = [
    engagement_record
    for engagement_record in paid_engagement_in_first_week
    if engagement_record['account_key'] not in pass_subway_project
]

print('Passing engagement:')
print(len(passing_engagement))
print('Non passing engagement')
print(len(non_passing_engagement))

Passing engagement:
4528
Non passing engagement
2392

Comparing the Two Student Groups

# Compares the number of days visited in the first week between students
# who pass the subway project and students who don't

# Group each list of engagement records by account
passing_engagement_by_account = group_data(
    passing_engagement, 'account_key')
non_passing_engagement_by_account = group_data(
    non_passing_engagement, 'account_key')

# Days visited per account, for each of the two groups
has_visited_by_account = sum_grouped_items(
    passing_engagement_by_account, 'has_visited')
non_has_visited_by_account = sum_grouped_items(
    non_passing_engagement_by_account, 'has_visited')

# The first summary covers the students who passed, the second one the
# students who did not pass.
print('Has visited by account:')
describe_data(np.array(list(has_visited_by_account.values())))
print('Has not visited by account:')
describe_data(np.array(list(non_has_visited_by_account.values())))

Has visited by account:
Mean: 3.42967542504
Standard deviation: 2.21298340866
Minimum: 0
Maximum: 7
Has not visited by account:
Mean: 1.95114942529
Standard deviation: 1.88929952676
Minimum: 0
Maximum: 7

Making Histograms

# Makes histograms of the three metrics we looked at earlier for both
# students who passed the subway project and students who didn't

# Ensure plot is displayed in notebook
%pylab inline

# Load libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Per-account first-week totals of each metric, for students who passed the
# subway project ("has_*") and for students who did not ("non_has_*").
has_minutes_by_account = sum_grouped_items(passing_engagement_by_account,
                                           'total_minutes_visited')
non_has_minutes_by_account = sum_grouped_items(non_passing_engagement_by_account,
                                               'total_minutes_visited')

has_lessons_by_account = sum_grouped_items(passing_engagement_by_account,
                                           'lessons_completed')
non_has_lessons_by_account = sum_grouped_items(non_passing_engagement_by_account,
                                               'lessons_completed')

has_visited_by_account = sum_grouped_items(passing_engagement_by_account,
                                           'has_visited')
non_has_visited_by_account = sum_grouped_items(non_passing_engagement_by_account,
                                               'has_visited')

# Only the per-account totals are plotted, so keep just the values.
has_mins = list(has_minutes_by_account.values())
non_has_mins = list(non_has_minutes_by_account.values())
has_lessons = list(has_lessons_by_account.values())
non_has_lessons = list(non_has_lessons_by_account.values())
has_visited = list(has_visited_by_account.values())
non_has_visited = list(non_has_visited_by_account.values())


def _plot_histogram(figure_number, values, title, xlabel, **hist_kwargs):
    """Draw a histogram of `values` on its own numbered figure.

    `title` and `xlabel` label the plot; any extra keyword arguments
    (for example bins=8) are passed straight through to plt.hist().
    """
    plt.figure(figure_number)
    plt.hist(values, **hist_kwargs)
    plt.title(title)
    plt.xlabel(xlabel)


# The x axis of each histogram is the metric being counted, so it is labelled
# with that metric rather than with "Days Visited" everywhere.
_plot_histogram(1, has_mins, "Minutes by account", "Total Minutes Visited")
_plot_histogram(2, non_has_mins, "Non Minutes by account", "Total Minutes Visited")
_plot_histogram(3, has_lessons, 'Has lessons', "Lessons Completed")
_plot_histogram(4, non_has_lessons, 'Non has lessons', "Lessons Completed")

# Both days-visited histograms use the same 8 bins so the two groups can be
# compared directly.
# NOTE(review): 8 bins assumes whole-day counts from 0 to 7 - confirm.
_plot_histogram(5, has_visited, 'Has visited', "Days Visited", bins=8)
_plot_histogram(6, non_has_visited, 'Non has visited', "Days Visited", bins=8)

Populating the interactive namespace from numpy and matplotlib





<matplotlib.text.Text at 0x10e3f6080>

png

Loading Data from CSVs

Fixing Data Types

Investigating the Data

Problems in the Data

Missing Engagement Records

Checking for More Problem Records

Tracking Down the Remaining Problems

Refining the Question

Getting Data from First Week

Exploring Student Engagement

Debugging Data Analysis Code

Lessons Completed in First Week

Number of Visits in First Week

Splitting out Passing Students

Comparing the Two Student Groups

Making Histograms

social