Insights from Failed Orders¶
Gett, previously known as GetTaxi, is an Israeli-developed technology platform solely focused on corporate Ground Transportation Management (GTM). Clients order taxis through its application, and drivers accept those rides (offers). When a client clicks the Order button in the application, the matching system searches for the most relevant drivers and offers them the order. In this task, we investigate some matching metrics for orders that were not completed successfully, i.e., where the customer did not end up getting a car.
GitHub: Insights From Failed Orders
In [552]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [553]:
data_offers = pd.read_csv("datasets/data_offers.csv")
data_orders = pd.read_csv("datasets/data_orders.csv")
In [554]:
data_offers.head()
Out[554]:
| | order_gk | offer_id |
| --- | --- | --- |
| 0 | 3000579625629 | 300050936206 |
| 1 | 3000627306450 | 300052064651 |
| 2 | 3000632920686 | 300052408812 |
| 3 | 3000632771725 | 300052393030 |
| 4 | 3000583467642 | 300051001196 |
In [555]:
data_orders.head()
Out[555]:
| | order_datetime | origin_longitude | origin_latitude | m_order_eta | order_gk | order_status_key | is_driver_assigned_key | cancellations_time_in_seconds |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 18:08:07 | -0.978916 | 51.456173 | 60.0 | 3000583041974 | 4 | 1 | 198.0 |
| 1 | 20:57:32 | -0.950385 | 51.456843 | NaN | 3000583116437 | 4 | 0 | 128.0 |
| 2 | 12:07:50 | -0.969520 | 51.455544 | 477.0 | 3000582891479 | 4 | 1 | 46.0 |
| 3 | 13:50:20 | -1.054671 | 51.460544 | 658.0 | 3000582941169 | 4 | 1 | 62.0 |
| 4 | 21:24:45 | -0.967605 | 51.458236 | NaN | 3000583140877 | 9 | 0 | NaN |
In [556]:
data_orders.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10716 entries, 0 to 10715
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   order_datetime                 10716 non-null  object
 1   origin_longitude               10716 non-null  float64
 2   origin_latitude                10716 non-null  float64
 3   m_order_eta                    2814 non-null   float64
 4   order_gk                       10716 non-null  int64
 5   order_status_key               10716 non-null  int64
 6   is_driver_assigned_key         10716 non-null  int64
 7   cancellations_time_in_seconds  7307 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 669.9+ KB
Section 1: Distribution of Orders by Reasons for Failure¶
In [557]:
# order_status_key == 4 -> cancelled by the client; no driver assigned yet
cancelled_by_client_no_driver = data_orders[(data_orders['order_status_key'] == 4) & (data_orders['is_driver_assigned_key'] == 0)]
print(len(cancelled_by_client_no_driver))
4496
In [558]:
# Client cancelled after a driver had already been assigned
cancelled_by_client_driver = data_orders[(data_orders['order_status_key'] == 4) & (data_orders['is_driver_assigned_key'] == 1)]
print(len(cancelled_by_client_driver))
2811
In [559]:
# order_status_key == 9 -> order rejected by the system
cancelled_by_system = data_orders[(data_orders['order_status_key'] == 9)]
print(len(cancelled_by_system))
3409
In [560]:
categories = [
"Cancelled by client\n(no driver assigned)",
"Cancelled by client\n(driver assigned)",
"Cancelled by system"
]
counts = [len(cancelled_by_client_no_driver), len(cancelled_by_client_driver), len(cancelled_by_system)]
plt.figure(figsize=(8, 5))
sns.barplot(x=categories, y=counts, hue=categories, palette="colorblind", legend=False)
plt.ylabel("Number of Cancelled Orders")
plt.title("Counts of Cancelled Orders by Category")
plt.tight_layout()
plt.show()
Analysis¶
- The largest category of failures is clients cancelling before a driver is assigned (4,496 orders); each category's share is quantified in the sketch below.
- This suggests a potential issue with long wait times or high initial ETAs causing customer impatience.
- The second-largest category is system rejections (3,409 orders), which points to a driver-supply problem.
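To make the comparison concrete, here is a minimal sketch (reusing the three subsets computed above) that expresses each failure category as a share of all failed orders; the three counts sum to 10,716, the full dataset, so `len(data_orders)` serves as the denominator:

total_failed = len(data_orders)
category_counts = {
    "Cancelled by client (no driver assigned)": len(cancelled_by_client_no_driver),
    "Cancelled by client (driver assigned)": len(cancelled_by_client_driver),
    "Cancelled by system": len(cancelled_by_system),
}
for category, count in category_counts.items():
    # e.g. 4496 / 10716, roughly 42%, for client cancellations with no driver
    print(f"{category}: {count / total_failed:.1%}")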
Section 2: How do failed orders change throughout the day?¶
In [561]:
order_data_by_hour = data_orders[['order_datetime', 'order_status_key', 'is_driver_assigned_key']].copy()
order_data_by_hour['order_datetime'] = pd.to_datetime(order_data_by_hour['order_datetime'], format='%H:%M:%S')
order_data_by_hour['hour'] = order_data_by_hour['order_datetime'].dt.hour
In [562]:
order_data_by_hour.head()
Out[562]:
| | order_datetime | order_status_key | is_driver_assigned_key | hour |
| --- | --- | --- | --- | --- |
| 0 | 1900-01-01 18:08:07 | 4 | 1 | 18 |
| 1 | 1900-01-01 20:57:32 | 4 | 0 | 20 |
| 2 | 1900-01-01 12:07:50 | 4 | 1 | 12 |
| 3 | 1900-01-01 13:50:20 | 4 | 1 | 13 |
| 4 | 1900-01-01 21:24:45 | 9 | 0 | 21 |
In [563]:
plt.figure(figsize=(10,5))
sns.histplot(order_data_by_hour, x='hour', discrete=True)
plt.xticks(range(24))
plt.title("Total Cancelled Orders per Hour")
plt.xlabel("Hour of Day")
plt.ylabel("Number of Cancelled Orders")
plt.tight_layout()
plt.show()
In [564]:
def cancellation_type(row):
if row['order_status_key'] == 4 and row['is_driver_assigned_key'] == 1:
return 'Cancelled by client (driver assigned)'
elif row['order_status_key'] == 4 and row['is_driver_assigned_key'] == 0:
return 'Cancelled by client (no driver assigned)'
elif row['order_status_key'] == 9:
return 'Cancelled by system'
else:
return None
order_data_by_hour['cancel_type'] = order_data_by_hour.apply(cancellation_type, axis=1)
filtered = order_data_by_hour[order_data_by_hour['cancel_type'].notnull()]
plt.figure(figsize=(12,6))
sns.histplot(data=filtered, x='hour', hue='cancel_type', multiple='stack', discrete=True, palette='colorblind')
plt.xticks(range(24))
plt.title("Cancelled Orders per Hour by Type")
plt.xlabel("Hour of Day")
plt.ylabel("Number of Cancelled Orders")
plt.tight_layout()
plt.show()
Analysis¶
- Cancellations peak around 8 in the morning for both the client and system categories; the hour-by-hour breakdown is tabulated in the sketch below.
- Surprisingly, there are relatively few cancelled orders during the evening rush hours, 5 P.M. to 8 P.M.
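As a quick check on the morning-peak claim, a minimal sketch that tabulates the `filtered` frame from the previous cell by hour and cancellation type:

hourly_breakdown = pd.crosstab(filtered['hour'], filtered['cancel_type'])
# Show the hours around the 8 A.M. peak called out above
print(hourly_breakdown.loc[7:9])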
Section 3: How long are customers waiting before they cancel?¶
In [565]:
data_orders_cancellation = data_orders[['order_datetime', 'is_driver_assigned_key', 'cancellations_time_in_seconds']].copy()
data_orders_cancellation['order_datetime'] = pd.to_datetime(data_orders_cancellation['order_datetime'], format='%H:%M:%S')
data_orders_cancellation['hour'] = data_orders_cancellation['order_datetime'].dt.hour
In [566]:
data_orders_cancellation.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10716 entries, 0 to 10715
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   order_datetime                 10716 non-null  datetime64[ns]
 1   is_driver_assigned_key         10716 non-null  int64
 2   cancellations_time_in_seconds  7307 non-null   float64
 3   hour                           10716 non-null  int32
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1)
memory usage: 293.1 KB
In [567]:
# cancellations_time_in_seconds is recorded only for client cancellations
# (7,307 rows), so dropping NaNs removes the system-rejected orders
data_orders_cancellation = data_orders_cancellation.dropna()
In [568]:
data_orders_cancellation.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7307 entries, 0 to 10715
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   order_datetime                 7307 non-null   datetime64[ns]
 1   is_driver_assigned_key         7307 non-null   int64
 2   cancellations_time_in_seconds  7307 non-null   float64
 3   hour                           7307 non-null   int32
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1)
memory usage: 256.9 KB
In [569]:
# Tukey's rule: treat points outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] as outliers
Q1 = data_orders_cancellation['cancellations_time_in_seconds'].quantile(0.25)
Q3 = data_orders_cancellation['cancellations_time_in_seconds'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
In [570]:
outliers = data_orders_cancellation[
(data_orders_cancellation['cancellations_time_in_seconds'] < lower_bound) |
(data_orders_cancellation['cancellations_time_in_seconds'] > upper_bound)
]
print(f"Number of outliers: {len(outliers)}")
Number of outliers: 592
In [571]:
data_orders_cancellation_no_outliers = data_orders_cancellation[
(data_orders_cancellation['cancellations_time_in_seconds'] >= lower_bound) &
(data_orders_cancellation['cancellations_time_in_seconds'] <= upper_bound)
]
print(f"Rows after removing outliers: {len(data_orders_cancellation_no_outliers)}")
Rows after removing outliers: 6715
In [572]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# With outliers
sns.boxplot(ax=axes[0], data=data_orders_cancellation, y='cancellations_time_in_seconds')
axes[0].set_title("With Outliers")
# Without outliers
sns.boxplot(ax=axes[1], data=data_orders_cancellation_no_outliers, y='cancellations_time_in_seconds', color='skyblue')
axes[1].set_title("Without Outliers")
plt.tight_layout()
plt.show()
In [573]:
plt.figure(figsize=(10, 5))
# Map the 0/1 flag to readable labels so seaborn builds the legend itself;
# calling plt.legend(labels=...) afterwards can silently mislabel the groups
plot_data = data_orders_cancellation_no_outliers.copy()
plot_data['Driver Assigned'] = plot_data['is_driver_assigned_key'].map({0: 'No', 1: 'Yes'})
sns.histplot(
    data=plot_data,
    x='cancellations_time_in_seconds',
    hue='Driver Assigned',
    multiple='stack',
    bins=30,
    kde=True,
    palette='colorblind'
)
plt.title("Distribution of Cancellation Time")
plt.xlabel("Cancellation Time (seconds)")
plt.ylabel("Frequency")
plt.show()
In [574]:
avg_time_to_cancel = data_orders_cancellation_no_outliers.groupby('is_driver_assigned_key')['cancellations_time_in_seconds'].mean()
In [575]:
avg_time_df = avg_time_to_cancel.reset_index()
plt.figure(figsize=(6, 4))
sns.barplot(
    data=avg_time_df,
    x='is_driver_assigned_key',
    y='cancellations_time_in_seconds',
    hue='is_driver_assigned_key',
    palette='colorblind',
    legend=False
)
plt.title('Average Cancellation Time by Driver Assignment')
plt.xlabel('Driver Assigned')
plt.ylabel('Average Cancellation Time (seconds)')
plt.tight_layout()
plt.show()
In [576]:
# Group by hour and driver assignment, then calculate mean cancellation time
avg_cancel_time_by_hour = data_orders_cancellation_no_outliers.groupby(
['hour', 'is_driver_assigned_key']
)['cancellations_time_in_seconds'].mean().reset_index()
plt.figure(figsize=(12,6))
sns.lineplot(
data=avg_cancel_time_by_hour,
x='hour',
y='cancellations_time_in_seconds',
hue='is_driver_assigned_key',
marker='o',
palette='colorblind'
)
plt.title('Average Cancellation Time by Hour and Driver Assignment')
plt.xlabel('Hour of Day')
plt.ylabel('Average Cancellation Time (seconds)')
plt.xticks(range(24))
plt.legend(title='Driver Assigned')
plt.tight_layout()
plt.show()
Analysis¶
- Many customers with no driver assigned cancel within about 20 seconds of booking, i.e., without really waiting at all; the medians in the sketch below make this asymmetry explicit.
- The 9 P.M. hour alone accounts for more than 400 cancelled orders.
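Because the mean stays sensitive to the long right tail even after trimming, here is a minimal sketch comparing median cancellation times instead, reusing the outlier-filtered frame from above:

median_cancel = (
    data_orders_cancellation_no_outliers
    .groupby('is_driver_assigned_key')['cancellations_time_in_seconds']
    .median()
)
# 0 = no driver assigned, 1 = driver assigned
print(median_cancel)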
Section 4: What do the Estimated Times of Arrival (ETAs) look like by hour?¶
In [577]:
# ETA is only populated when a driver was actually assigned
data_orders_eta = data_orders[data_orders['is_driver_assigned_key'] == 1]
In [578]:
data_orders_eta = data_orders_eta[['order_datetime', 'm_order_eta']].copy()  # .copy() avoids SettingWithCopyWarning below
In [579]:
data_orders_eta.info()
<class 'pandas.core.frame.DataFrame'>
Index: 2814 entries, 0 to 10715
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   order_datetime  2814 non-null   object
 1   m_order_eta     2814 non-null   float64
dtypes: float64(1), object(1)
memory usage: 66.0+ KB
In [580]:
data_orders_eta['order_datetime'] = pd.to_datetime(data_orders_eta['order_datetime'], format='%H:%M:%S')
data_orders_eta['hour'] = data_orders_eta['order_datetime'].dt.hour
In [581]:
data_orders_eta.info()
<class 'pandas.core.frame.DataFrame'>
Index: 2814 entries, 0 to 10715
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   order_datetime  2814 non-null   datetime64[ns]
 1   m_order_eta     2814 non-null   float64
 2   hour            2814 non-null   int32
dtypes: datetime64[ns](1), float64(1), int32(1)
memory usage: 76.9 KB
In [582]:
avg_data_order_eta = data_orders_eta.groupby('hour')['m_order_eta'].mean().reset_index()
avg_data_order_eta.head()
Out[582]:
| | hour | m_order_eta |
| --- | --- | --- |
| 0 | 0 | 357.959016 |
| 1 | 1 | 324.750000 |
| 2 | 2 | 391.012821 |
| 3 | 3 | 388.093750 |
| 4 | 4 | 299.658537 |
In [583]:
plt.figure(figsize=(10,7))
sns.barplot(data=avg_data_order_eta, x='hour', y='m_order_eta', hue='hour', palette='colorblind', legend=False)
plt.title('Estimated Time of Arrival (seconds) by Hours')
plt.xlabel('Hour')
plt.ylabel('Estimated Time of Arrival (seconds)')
plt.tight_layout()
plt.show()
Analysis¶
- The average ETA peaks dramatically at 8 A.M., exceeding 10 minutes (600+ seconds).
- This offers a clear explanation for the morning spike in cancellations: long wait times lead directly to customer frustration and a higher rate of failed orders. The sketch below checks this link across all hours.
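To test the ETA-cancellation link across the whole day rather than just the morning peak, a minimal sketch that joins the hourly average ETA with the hourly cancellation counts from Section 2 (it assumes `order_data_by_hour` from that section is still in scope):

cancels_per_hour = order_data_by_hour.groupby('hour').size().rename('cancel_count')
eta_vs_cancels = avg_data_order_eta.set_index('hour').join(cancels_per_hour)
# A strongly positive correlation would support the ETA-frustration hypothesis
print(eta_vs_cancels['m_order_eta'].corr(eta_vs_cancels['cancel_count']))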
Section 5: Where are the failure hotspots?¶
In [584]:
import h3
import folium
In [585]:
H3_RESOLUTION = 8  # resolution-8 cells average ~0.7 km^2, a useful city-block scale
data_orders['hex_id'] = data_orders.apply(
    lambda row: h3.latlng_to_cell(row['origin_latitude'], row['origin_longitude'], H3_RESOLUTION),
    axis=1
)
In [586]:
data_orders.head()
Out[586]:
| | order_datetime | origin_longitude | origin_latitude | m_order_eta | order_gk | order_status_key | is_driver_assigned_key | cancellations_time_in_seconds | hex_id |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 18:08:07 | -0.978916 | 51.456173 | 60.0 | 3000583041974 | 4 | 1 | 198.0 | 88195d2b03fffff |
| 1 | 20:57:32 | -0.950385 | 51.456843 | NaN | 3000583116437 | 4 | 0 | 128.0 | 88195d2b19fffff |
| 2 | 12:07:50 | -0.969520 | 51.455544 | 477.0 | 3000582891479 | 4 | 1 | 46.0 | 88195d2b1dfffff |
| 3 | 13:50:20 | -1.054671 | 51.460544 | 658.0 | 3000582941169 | 4 | 1 | 62.0 | 88195d7497fffff |
| 4 | 21:24:45 | -0.967605 | 51.458236 | NaN | 3000583140877 | 9 | 0 | NaN | 88195d2b1dfffff |
In [587]:
hex_total_orders = data_orders.groupby('hex_id').size().reset_index(name='total_orders')
failed_orders = data_orders[data_orders['order_status_key'].isin([4, 9])]
hex_failed_orders = failed_orders.groupby('hex_id').size().reset_index(name='fail_count')
In [588]:
hex_data = pd.merge(hex_total_orders, hex_failed_orders, on='hex_id', how='left')
hex_data['fail_count'] = hex_data['fail_count'].fillna(0).astype(int)
In [589]:
hex_data_sorted = hex_data.sort_values(by='total_orders', ascending=False)
eighty_percent_target = data_orders.shape[0] * 0.80  # 80% of all orders (every row is a failed order)
In [590]:
hex_data_sorted['cumulative_orders'] = hex_data_sorted['total_orders'].cumsum()
# First row where the running total crosses the 80% target; .loc label slicing
# on the unique index keeps every row down to and including that one
cutoff_index = hex_data_sorted[hex_data_sorted['cumulative_orders'] >= eighty_percent_target].index[0]
top_hexes = hex_data_sorted.loc[:cutoff_index]
In [591]:
num_top_hexes = len(top_hexes)
print(f"Number of size 8 hexes containing 80% of all orders: {num_top_hexes}")
Number of size 8 hexes containing 80% of all orders: 24
In [592]:
map_center = [data_orders['origin_latitude'].mean(), data_orders['origin_longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=12, tiles='cartodbpositron')
for index, row in top_hexes.iterrows():
hex_id = row['hex_id']
fail_count = row['fail_count']
total_orders = row['total_orders']
    boundary_tuples = list(h3.cell_to_boundary(hex_id))  # (lat, lon) pairs, the order folium expects
if fail_count > 150:
color = '#d73027'
elif fail_count > 75:
color = '#fdae61'
elif fail_count > 25:
color = '#fee090'
else:
color = '#abd9e9'
folium.Polygon(
locations=boundary_tuples,
color="black",
weight=1,
fill_color=color,
fill_opacity=0.7,
tooltip=f"Hex ID: {hex_id}<br>Failed Orders: {fail_count}<br>Total Orders: {total_orders}"
).add_to(m)
map_filename = 'failure_hotspot_map.html'
m.save(map_filename)
Analysis¶
- Just 24 resolution-8 hexagons account for 80% of all failed orders, highlighting extreme geographic concentration.
- The map below highlights the areas with the most failed orders; the sketch after it lists the worst hexes numerically.
In [593]:
from IPython.display import IFrame
IFrame(src='failure_hotspot_map.html', width=700, height=600)
Out[593]:
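For a non-visual view of the same hotspots, a minimal sketch that lists the ten hexes with the most failed orders, reusing `top_hexes` from above:

worst_hexes = top_hexes.sort_values('fail_count', ascending=False)
print(worst_hexes[['hex_id', 'fail_count', 'total_orders']].head(10))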