Complete Data Operations Cheatsheet: SQLite, NumPy, Pandas

To Pandas

Table of Contents

🟢 Basic Operations

Fundamental operations for working with data across different tools and libraries.

Basic Creating Data

Purpose: Initialize arrays, tables, and DataFrames for analysis.

NumPy Creating Arrays

import numpy as np

# 1-D array built from a Python list
arr1 = np.array([1, 2, 3, 4, 5])

# 2-D array (two rows, three columns)
arr2 = np.array([[1, 2, 3], [4, 5, 6]])

# Constant-valued and structured arrays
zeros = np.zeros((3, 3))           # all-zero 3x3 matrix
ones = np.ones((2, 4))             # all-one 2x4 matrix
full_arr = np.full((2, 2), 7)      # 2x2 matrix, every entry 7
identity = np.eye(3)               # 3x3 identity matrix
diag_arr = np.diag([1, 2, 3])      # square matrix with [1, 2, 3] on the diagonal

# Evenly spaced sequences
range_arr = np.arange(0, 10, 2)    # 0, 2, 4, 6, 8 (stop is exclusive)
lin_space = np.linspace(0, 1, 5)   # five equally spaced points covering [0, 1]

# Unpredictable contents
random_arr = np.random.rand(3, 3)  # uniform samples from [0, 1)
empty_arr = np.empty((2, 3))       # uninitialized memory — fast, but garbage values
Scenario: Used in simulations, machine learning initializations, or creating matrices for scientific computing.

Pandas Creating DataFrames

import pandas as pd

# --- Build a DataFrame directly in memory ---

# Create DataFrame from dictionary
# (each key becomes a column; all value lists must have the same length)
df1 = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "city": ["New York", "San Francisco", "Boston"]
})

# Create DataFrame from lists
# (row-oriented records; column labels are supplied separately)
data = [["Alice", 25, "New York"], 
        ["Bob", 30, "San Francisco"], 
        ["Charlie", 35, "Boston"]]
df2 = pd.DataFrame(data, columns=["name", "age", "city"])

# --- Load a DataFrame from external sources ---
# NOTE(review): these lines assume "data.csv" / "data.xlsx" / "data.json"
# exist in the working directory, and that `connection` is an already-open
# DB-API connection defined elsewhere — confirm before running this snippet.
df3 = pd.read_csv("data.csv")
df4 = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df5 = pd.read_json("data.json")
df6 = pd.read_sql("SELECT * FROM users", con=connection)
Scenario: Loading customer data from a CSV, importing survey results, or retrieving user profiles from a database.

SQLite Creating Tables and Inserting Data

-- Create a new table
CREATE TABLE users (
  id INTEGER PRIMARY KEY,                  -- rowid alias; auto-assigned when omitted
  name TEXT NOT NULL,
  age INTEGER,
  email TEXT UNIQUE,                       -- duplicate emails are rejected
  signup_date DATE DEFAULT CURRENT_DATE    -- defaults to the insert date
);

-- Insert a single row
INSERT INTO users (name, age, email)
VALUES ('Alice', 25, '[email protected]');

-- Insert multiple rows
INSERT INTO users (name, age, email) VALUES
  ('Bob', 30, '[email protected]'),
  ('Charlie', 35, '[email protected]'),
  ('Diana', 28, '[email protected]');

-- Create a table from a query (useful for temporary tables)
-- Fixed: the original filtered on a last_login column that the users
-- table above never defines; signup_date is the column that exists.
CREATE TABLE active_users AS
SELECT * FROM users WHERE signup_date > date('now', '-30 days');
Scenario: Setting up a database schema for a web application, creating a customer tracking system, or importing data from a CSV into a database.

Selecting Distinct/Unique Values

Operation NumPy Pandas SQLite
Get unique values np.unique(arr) df['column'].unique() SELECT DISTINCT column FROM table
Count unique values len(np.unique(arr)) df['column'].nunique() SELECT COUNT(DISTINCT column) FROM table
Unique with counts np.unique(arr, return_counts=True) df['column'].value_counts() SELECT column, COUNT(*) FROM table GROUP BY column
Unique combinations np.unique(arr, axis=0) df.drop_duplicates(subset=['col1', 'col2']) SELECT DISTINCT col1, col2 FROM table

Advanced Data Transformation Example

def top_actors_by_film_count(df):
    """Top actors by number of distinct films, with 10th-place ties included.

    Expects columns 'actor' and 'film_id'. Returns a DataFrame with
    columns ['actor', 'count'], sorted by count (desc) then actor (asc),
    containing every actor whose film count ties or beats the 10th-ranked
    actor — or all actors when fewer than ten exist.
    """
    # One row per (actor, film) pairing.
    pairs = df.loc[:, ['actor', 'film_id']].drop_duplicates()

    # Films per actor, ranked: most films first, alphabetical among ties.
    tally = (
        pairs.groupby('actor')
        .size()
        .reset_index(name='count')
        .sort_values(['count', 'actor'], ascending=[False, True])
    )

    # Threshold = the 10th-ranked count; with fewer than ten actors, fall
    # back to the minimum so everyone qualifies.
    if len(tally) >= 10:
        threshold = tally['count'].iat[9]
    else:
        threshold = tally['count'].min()

    keep = tally['count'] >= threshold
    return tally.loc[keep].reset_index(drop=True)
Scenario: Analyzing a film database to find the top actors by film count, with a tie-breaking mechanism that includes all actors with the same count as the 10th ranked actor.

Advanced Advanced Data Transformations

Purpose: Perform complex multi-step transformations on data to derive insights and prepare data for analysis.

Pandas Advanced Data Transformation Patterns

import pandas as pd

# Example 1: Finding top actors by film count
def top_actors_by_film_count(df):
    """Return actors ranked by distinct-film count, keeping everyone who
    ties the 10th-ranked actor (or all actors when fewer than 10 exist).

    Expects 'actor' and 'film_id' columns; yields a ['actor', 'count']
    DataFrame sorted by count descending, then actor ascending.
    """
    # Collapse repeated credits so each (actor, film) pairing counts once.
    dedup = df.drop_duplicates(subset=['actor', 'film_id'])[['actor', 'film_id']]

    # Films per actor as a two-column frame.
    counts = dedup.groupby('actor').size().reset_index(name='count')

    # Rank: most films first; alphabetical among ties.
    counts = counts.sort_values(by=['count', 'actor'], ascending=[False, True])

    # The qualifying threshold is the 10th-ranked count; a roster shorter
    # than ten actors keeps everyone (threshold = smallest count).
    cutoff = counts['count'].iloc[9] if len(counts) >= 10 else counts['count'].min()

    # Everyone at or above the threshold makes the list.
    return counts[counts['count'] >= cutoff].reset_index(drop=True)

# Example 2: Get the years in which a given actor has appeared in films
# Returns a dictionary {actor_name: [year1, year2, ...]}
def actor_years(df, actor_name):
    """Map *actor_name* to the ascending list of distinct years in which
    that actor appears in *df* (an empty list if the actor is absent)."""
    mask = df['actor'] == actor_name
    distinct_years = df.loc[mask, 'year'].drop_duplicates()
    return {actor_name: sorted(distinct_years)}

# Example 3: Analyze co-occurrences using cross-tabulation
def analyze_actor_collaborations(df):
    """Build a symmetric actor-by-actor matrix of shared film appearances.

    Expects columns 'film_id' and 'actor'. Cell [a, b] holds the number of
    films in which actors a and b both appeared; the diagonal is zeroed so
    an actor is not counted as collaborating with themselves.
    """
    # Fixed: this snippet only imports pandas at the top, so np was an
    # undefined name; import it locally where it is needed.
    import numpy as np

    # Get unique film-actor pairs
    film_actors = df[['film_id', 'actor']].drop_duplicates()

    # Cross-tabulation: rows = actors, columns = films.
    actor_film_matrix = pd.crosstab(film_actors['actor'], film_actors['film_id'])

    # Convert to binary values (1 = appeared in film, 0 = did not appear)
    binary_matrix = (actor_film_matrix > 0).astype(int)

    # B @ B.T gives, for each actor pair, the number of films shared.
    co_occurrence = binary_matrix.dot(binary_matrix.T)

    # Zero the diagonal to exclude self-collaborations.
    np.fill_diagonal(co_occurrence.values, 0)

    return co_occurrence

# Example 4: Time series trend analysis with rolling windows and resampling
def analyze_film_trends(df):
    """Derive release-cadence statistics from a 'release_date' column.

    Returns a dict with:
      monthly_counts   - films released per calendar month
      yearly_trend     - trailing 12-month average of the monthly counts
      yoy_growth       - percent change vs the same month one year earlier
      seasonal_pattern - mean count per month-of-year (index 1-12)

    Fixed: the previous version converted the caller's 'release_date'
    column to datetime in place; this one leaves the input untouched.
    """
    # Work on a datetime-indexed copy rather than mutating the argument.
    indexed = (
        df.assign(release_date=pd.to_datetime(df['release_date']))
          .set_index('release_date')
    )

    # Count films per month ('M' = month-end frequency).
    monthly_counts = indexed.resample('M').size()

    # Trailing 12-month average smooths out seasonality.
    yearly_trend = monthly_counts.rolling(window=12).mean()

    # Year-over-year growth, in percent (NaN for the first 12 months).
    yoy_growth = monthly_counts.pct_change(periods=12) * 100

    # Average count per calendar month reveals seasonal patterns.
    seasonal = monthly_counts.groupby(monthly_counts.index.month).mean()

    return {
        'monthly_counts': monthly_counts,
        'yearly_trend': yearly_trend,
        'yoy_growth': yoy_growth,
        'seasonal_pattern': seasonal
    }

# Example 5: Complex multi-condition filtering and custom metrics
def analyze_performance(df):
    """Score films and studios on financial and critical success.

    Expects columns: revenue, budget, rating, award_nominations, studio,
    film_id. Returns a dict with 'successful_films' (films passing the
    success filters) and 'studio_performance' (per-studio averages and
    rank, limited to studios with at least 5 films).

    Fixed: the previous version added roi/profit/success_score columns to
    the caller's DataFrame in place; this one works on a copy.
    """
    df = df.copy()

    # Create financial metrics
    df['roi'] = (df['revenue'] - df['budget']) / df['budget'] * 100
    df['profit'] = df['revenue'] - df['budget']

    # Define a custom success metric (weights sum to 1.0)
    df['success_score'] = (
        df['roi'] * 0.5 +  # 50% weight to ROI
        df['rating'] * 10 * 0.3 +  # 30% weight to rating (scaled toward 0-100)
        df['award_nominations'] * 0.2  # 20% weight to award nominations
    )

    # Multiple complex filters with custom conditions
    successful_films = df[
        (df['roi'] > 100) &  # ROI greater than 100%
        (df['rating'] > 7.5) &  # High rating
        (
            (df['budget'] < 50000000) |  # Either low budget
            (df['award_nominations'] > 5)  # Or critically acclaimed
        )
    ]

    # Group, aggregate, and rank
    success_by_studio = df.groupby('studio').agg({
        'success_score': 'mean',
        'roi': 'mean',
        'film_id': 'count'
    }).rename(columns={'film_id': 'total_films'})

    # Filter to studios with at least 5 films
    success_by_studio = success_by_studio[success_by_studio['total_films'] >= 5]

    # Rank studios by success score (1 = best)
    success_by_studio['rank'] = success_by_studio['success_score'].rank(ascending=False)

    return {
        'successful_films': successful_films,
        'studio_performance': success_by_studio.sort_values('rank')
    }
Scenario: Film industry analysis extracting insights about actor collaborations, studio performance, and film success metrics using complex data transformations.

SQLite Advanced Multi-step Queries

-- Example 1: Finding top actors by film count
WITH actor_film_counts AS (
    -- Distinct films per actor.
    SELECT
        actor_name,
        COUNT(DISTINCT film_id) AS film_count
    FROM film_actors
    GROUP BY actor_name
),
top_cutoff AS (
    -- Film count of the 10th-ranked actor
    -- (empty result set when fewer than 10 actors exist).
    SELECT
        film_count
    FROM actor_film_counts
    ORDER BY film_count DESC
    LIMIT 1 OFFSET 9
)
SELECT
    actor_name,
    film_count
FROM actor_film_counts
-- Fixed: with fewer than 10 actors the bare subquery is NULL, and
-- `film_count >= NULL` filters out every row; COALESCE keeps everyone,
-- matching the pandas version of this analysis.
WHERE film_count >= COALESCE((SELECT film_count FROM top_cutoff), 0)
ORDER BY film_count DESC, actor_name ASC;

-- Example 2: Get the years in which a given actor has appeared in films
-- (GROUP BY on the derived year plays the role of DISTINCT here).
SELECT strftime('%Y', release_date) AS year
FROM films f
JOIN film_actors fa USING (film_id)
WHERE fa.actor_name = 'Tom Hanks'
GROUP BY year
ORDER BY year;

-- Example 3: Analyze actor collaborations using self-join
WITH film_actors_distinct AS (
    -- Guard against duplicate credit rows: one row per (film, actor).
    SELECT DISTINCT film_id, actor_name
    FROM film_actors
)
-- Pair up actors who share a film; the `<` ordering keeps each pair once
-- (actor1 alphabetically before actor2) and drops self-pairs.
SELECT 
    a1.actor_name AS actor1,
    a2.actor_name AS actor2,
    COUNT(DISTINCT a1.film_id) AS collaboration_count
FROM film_actors_distinct a1
JOIN film_actors_distinct a2 
    ON a1.film_id = a2.film_id 
    AND a1.actor_name < a2.actor_name  -- Avoid duplicates and self-joins
GROUP BY a1.actor_name, a2.actor_name
HAVING COUNT(DISTINCT a1.film_id) > 1  -- More than one film together
ORDER BY collaboration_count DESC, actor1, actor2;

-- Example 4: Time series trend analysis by month and year
WITH monthly_counts AS (
    -- Films released in each calendar month, keyed by 'YYYY-MM'.
    SELECT
        strftime('%Y-%m', release_date) AS month,
        COUNT(*) AS film_count
    FROM films
    GROUP BY month
),
yearly_totals AS (
    SELECT
        strftime('%Y', release_date) AS year,
        COUNT(*) AS yearly_count
    FROM films
    GROUP BY year
),
monthly_averages AS (
    -- Average films per calendar month across all years.
    -- Fixed: the original AVG(COUNT(*)) is an illegal nested aggregate in
    -- SQLite ("misuse of aggregate"); aggregate the already-computed
    -- monthly counts instead.
    SELECT
        substr(month, 6, 2) AS month_num,
        AVG(film_count) AS avg_count
    FROM monthly_counts
    GROUP BY month_num
)
SELECT
    m.month,
    m.film_count,
    y.yearly_count,
    m.film_count * 100.0 / y.yearly_count AS percent_of_year
FROM monthly_counts m
-- Fixed: m.month is a plain 'YYYY-MM' string, not a full date, so
-- strftime('%Y', m.month) returned NULL and the join matched nothing;
-- take the 4-character year prefix with substr() instead.
JOIN yearly_totals y ON substr(m.month, 1, 4) = y.year
ORDER BY m.month;

-- Example 5: Complex multi-condition filtering with custom metrics
WITH film_metrics AS (
    -- Per-film financial metrics plus a weighted composite score:
    -- 50% ROI, 30% rating (scaled by 10), 20% award nominations.
    SELECT 
        f.film_id,
        f.title,
        f.studio,
        f.revenue,
        f.budget,
        f.rating,
        f.award_nominations,
        (f.revenue - f.budget) AS profit,
        ((f.revenue - f.budget) * 100.0 / f.budget) AS roi,
        (
            ((f.revenue - f.budget) * 100.0 / f.budget) * 0.5 +
            (f.rating * 10) * 0.3 +
            (f.award_nominations) * 0.2
        ) AS success_score
    FROM films f
),
studio_performance AS (
    -- Studio-level averages, restricted to studios with >= 5 films so a
    -- single lucky release cannot dominate the ranking.
    SELECT 
        studio,
        COUNT(*) AS total_films,
        AVG(success_score) AS avg_success_score,
        AVG(roi) AS avg_roi
    FROM film_metrics
    GROUP BY studio
    HAVING COUNT(*) >= 5
)
-- Final ranking: RANK() assigns 1 to the highest average success score.
SELECT 
    studio,
    total_films,
    avg_success_score,
    avg_roi,
    RANK() OVER (ORDER BY avg_success_score DESC) AS success_rank
FROM studio_performance
ORDER BY success_rank;
Scenario: Building complex SQL queries for database-driven analytics on film data, identifying patterns, and creating ranked lists for business intelligence dashboards.
] url = "http://95.110.228.29:3031/events" headers = { "accept": "application/json", "Content-Type": "application/json" } def send_request(country): if country in done: print(f"[{country}] Already imported") return payload = { "event_type": "crawl.radio", "token":"7878", "payload": { "country": country } } try: print(f"[{country}] Getting Radios...") response = requests.post(url, json=payload, headers=headers, timeout=None) print(f"[{country}] {response.status_code} - {response.text}") done.append(country) except Exception as e: print(f"[{country}] Error: {e}") # Limit thread count to avoid CPU overload MAX_THREADS = 5 start = time.time() with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor: executor.map(send_request, countries) print(f"All done in {round(time.time() - start, 2)}s for {len(done)} Countries")
↑