Fundamental operations for working with data across different tools and libraries.
import numpy as np # Create a 1D array from a list arr1 = np.array([1, 2, 3, 4, 5]) # Create a 2D array (matrix) arr2 = np.array([[1, 2, 3], [4, 5, 6]]) # Create specialized arrays zeros = np.zeros((3, 3)) # 3x3 matrix of zeros ones = np.ones((2, 4)) # 2x4 matrix of ones identity = np.eye(3) # 3x3 identity matrix range_arr = np.arange(0, 10, 2) # Array from 0 to 8 with step 2 lin_space = np.linspace(0, 1, 5) # 5 evenly spaced values from 0 to 1 random_arr = np.random.rand(3, 3) # 3x3 array of random values [0,1) empty_arr = np.empty((2, 3)) # Uninitialized array (faster) full_arr = np.full((2, 2), 7) # Array filled with a specific value diag_arr = np.diag([1, 2, 3]) # Diagonal matrix
import pandas as pd # Create DataFrame from dictionary df1 = pd.DataFrame({ "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35], "city": ["New York", "San Francisco", "Boston"] }) # Create DataFrame from lists data = [["Alice", 25, "New York"], ["Bob", 30, "San Francisco"], ["Charlie", 35, "Boston"]] df2 = pd.DataFrame(data, columns=["name", "age", "city"]) # Create DataFrame from external sources df3 = pd.read_csv("data.csv") df4 = pd.read_excel("data.xlsx", sheet_name="Sheet1") df5 = pd.read_json("data.json") df6 = pd.read_sql("SELECT * FROM users", con=connection)
-- Create a new table CREATE TABLE users ( id INTEGER PRIMARY KEY, name TEXT NOT NULL, age INTEGER, email TEXT UNIQUE, signup_date DATE DEFAULT CURRENT_DATE ); -- Insert a single row INSERT INTO users (name, age, email) VALUES ('Alice', 25, '[email protected]'); -- Insert multiple rows INSERT INTO users (name, age, email) VALUES ('Bob', 30, '[email protected]'), ('Charlie', 35, '[email protected]'), ('Diana', 28, '[email protected]'); -- Create a table from a query (useful for temporary tables) CREATE TABLE active_users AS SELECT * FROM users WHERE last_login > date('now', '-30 days');
Operation | NumPy | Pandas | SQLite |
---|---|---|---|
Get unique values | np.unique(arr) |
df['column'].unique() |
SELECT DISTINCT column FROM table |
Count unique values | len(np.unique(arr)) |
df['column'].nunique() |
SELECT COUNT(DISTINCT column) FROM table |
Unique with counts | np.unique(arr, return_counts=True) |
df['column'].value_counts() |
SELECT column, COUNT(*) FROM table GROUP BY column |
Unique combinations | np.unique(arr, axis=0) |
df.drop_duplicates(subset=['col1', 'col2']) |
SELECT DISTINCT col1, col2 FROM table |
def top_actors_by_film_count(df): # Drop duplicate film-actor combinations unique_films = df[['actor', 'film_id']].drop_duplicates() # Count number of films per actor actor_counts = unique_films.groupby('actor').size().reset_index(name='count') # Sort by count descending, then actor name ascending actor_counts = actor_counts.sort_values(by=['count', 'actor'], ascending=[False, True]) # Get the 10th highest film count value (or less if there are fewer actors) top_10_cutoff = actor_counts.iloc[9]['count'] if len(actor_counts) >= 10 else actor_counts['count'].min() # Include all actors with at least as many films as the 10th top_actors = actor_counts[actor_counts['count'] >= top_10_cutoff].reset_index(drop=True) return top_actors
import pandas as pd # Example 1: Finding top actors by film count def top_actors_by_film_count(df): # Drop duplicate film-actor combinations unique_films = df[['actor', 'film_id']].drop_duplicates() # Count number of films per actor actor_counts = unique_films.groupby('actor').size().reset_index(name='count') # Sort by count descending, then actor name ascending actor_counts = actor_counts.sort_values(by=['count', 'actor'], ascending=[False, True]) # Get the 10th highest film count value (or less if there are fewer actors) top_10_cutoff = actor_counts.iloc[9]['count'] if len(actor_counts) >= 10 else actor_counts['count'].min() # Include all actors with at least as many films as the 10th top_actors = actor_counts[actor_counts['count'] >= top_10_cutoff].reset_index(drop=True) return top_actors # Example 2: Get the years in which a given actor has appeared in films # Returns a dictionary {actor_name: [year1, year2, ...]} def actor_years(df, actor_name): # Filter rows by the given actor actor_df = df[df['actor'] == actor_name] # Extract unique years and sort them years = sorted(actor_df['year'].drop_duplicates()) return {actor_name: years} # Example 3: Analyze co-occurrences using cross-tabulation def analyze_actor_collaborations(df): # Get unique film-actor pairs film_actors = df[['film_id', 'actor']].drop_duplicates() # Create a cross-tabulation matrix # Each cell represents if an actor (row) was in a film (column) actor_film_matrix = pd.crosstab(film_actors['actor'], film_actors['film_id']) # Convert to binary values (1 = appeared in film, 0 = did not appear) binary_matrix = (actor_film_matrix > 0).astype(int) # Calculate co-occurrence matrix (actor × actor) # Each cell represents how many films two actors appeared in together co_occurrence = binary_matrix.dot(binary_matrix.T) # Set diagonal to zero (to exclude self-collaborations) np.fill_diagonal(co_occurrence.values, 0) return co_occurrence # Example 4: Time series trend analysis with rolling windows and resampling def analyze_film_trends(df): # Convert to datetime and set as index df['release_date'] = pd.to_datetime(df['release_date']) df = df.set_index('release_date') # Count films per month monthly_counts = df.resample('M').size() # Calculate trailing 12-month average yearly_trend = monthly_counts.rolling(window=12).mean() # Calculate year-over-year growth yoy_growth = monthly_counts.pct_change(periods=12) * 100 # Identify seasonal patterns monthly_pattern = monthly_counts.groupby(monthly_counts.index.month).mean() return { 'monthly_counts': monthly_counts, 'yearly_trend': yearly_trend, 'yoy_growth': yoy_growth, 'seasonal_pattern': monthly_pattern } # Example 5: Complex multi-condition filtering and custom metrics def analyze_performance(df): # Create financial metrics df['roi'] = (df['revenue'] - df['budget']) / df['budget'] * 100 df['profit'] = df['revenue'] - df['budget'] # Define a custom success metric df['success_score'] = ( df['roi'] * 0.5 + # 50% weight to ROI df['rating'] * 10 * 0.3 + # 30% weight to rating (scaled) df['award_nominations'] * 0.2 # 20% weight to award nominations ) # Multiple complex filters with custom conditions successful_films = df[ (df['roi'] > 100) & # ROI greater than 100% (df['rating'] > 7.5) & # High rating ( (df['budget'] < 50000000) | # Either low budget (df['award_nominations'] > 5) # Or critically acclaimed ) ] # Group, aggregate, and rank success_by_studio = df.groupby('studio').agg({ 'success_score': 'mean', 'roi': 'mean', 'film_id': 'count' }).rename(columns={'film_id': 'total_films'}) # Filter to studios with at least 5 films success_by_studio = success_by_studio[success_by_studio['total_films'] >= 5] # Rank studios by success score success_by_studio['rank'] = success_by_studio['success_score'].rank(ascending=False) return { 'successful_films': successful_films, 'studio_performance': success_by_studio.sort_values('rank') }
-- Example 1: Finding top actors by film count WITH actor_film_counts AS ( SELECT actor_name, COUNT(DISTINCT film_id) AS film_count FROM film_actors GROUP BY actor_name ), top_cutoff AS ( SELECT film_count FROM actor_film_counts ORDER BY film_count DESC LIMIT 1 OFFSET 9 ) SELECT actor_name, film_count FROM actor_film_counts WHERE film_count >= (SELECT film_count FROM top_cutoff) ORDER BY film_count DESC, actor_name ASC; -- Example 2: Get the years in which a given actor has appeared in films SELECT DISTINCT strftime('%Y', release_date) AS year FROM films f JOIN film_actors fa ON f.film_id = fa.film_id WHERE fa.actor_name = 'Tom Hanks' ORDER BY year; -- Example 3: Analyze actor collaborations using self-join WITH film_actors_distinct AS ( SELECT DISTINCT film_id, actor_name FROM film_actors ) SELECT a1.actor_name AS actor1, a2.actor_name AS actor2, COUNT(DISTINCT a1.film_id) AS collaboration_count FROM film_actors_distinct a1 JOIN film_actors_distinct a2 ON a1.film_id = a2.film_id AND a1.actor_name < a2.actor_name -- Avoid duplicates and self-joins GROUP BY a1.actor_name, a2.actor_name HAVING COUNT(DISTINCT a1.film_id) > 1 -- More than one film together ORDER BY collaboration_count DESC, actor1, actor2; -- Example 4: Time series trend analysis by month and year WITH monthly_counts AS ( SELECT strftime('%Y-%m', release_date) AS month, COUNT(*) AS film_count FROM films GROUP BY month ), yearly_totals AS ( SELECT strftime('%Y', release_date) AS year, COUNT(*) AS yearly_count FROM films GROUP BY year ), monthly_averages AS ( SELECT strftime('%m', release_date) AS month_num, AVG(COUNT(*)) AS avg_count FROM films GROUP BY month_num ) SELECT m.month, m.film_count, y.yearly_count, m.film_count * 100.0 / y.yearly_count AS percent_of_year FROM monthly_counts m JOIN yearly_totals y ON strftime('%Y', m.month) = y.year ORDER BY m.month; -- Example 5: Complex multi-condition filtering with custom metrics WITH film_metrics AS ( SELECT f.film_id, f.title, f.studio, f.revenue, f.budget, f.rating, f.award_nominations, (f.revenue - f.budget) AS profit, ((f.revenue - f.budget) * 100.0 / f.budget) AS roi, ( ((f.revenue - f.budget) * 100.0 / f.budget) * 0.5 + (f.rating * 10) * 0.3 + (f.award_nominations) * 0.2 ) AS success_score FROM films f ), studio_performance AS ( SELECT studio, COUNT(*) AS total_films, AVG(success_score) AS avg_success_score, AVG(roi) AS avg_roi FROM film_metrics GROUP BY studio HAVING COUNT(*) >= 5 ) SELECT studio, total_films, avg_success_score, avg_roi, RANK() OVER (ORDER BY avg_success_score DESC) AS success_rank FROM studio_performance ORDER BY success_rank;