Python Panel Select with Chart

Creating a page with the Panel library where selecting an option updates the chart.

Chart with Matplotlib

Select with Chart
import panel as pn
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
import numpy as np

pn.extension()  # to display the Bokeh-based output in the notebook


# Load the data
def carregar_dados():
    df = pd.read_excel('dados.xlsx')

    # better presented as a string
    # df['COD'] = df['COD'].astype(str)

    # Build the options for the select widget
    combo_depe = df['COD'].unique().tolist()
    return df, combo_depe


df, combo_depe = carregar_dados()
depes_select = pn.widgets.Select(options=combo_depe, name="Dependência")


# Matplotlib charts for Panel are built with a Figure object
def create_chart(cod=1126):
    selected_category = cod
    filtered_df = df.query('COD == @selected_category')
    fig1 = Figure()
    ax1 = fig1.subplots()
    ax1.plot(filtered_df['anomes'], filtered_df['rlz'])
    return fig1



# Add the select widget and the chart.
# Matplotlib figures must be wrapped in pn.pane.Matplotlib;
# pn.bind ties the widget's value to the function's cod parameter.
pn.Column(
    depes_select, 
    pn.pane.Matplotlib(
        pn.bind(create_chart, cod=depes_select)
    )
).servable()
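
A self-contained variant for trying the pattern without dados.xlsx (a sketch with synthetic data; the COD, anomes and rlz column names are kept from the snippet above):

import numpy as np
import pandas as pd
import panel as pn
from matplotlib.figure import Figure

pn.extension()

# synthetic stand-in for dados.xlsx: two codes, twelve periods each
df = pd.DataFrame({
    "COD": np.repeat([1126, 2234], 12),
    "anomes": np.tile(np.arange(1, 13), 2),
    "rlz": np.random.default_rng(0).random(24),
})

select = pn.widgets.Select(options=df["COD"].unique().tolist(), name="Dependência")

def create_chart(cod):
    # redraw the line for the currently selected code
    sub = df.query("COD == @cod")
    fig = Figure(figsize=(6, 3))
    ax = fig.subplots()
    ax.plot(sub["anomes"], sub["rlz"])
    return fig

pn.Column(select, pn.pane.Matplotlib(pn.bind(create_chart, cod=select))).servable()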

Pandas

Create a DataFrame

  • reading data
# from a csv file
titanic = pd.read_csv("data/titanic.csv")

# from excel
titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers")

# first 8 rows
titanic.head(8)

# last 10 rows
titanic.tail(10)
  • exporting data
# to excel
titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False)
  • from a list
estados = ["Ceará", "Piauí", "Bahia"]
pd.DataFrame(estados, columns=["Estado"])
  • from a dictionary
import pandas as pd

data = {"Name": ["John", "Jane", "Mary", "Adam"],
        "City": ["New York", "Los Angeles", "Chicago", "Houston"]}
df = pd.DataFrame(data)
  • column with null values
import numpy as np

frame = pd.DataFrame(
    {"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]}
)

Series

  • create a Series
ages = pd.Series([22, 35, 58], name="Age")

Each DataFrame column is a Series

df["Age"]

DataFrame structure

  • column types
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
  • DataFrame information such as data types and columns
titanic.info()
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
  • create a derived column:
air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.882

air_quality["ratio_paris_antwerp"] = (
    air_quality["station_paris"] / air_quality["station_antwerp"]
)
  • rename columns
air_quality_renamed = air_quality.rename(
    columns={
        "station_antwerp": "BETR801",
        "station_paris": "FR04014",
        "station_london": "London Westminster",
    }
)

air_quality_renamed = air_quality_renamed.rename(columns=str.lower)
  • sort a DataFrame
links = pd.DataFrame(projects_data).sort_values(by=['pipeline_status'], ascending=True)
  • set the column order
links = links[['Projeto','Relatorio','pipeline_status','pipeline_duration','emoji']]

Projection and selection on the DataFrame

  • select a column
# age of the Titanic passengers.
ages = titanic["Age"]

type(titanic["Age"])
pandas.core.series.Series

titanic["Age"].shape
  • select multiple columns
# age and sex of the Titanic passengers.
age_sex = titanic[["Age", "Sex"]]

type(titanic[["Age", "Sex"]])
Out[10]: pandas.core.frame.DataFrame
  • filter rows
# passengers older than 35 years.
above_35 = titanic[titanic["Age"] > 35]


# interested in the Titanic passengers from cabin class 2 and 3.
class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]


# the age is known.
age_no_na = titanic[titanic["Age"].notna()]


# interested in the names of the passengers older than 35 years.
adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
  • select specific rows and columns
# interested in rows 10 till 25 and columns 3 to 5.
titanic.iloc[9:25, 2:5]
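
An aside not in the original notes: iloc is position-based and end-exclusive, while loc is label-based and end-inclusive, so with the default RangeIndex the equivalent label-based selection is:

titanic.loc[9:24, "Pclass":"Sex"]  # same rows and columns as titanic.iloc[9:25, 2:5]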
  • filter a column by string
import pandas as pd

data = {"Name": ["John", "Jane", "Mary", "Adam"],
        "City": ["New York", "Los Angeles", "Chicago", "Houston"]}
df = pd.DataFrame(data)

# filter by a single string
filtered_df = df[df["City"].str.contains("Los Angeles")]
print(filtered_df)

# filter by more than one string
filtered_df = df[df["City"].str.contains("Los Angeles|Chicago")]
print(filtered_df)

# filter by a list of strings
filtered_df = df[df["City"].isin(["Los Angeles", "Chicago"])]
print(filtered_df)
  • set a value in specific rows and columns
# to assign the name anonymous to the first 3 elements of the fourth column:
titanic.iloc[0:3, 3] = "anonymous"

Data wrangling

Make all name characters lowercase.

titanic["Name"].str.lower()

Create a new column Surname that contains the surname of the passengers by extracting the part before the comma.

titanic["Name"].str.split(",")
titanic["Surname"] = titanic["Name"].str.split(",").str.get(0)

Extract the passenger data about the countesses on board of the Titanic.

titanic["Name"].str.contains("Countess")
titanic[titanic["Name"].str.contains("Countess")]

Which passenger of the Titanic has the longest name?

titanic["Name"].str.len()

titanic["Name"].str.len().idxmax()

titanic.loc[titanic["Name"].str.len().idxmax(), "Name"]

In the “Sex” column, replace values of “male” by “M” and values of “female” by “F”.

titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", "female": "F"})

Statistics

  • on numeric data
df.describe()  # df here holds three ages (22, 35, 58); describe() summarizes numeric columns

Age
count   3.000000
mean   38.333333
std    18.230012
min    22.000000
25%    28.500000
50%    35.000000
75%    46.500000
max    58.000000
df["Age"].max()
ages.max()
# average age of the Titanic passengers?
titanic["Age"].mean()


# median age and ticket fare price of the Titanic passengers?
titanic[["Age", "Fare"]].median()


titanic[["Age", "Fare"]].describe()
Age        Fare
count  714.000000  891.000000
mean    29.699118   32.204208
std     14.526497   49.693429
min      0.420000    0.000000
25%     20.125000    7.910400
50%     28.000000   14.454200
75%     38.000000   31.000000
max     80.000000  512.329200
  • specific combinations of aggregating statistics for given columns
titanic.agg(
    {
        "Age": ["min", "max", "median", "skew"],
        "Fare": ["min", "max", "median", "mean"],
    }
)

Age        Fare
min      0.420000    0.000000
max     80.000000  512.329200
median  28.000000   14.454200
skew     0.389108         NaN
mean          NaN   32.204208

Aggregations

statistics grouped by category

What is the average age for male versus female Titanic passengers?
titanic[["Sex", "Age"]].groupby("Sex").mean()


The method is applied to each numeric column by passing numeric_only=True:

titanic.groupby("Sex").mean(numeric_only=True)


If we are only interested in the average age for each gender, the selection of columns (rectangular brackets [] as usual) is supported on the grouped data as well:

titanic.groupby("Sex")["Age"].mean()



What is the mean ticket fare price for each of the sex and cabin class combinations?

titanic.groupby(["Sex", "Pclass"])["Fare"].mean()

Count number of records by category

# What is the number of passengers in each of the cabin classes?
titanic["Pclass"].value_counts()
# shortcut for
titanic.groupby("Pclass")["Pclass"].count()

Reshaping tables

Sort table rows

I want to sort the Titanic data according to the age of the passengers.

titanic.sort_values(by="Age").head()

sort the Titanic data according to the cabin class and age in descending order.

titanic.sort_values(by=['Pclass', 'Age'], ascending=False).head()

Only use the first two measurements of each location. First keep the NO2 rows:

no2 = air_quality[air_quality["parameter"] == "no2"]

then take 2 measurements (head) for each location (groupby):

no2_subset = no2.sort_index().groupby(["location"]).head(2)

I want the values for the three stations as separate columns next to each other.

no2_subset.pivot(columns="location", values="value")

I want the mean concentrations for NO2 and PM2.5 in each of the stations in table form.

air_quality.pivot_table( values="value", index="location", columns="parameter", aggfunc="mean" )

In the case of pivot(), the data is only rearranged. When multiple values need to be aggregated (in this specific case, the values on different time steps), pivot_table() can be used, providing an aggregation function (e.g. mean) on how to combine these values.

air_quality.pivot_table(
    values="value",
    index="location",
    columns="parameter",
    aggfunc="mean",
    margins=True,
)

same result as

air_quality.groupby(["parameter", "location"])[["value"]].mean()

Wide to long format

Starting again from the wide format table created in the previous section, we add a new index to the DataFrame with reset_index().

no2_pivoted = no2.pivot(columns="location", values="value").reset_index()

location                  date.utc  BETR801  FR04014  London Westminster
0        2019-04-09 01:00:00+00:00     22.5     24.4                 NaN
1        2019-04-09 02:00:00+00:00     53.5     27.4                67.0
2        2019-04-09 03:00:00+00:00     54.5     34.2                67.0
3        2019-04-09 04:00:00+00:00     34.5     48.5                41.0
4        2019-04-09 05:00:00+00:00     46.5     59.5                41.0

I want to collect all air quality measurements in a single column (long format).

no_2 = no2_pivoted.melt(id_vars="date.utc")

date.utc location  value
0 2019-04-09 01:00:00+00:00  BETR801   22.5
1 2019-04-09 02:00:00+00:00  BETR801   53.5
2 2019-04-09 03:00:00+00:00  BETR801   54.5
3 2019-04-09 04:00:00+00:00  BETR801   34.5
4 2019-04-09 05:00:00+00:00  BETR801   46.5

The pandas.melt() method on a DataFrame converts the data table from wide format to long format. The column headers become the variable names in a newly created column.

The parameters passed to pandas.melt() can be defined in more detail:

no_2 = no2_pivoted.melt(
    id_vars="date.utc",
    value_vars=["BETR801", "FR04014", "London Westminster"],
    value_name="NO_2",
    var_name="id_location",
)

date.utc id_location  NO_2
0 2019-04-09 01:00:00+00:00     BETR801  22.5
1 2019-04-09 02:00:00+00:00     BETR801  53.5
2 2019-04-09 03:00:00+00:00     BETR801  54.5
3 2019-04-09 04:00:00+00:00     BETR801  34.5
4 2019-04-09 05:00:00+00:00     BETR801  46.5

The additional parameters have the following effects:

value_vars defines which columns to melt together

value_name provides a custom column name for the values column instead of the default column name value

var_name provides a custom column name for the column collecting the column header names. Otherwise it takes the index name or a default variable

Hence, the arguments value_name and var_name are just user-defined names for the two generated columns. The columns to melt are defined by id_vars and value_vars.

Combining data

combine data from multiple tables

I want to combine the measurements of NO2 and PM2.5, two tables with a similar structure, in a single table.

air_quality = pd.concat([air_quality_pm25, air_quality_no2], axis=0)

Join tables using a common identifier

air_quality = pd.merge(air_quality, stations_coord, how="left", on="location")


air_quality = pd.merge(air_quality, air_quality_parameters,
                       how='left', left_on='parameter', right_on='id')

Time series

How to handle time series data with ease

Using pandas datetime properties

I want to work with the dates in the column datetime as datetime objects instead of plain text

air_quality["datetime"] = pd.to_datetime(air_quality["datetime"])



pd.read_csv("../data/air_quality_no2_long.csv", parse_dates=["datetime"])

pandas.Timestamp for datetimes enables us to calculate with date information and make them comparable. Hence, we can use this to get the length of our time series:

air_quality["datetime"].max() - air_quality["datetime"].min()
Out[10]: Timedelta('44 days 23:00:00')

I want to add a new column to the DataFrame containing only the month of the measurement

air_quality["month"] = air_quality["datetime"].dt.month

What is the average NO2 concentration for each day of the week for each of the measurement locations?

air_quality.groupby(
    [air_quality["datetime"].dt.weekday, "location"])["value"].mean()

Plot the typical NO2 pattern during the day of our time series of all stations together. In other words, what is the average value for each hour of the day?

fig, axs = plt.subplots(figsize=(12, 4))

air_quality.groupby(air_quality["datetime"].dt.hour)["value"].mean().plot(
    kind='bar', rot=0, ax=axs
)

Out[15]: <Axes: xlabel='datetime'>

plt.xlabel("Hour of the day");  # custom x label using Matplotlib

plt.ylabel("$NO_2 (µg/m^3)$");

data table with each of the measurement locations as a separate column:

no_2 = air_quality.pivot(index="datetime", columns="location", values="value")

no_2.head()

no_2.index.year, no_2.index.weekday

Create a plot of the NO2 values in the different stations from the 20th of May till the end of the 21st of May.

no_2["2019-05-20":"2019-05-21"].plot();

Aggregate the current hourly time series values to the monthly maximum value in each of the stations.

monthly_max = no_2.resample("ME").max()

A very powerful method on time series data with a datetime index is the ability to resample() a time series to another frequency (e.g., converting secondly data into 5-minutely data).
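
For example (a synthetic sketch, not from the notes), downsampling a per-second series to 5-minute means:

import numpy as np
import pandas as pd

idx = pd.date_range("2019-05-20", periods=600, freq="s")
per_second = pd.Series(np.arange(600.0), index=idx)
per_second.resample("5min").mean()  # one mean per 5-minute bucket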

Make a plot of the daily mean NO2 value in each of the stations.

no_2.resample("D").mean().plot(style="-o", figsize=(10, 5));

concat() takes a list or dict of homogeneously-typed objects and concatenates them.

df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)

df2 = pd.DataFrame(
    {
        "A": ["A4", "A5", "A6", "A7"],
        "B": ["B4", "B5", "B6", "B7"],
        "C": ["C4", "C5", "C6", "C7"],
        "D": ["D4", "D5", "D6", "D7"],
    },
    index=[4, 5, 6, 7],
)

df3 = pd.DataFrame(
    {
        "A": ["A8", "A9", "A10", "A11"],
        "B": ["B8", "B9", "B10", "B11"],
        "C": ["C8", "C9", "C10", "C11"],
        "D": ["D8", "D9", "D10", "D11"],
    },
    index=[8, 9, 10, 11],
)

frames = [df1, df2, df3]
result = pd.concat(frames)
result
Out[6]: 
      A    B    C    D
0    A0   B0   C0   D0
1    A1   B1   C1   D1
2    A2   B2   C2   D2
3    A3   B3   C3   D3
4    A4   B4   C4   D4
5    A5   B5   C5   D5
6    A6   B6   C6   D6
7    A7   B7   C7   D7
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11
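
A dict works too: its keys become an extra outer index level, equivalent to the keys argument shown further below (a small sketch reusing df1 and df2):

pd.concat({"x": df1, "y": df2})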

Joining logic of the resulting axis

The join keyword specifies how to handle axis values that don’t exist in the first DataFrame.

join='outer' takes the union of all axis values

df4 = pd.DataFrame(
    {
        "B": ["B2", "B3", "B6", "B7"],
        "D": ["D2", "D3", "D6", "D7"],
        "F": ["F2", "F3", "F6", "F7"],
    },
    index=[2, 3, 6, 7],
)

result = pd.concat([df1, df4], axis=1)

result
Out[9]: 
     A    B    C    D    B    D    F
0   A0   B0   C0   D0  NaN  NaN  NaN
1   A1   B1   C1   D1  NaN  NaN  NaN
2   A2   B2   C2   D2   B2   D2   F2
3   A3   B3   C3   D3   B3   D3   F3
6  NaN  NaN  NaN  NaN   B6   D6   F6
7  NaN  NaN  NaN  NaN   B7   D7   F7
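
By contrast, join='inner' keeps only the axis labels shared by every frame; with df1 and df4 above, only indices 2 and 3 remain (expected result sketched in the comment):

result = pd.concat([df1, df4], axis=1, join="inner")
#     A   B   C   D   B   D   F
# 2  A2  B2  C2  D2  B2  D2  F2
# 3  A3  B3  C3  D3  B3  D3  F3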

Concatenating Series and DataFrame together

You can concatenate a mix of Series and DataFrame objects. The Series will be transformed to DataFrame with the column name as the name of the Series.

s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X")

result = pd.concat([df1, s1], axis=1)

result
Out[18]: 
    A   B   C   D   X
0  A0  B0  C0  D0  X0
1  A1  B1  C1  D1  X1
2  A2  B2  C2  D2  X2
3  A3  B3  C3  D3  X3

Resulting keys

The keys argument adds another level to the resulting index or columns (creating a MultiIndex) to associate specific keys with each original DataFrame.

result = pd.concat(frames, keys=["x", "y", "z"])

result
Out[25]: 
        A    B    C    D
x 0    A0   B0   C0   D0
  1    A1   B1   C1   D1
  2    A2   B2   C2   D2
  3    A3   B3   C3   D3
y 4    A4   B4   C4   D4
  5    A5   B5   C5   D5
  6    A6   B6   C6   D6
  7    A7   B7   C7   D7
z 8    A8   B8   C8   D8
  9    A9   B9   C9   D9
  10  A10  B10  C10  D10
  11  A11  B11  C11  D11

result.loc["y"]
Out[26]: 
    A   B   C   D
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7

Appending rows to a DataFrame

If you have a Series that you want to append as a single row to a DataFrame, you can convert the row into a DataFrame and use concat()

s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"])

result = pd.concat([df1, s2.to_frame().T], ignore_index=True)

MERGE

merge() performs join operations similar to relational databases like SQL.

For a many-to-many join, if a key combination appears more than once in both tables, the DataFrame will have the Cartesian product of the associated data.
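
A minimal sketch of that many-to-many case (hypothetical frames; both repeat key K0, so the merge yields 2 x 2 = 4 rows):

left_m = pd.DataFrame({"key": ["K0", "K0"], "A": ["A0", "A1"]})
right_m = pd.DataFrame({"key": ["K0", "K0"], "C": ["C0", "C1"]})
pd.merge(left_m, right_m, on="key")  # 4 rows: every A paired with every C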

left = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)

right = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)

result = pd.merge(left, right, on="key")

result
Out[47]: 
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3

left = pd.DataFrame(
   {
      "key1": ["K0", "K0", "K1", "K2"],
      "key2": ["K0", "K1", "K0", "K1"],
      "A": ["A0", "A1", "A2", "A3"],
      "B": ["B0", "B1", "B2", "B3"],
   }
)

right = pd.DataFrame(
   {
      "key1": ["K0", "K1", "K1", "K2"],
      "key2": ["K0", "K0", "K0", "K0"],
      "C": ["C0", "C1", "C2", "C3"],
      "D": ["D0", "D1", "D2", "D3"],
   }
)

result = pd.merge(left, right, how="left", on=["key1", "key2"])

result
Out[51]: 
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN

result = pd.merge(left, right, how="right", on=["key1", "key2"])

result
Out[53]: 
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3

result = pd.merge(left, right, how="outer", on=["key1", "key2"])

result
Out[55]: 
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K0  NaN  NaN   C3   D3
5   K2   K1   A3   B3  NaN  NaN

result = pd.merge(left, right, how="inner", on=["key1", "key2"])

result
Out[57]: 
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2

result = pd.merge(left, right, how="cross")

result
Out[59]: 
   key1_x key2_x   A   B key1_y key2_y   C   D
0      K0     K0  A0  B0     K0     K0  C0  D0
1      K0     K0  A0  B0     K1     K0  C1  D1
2      K0     K0  A0  B0     K1     K0  C2  D2
3      K0     K0  A0  B0     K2     K0  C3  D3
4      K0     K1  A1  B1     K0     K0  C0  D0
..    ...    ...  ..  ..    ...    ...  ..  ..
11     K1     K0  A2  B2     K2     K0  C3  D3
12     K2     K1  A3  B3     K0     K0  C0  D0
13     K2     K1  A3  B3     K1     K0  C1  D1
14     K2     K1  A3  B3     K1     K0  C2  D2
15     K2     K1  A3  B3     K2     K0  C3  D3

Merge key uniqueness

The validate argument checks the uniqueness of merge keys. Uniqueness is checked before the merge operation and can protect against memory overflows and unexpected key duplication.

left = pd.DataFrame({"A": [1, 2], "B": [1, 2]})

right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]})

result = pd.merge(left, right, on="B", how="outer", validate="one_to_one")
---------------------------------------------------------------------------
MergeError                                Traceback (most recent call last)
Cell In[71], line 1
----> 1 result = pd.merge(left, right, on="B", how="outer", validate="one_to_one")

MergeError: Merge keys are not unique in right dataset; not a one-to-one merge

If the user is aware of the duplicates in the right DataFrame but wants to ensure there are no duplicates in the left DataFrame, one can use the validate='one_to_many' argument instead, which will not raise an exception.

pd.merge(left, right, on="B", how="outer", validate="one_to_many")
Out[72]: 
   A_x  B  A_y
0    1  1  NaN
1    2  2  4.0
2    2  2  5.0
3    2  2  6.0

Merge result indicator

merge() accepts the argument indicator. If True, a Categorical-type column called _merge will be added to the output object that takes on values:

Observation Origin               _merge value
Merge key only in 'left' frame   left_only
Merge key only in 'right' frame  right_only
Merge key in both frames         both

df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]})

df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]})

pd.merge(df1, df2, on="col1", how="outer", indicator=True)
Out[75]: 
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only

A string argument to indicator will use the value as the name for the indicator column.

pd.merge(df1, df2, on="col1", how="outer", indicator="indicator_column")
Out[76]: 
   col1 col_left  col_right indicator_column
0     0        a        NaN        left_only
1     1        b        2.0             both
2     2      NaN        2.0       right_only
3     2      NaN        2.0       right_only

Overlapping value columns

The merge suffixes argument takes a tuple or list of strings to append to overlapping column names in the input DataFrames to disambiguate the result columns:

left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]})

right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]})

result = pd.merge(left, right, on="k")

result
Out[80]: 
    k  v_x  v_y
0  K0    1    4
1  K0    1    5

result = pd.merge(left, right, on="k", suffixes=("_l", "_r"))

result
Out[82]: 
    k  v_l  v_r
0  K0    1    4
1  K0    1    5

JOIN

JOINs can be performed with join() or merge(). By default, join() will join the DataFrames on their indices. Each method has parameters allowing you to specify the type of join to perform (LEFT, RIGHT, INNER, FULL) or the columns to join on (column names or indices).

df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})
INNER JOIN
SELECT *
FROM df1
INNER JOIN df2
  ON df1.key = df2.key;

merge performs an INNER JOIN by default

pd.merge(df1, df2, on="key")
Out[26]: 
  key   value_x   value_y
0   B -0.282863  1.212112
1   D -1.135632 -0.173215
2   D -1.135632  0.119209
LEFT OUTER JOIN

Show all records from df1.

SELECT *
FROM df1
LEFT OUTER JOIN df2
  ON df1.key = df2.key;
pd.merge(df1, df2, on="key", how="left")
Out[29]: 
  key   value_x   value_y
0   A  0.469112       NaN
1   B -0.282863  1.212112
2   C -1.509059       NaN
3   D -1.135632 -0.173215
4   D -1.135632  0.119209
RIGHT JOIN

Show all records from df2.

SELECT *
FROM df1
RIGHT OUTER JOIN df2
  ON df1.key = df2.key;
pd.merge(df1, df2, on="key", how="right")
Out[30]: 
  key   value_x   value_y
0   B -0.282863  1.212112
1   D -1.135632 -0.173215
2   D -1.135632  0.119209
3   E       NaN -1.044236
FULL JOIN

pandas also allows for FULL JOINs, which display both sides of the dataset whether or not the joined columns find a match. As of this writing, FULL JOINs are not supported in all RDBMS (e.g., MySQL).

Show all records from both tables.

SELECT *
FROM df1
FULL OUTER JOIN df2
  ON df1.key = df2.key;
pd.merge(df1, df2, on="key", how="outer")
Out[31]: 
  key   value_x   value_y
0   A  0.469112       NaN
1   B -0.282863  1.212112
2   C -1.509059       NaN
3   D -1.135632 -0.173215
4   D -1.135632  0.119209
5   E       NaN -1.044236

UNION ALL

UNION ALL can be performed using concat().

df1 = pd.DataFrame(
    {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)}
)

df2 = pd.DataFrame(
    {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]}
)
SELECT city, rank
FROM df1
UNION ALL
SELECT city, rank
FROM df2;
/*
         city  rank
      Chicago     1
San Francisco     2
New York City     3
      Chicago     1
       Boston     4
  Los Angeles     5
*/
pd.concat([df1, df2])
Out[34]: 
            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3
0        Chicago     1
1         Boston     4
2    Los Angeles     5

UNION

SQL’s UNION is similar to UNION ALL, however UNION will remove duplicate rows.

SELECT city, rank
FROM df1
UNION
SELECT city, rank
FROM df2;
-- notice that there is only one Chicago record this time
/*
         city  rank
      Chicago     1
San Francisco     2
New York City     3
       Boston     4
  Los Angeles     5
*/

In pandas, you can use concat() in conjunction with drop_duplicates().

pd.concat([df1, df2]).drop_duplicates()
Out[35]: 
            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3
1         Boston     4
2    Los Angeles     5

LIMIT

SELECT * FROM tips LIMIT 10;

tips.head(10)
Out[36]: 
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
6        8.77  2.00    Male     No  Sun  Dinner     2
7       26.88  3.12    Male     No  Sun  Dinner     4
8       15.04  1.96    Male     No  Sun  Dinner     2
9       14.78  3.23    Male     No  Sun  Dinner     2