import pandas as pd
Wczytujemy wiele tabel. Każda z nich ma pierwszą kolumnę pełniącą rolę klucza (indeksu).
employees = pd.read_csv('tabele/employees.csv', index_col=0, parse_dates=['hire_date'])
jobs = pd.read_csv('tabele/jobs.csv', index_col=0)
departments = pd.read_csv('tabele/departments.csv', index_col=0)
locations = pd.read_csv('tabele/locations.csv', index_col=0)
countries = pd.read_csv('tabele/countries.csv', index_col=0)
employees
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | ||
---|---|---|---|---|---|---|---|---|---|---|
employee_id | ||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 |
103 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 |
104 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
202 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 |
203 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 |
204 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 |
205 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 |
206 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 |
107 rows × 10 columns
Dla pracownika kolumna department_id
zawiera numer departamentu, w którym pracownik jest zatrudniony, a w tabeli departments
dla podanego department_id
są podane szczegóły departamentu: nazwa, kto jest szefem, jaki jest numer lokalizacji. kolei w tabeli locations
dla numeru lokalizacji podane są dane adresowe.
departments
department_name | manager_id | location_id | |
---|---|---|---|
department_id | |||
10 | Administration | 200.0 | 1700 |
20 | Marketing | 201.0 | 1800 |
30 | Purchasing | 114.0 | 1700 |
40 | Human Resources | 203.0 | 2400 |
50 | Shipping | 121.0 | 1500 |
60 | IT | 103.0 | 1400 |
70 | Public Relations | 204.0 | 2700 |
80 | Sales | 145.0 | 2500 |
90 | Executive | 100.0 | 1700 |
100 | Finance | 108.0 | 1700 |
110 | Accounting | 205.0 | 1700 |
120 | Treasury | NaN | 1700 |
130 | Corporate Tax | NaN | 1700 |
140 | Control And Credit | NaN | 1700 |
150 | Shareholder Services | NaN | 1700 |
160 | Benefits | NaN | 1700 |
170 | Manufacturing | NaN | 1700 |
180 | Construction | NaN | 1700 |
190 | Contracting | NaN | 1700 |
200 | Operations | NaN | 1700 |
210 | IT Support | NaN | 1700 |
220 | NOC | NaN | 1700 |
230 | IT Helpdesk | NaN | 1700 |
240 | Government Sales | NaN | 1700 |
250 | Retail Sales | NaN | 1700 |
260 | Recruiting | NaN | 1700 |
270 | Payroll | NaN | 1700 |
locations
street_address | postal_code | city | state_province | country_id | |
---|---|---|---|---|---|
location_id | |||||
1000 | 1297 Via Cola di Rie | 00989 | Roma | NaN | IT |
1100 | 93091 Calle della Testa | 10934 | Venice | NaN | IT |
1200 | 2017 Shinjuku-ku | 1689 | Tokyo | Tokyo Prefecture | JP |
1300 | 9450 Kamiya-cho | 6823 | Hiroshima | NaN | JP |
1400 | 2014 Jabberwocky Rd | 26192 | Southlake | Texas | US |
1500 | 2011 Interiors Blvd | 99236 | South San Francisco | California | US |
1600 | 2007 Zagora St | 50090 | South Brunswick | New Jersey | US |
1700 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
1800 | 147 Spadina Ave | M5V 2L7 | Toronto | Ontario | CA |
1900 | 6092 Boxwood St | YSW 9T2 | Whitehorse | Yukon | CA |
2000 | 40-5-12 Laogianggen | 190518 | Beijing | NaN | CN |
2100 | 1298 Vileparle (E) | 490231 | Bombay | Maharashtra | IN |
2200 | 12-98 Victoria Street | 2901 | Sydney | New South Wales | AU |
2300 | 198 Clementi North | 540198 | Singapore | NaN | SG |
2400 | 8204 Arthur St | NaN | London | NaN | UK |
2500 | Magdalen Centre, The Oxford Science Park | OX9 9ZB | Oxford | Oxford | UK |
2600 | 9702 Chester Road | 09629850293 | Stretford | Manchester | UK |
2700 | Schwanthalerstr. 7031 | 80925 | Munich | Bavaria | DE |
2800 | Rua Frei Caneca 1360 | 01307-002 | Sao Paulo | Sao Paulo | BR |
2900 | 20 Rue des Corps-Saints | 1730 | Geneva | Geneve | CH |
3000 | Murtenstrasse 921 | 3095 | Bern | BE | CH |
3100 | Pieter Breughelstraat 837 | 3029SK | Utrecht | Utrecht | NL |
3200 | Mariano Escobedo 9991 | 11932 | Mexico City | Distrito Federal, | MX |
jobs
job_title | min_salary | max_salary | |
---|---|---|---|
job_id | |||
AD_PRES | President | 20000.0 | 40000.0 |
AD_VP | Administration Vice President | 15000.0 | 30000.0 |
AD_ASST | Administration Assistant | 3000.0 | 6000.0 |
FI_MGR | Finance Manager | 8200.0 | 16000.0 |
FI_ACCOUNT | Accountant | 4200.0 | 9000.0 |
AC_MGR | Accounting Manager | 8200.0 | 16000.0 |
AC_ACCOUNT | Public Accountant | 4200.0 | 9000.0 |
SA_MAN | Sales Manager | 10000.0 | 20000.0 |
SA_REP | Sales Representative | 6000.0 | 12000.0 |
PU_MAN | Purchasing Manager | 8000.0 | 15000.0 |
PU_CLERK | Purchasing Clerk | 2500.0 | 5500.0 |
ST_MAN | Stock Manager | 5500.0 | 8500.0 |
ST_CLERK | Stock Clerk | 2000.0 | 5000.0 |
SH_CLERK | Shipping Clerk | 2500.0 | 5500.0 |
IT_PROG | Programmer | 4000.0 | 10000.0 |
MK_MAN | Marketing Manager | 9000.0 | 15000.0 |
MK_REP | Marketing Representative | 4000.0 | 9000.0 |
HR_REP | Human Resources Representative | 4000.0 | 9000.0 |
PR_REP | Public Relations Representative | 4500.0 | 10500.0 |
Aby pokazać łączenie na różne sposoby, tablę jobs
wczytamy także z innymi ustawieniami.
W tej wersji job_id
nie jest indeksem, tylko jest zwykłą kolumną.
jobs_bez_indeksu = pd.read_csv('tabele/jobs.csv')
jobs_bez_indeksu
job_id | job_title | min_salary | max_salary | |
---|---|---|---|---|
0 | AD_PRES | President | 20000.0 | 40000.0 |
1 | AD_VP | Administration Vice President | 15000.0 | 30000.0 |
2 | AD_ASST | Administration Assistant | 3000.0 | 6000.0 |
3 | FI_MGR | Finance Manager | 8200.0 | 16000.0 |
4 | FI_ACCOUNT | Accountant | 4200.0 | 9000.0 |
5 | AC_MGR | Accounting Manager | 8200.0 | 16000.0 |
6 | AC_ACCOUNT | Public Accountant | 4200.0 | 9000.0 |
7 | SA_MAN | Sales Manager | 10000.0 | 20000.0 |
8 | SA_REP | Sales Representative | 6000.0 | 12000.0 |
9 | PU_MAN | Purchasing Manager | 8000.0 | 15000.0 |
10 | PU_CLERK | Purchasing Clerk | 2500.0 | 5500.0 |
11 | ST_MAN | Stock Manager | 5500.0 | 8500.0 |
12 | ST_CLERK | Stock Clerk | 2000.0 | 5000.0 |
13 | SH_CLERK | Shipping Clerk | 2500.0 | 5500.0 |
14 | IT_PROG | Programmer | 4000.0 | 10000.0 |
15 | MK_MAN | Marketing Manager | 9000.0 | 15000.0 |
16 | MK_REP | Marketing Representative | 4000.0 | 9000.0 |
17 | HR_REP | Human Resources Representative | 4000.0 | 9000.0 |
18 | PR_REP | Public Relations Representative | 4500.0 | 10500.0 |
Łączenie tabel (w pandas nazywa się to merge
, a odpowiada konstrukcji JOIN
z SQL) polega na tym, że obok danych pochodzących z jendej tabeli umieszczamy (w dodatkowych kolumnach) dane podzące z innej tabeli.
Są różne spsooby, aby powiedzieć, które rekordy z "prawej" tabeli mają zostać dobrane do rekordów z "lewej".
Łączenie w oparciu o wartości w kolumnach¶
W tabeli employees
oraz w tabeli jobs_bez_id
istnieje jedna kolumna o wspólnej nazwie job_id
.
Nazwa jest ta sama. I to właśnie w oparciu o wartość tego pola chcemy dopasować dane stanowisk do danych pracowników.
employees.merge(jobs_bez_indeksu)
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | job_title | min_salary | max_salary | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | President | 20000.0 | 40000.0 |
1 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
2 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
3 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
4 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
102 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing Representative | 4000.0 | 9000.0 |
103 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources Representative | 4000.0 | 9000.0 |
104 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations Representative | 4500.0 | 10500.0 |
105 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting Manager | 8200.0 | 16000.0 |
106 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Public Accountant | 4200.0 | 9000.0 |
107 rows × 13 columns
Zazwyczaj nie idzie aż tak łatwo, dlatego poznamy różne sposoby wskazywania, co do czego ma pasować. Utwórzmy jeszcze jedną wersję tabeli jobs - tym razem taką, gdzie kolumna ma inną nazwę.
jobs_inna_nazwa = jobs_bez_indeksu.rename(columns={'job_id': 'stanowisko'})
jobs_inna_nazwa
stanowisko | job_title | min_salary | max_salary | |
---|---|---|---|---|
0 | AD_PRES | President | 20000.0 | 40000.0 |
1 | AD_VP | Administration Vice President | 15000.0 | 30000.0 |
2 | AD_ASST | Administration Assistant | 3000.0 | 6000.0 |
3 | FI_MGR | Finance Manager | 8200.0 | 16000.0 |
4 | FI_ACCOUNT | Accountant | 4200.0 | 9000.0 |
5 | AC_MGR | Accounting Manager | 8200.0 | 16000.0 |
6 | AC_ACCOUNT | Public Accountant | 4200.0 | 9000.0 |
7 | SA_MAN | Sales Manager | 10000.0 | 20000.0 |
8 | SA_REP | Sales Representative | 6000.0 | 12000.0 |
9 | PU_MAN | Purchasing Manager | 8000.0 | 15000.0 |
10 | PU_CLERK | Purchasing Clerk | 2500.0 | 5500.0 |
11 | ST_MAN | Stock Manager | 5500.0 | 8500.0 |
12 | ST_CLERK | Stock Clerk | 2000.0 | 5000.0 |
13 | SH_CLERK | Shipping Clerk | 2500.0 | 5500.0 |
14 | IT_PROG | Programmer | 4000.0 | 10000.0 |
15 | MK_MAN | Marketing Manager | 9000.0 | 15000.0 |
16 | MK_REP | Marketing Representative | 4000.0 | 9000.0 |
17 | HR_REP | Human Resources Representative | 4000.0 | 9000.0 |
18 | PR_REP | Public Relations Representative | 4500.0 | 10500.0 |
Najogólniej kolumny, których wartość mają do siebie pasować podczas łączenia tabel, podajemy w parametrach left_on
i right_on
.
employees.merge(jobs_inna_nazwa, left_on='job_id', right_on='stanowisko')
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | stanowisko | job_title | min_salary | max_salary | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | AD_PRES | President | 20000.0 | 40000.0 |
1 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | AD_VP | Administration Vice President | 15000.0 | 30000.0 |
2 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | AD_VP | Administration Vice President | 15000.0 | 30000.0 |
3 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | IT_PROG | Programmer | 4000.0 | 10000.0 |
4 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT_PROG | Programmer | 4000.0 | 10000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
102 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | MK_REP | Marketing Representative | 4000.0 | 9000.0 |
103 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | HR_REP | Human Resources Representative | 4000.0 | 9000.0 |
104 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | PR_REP | Public Relations Representative | 4500.0 | 10500.0 |
105 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | AC_MGR | Accounting Manager | 8200.0 | 16000.0 |
106 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | AC_ACCOUNT | Public Accountant | 4200.0 | 9000.0 |
107 rows × 14 columns
employees.merge(jobs_bez_indeksu, left_on='job_id', right_on='job_id')
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | job_title | min_salary | max_salary | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | President | 20000.0 | 40000.0 |
1 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
2 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
3 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
4 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
102 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing Representative | 4000.0 | 9000.0 |
103 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources Representative | 4000.0 | 9000.0 |
104 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations Representative | 4500.0 | 10500.0 |
105 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting Manager | 8200.0 | 16000.0 |
106 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Public Accountant | 4200.0 | 9000.0 |
107 rows × 13 columns
Gdy nazwa kolumny w jednej i drugiej tabeli jest taka sama, to zamiast left_on
i right_on
z jednakową nazwą, możemy użyć parametru on
. To odpowiada JOIN USING
z SQL-a.
employees.merge(jobs_bez_indeksu, on='job_id')
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | job_title | min_salary | max_salary | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | President | 20000.0 | 40000.0 |
1 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
2 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
3 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
4 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
102 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing Representative | 4000.0 | 9000.0 |
103 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources Representative | 4000.0 | 9000.0 |
104 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations Representative | 4500.0 | 10500.0 |
105 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting Manager | 8200.0 | 16000.0 |
106 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Public Accountant | 4200.0 | 9000.0 |
107 rows × 13 columns
Gdy nie podamy żadnego kryterium, to merge
działa tak jak NATURAL JOIN
w SQL i sam znajduje kolumny o jednakowych nazwach w obu tabelach i dobiera rekordy tak, aby wartości w tych polach były równe.
employees.merge(jobs_bez_indeksu)
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | job_title | min_salary | max_salary | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | President | 20000.0 | 40000.0 |
1 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
2 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
3 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
4 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
102 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing Representative | 4000.0 | 9000.0 |
103 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources Representative | 4000.0 | 9000.0 |
104 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations Representative | 4500.0 | 10500.0 |
105 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting Manager | 8200.0 | 16000.0 |
106 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Public Accountant | 4200.0 | 9000.0 |
107 rows × 13 columns
Dla jobów to działa, ale technika jest ryzykowna - łatwo może dojść do przypadkowej zbieżności nazw, której nie jesteśmy świadomi. Np. w tabelach employees
oraz departments
istnieje kolumna manager_id
, która oznacza kot jest przełożonym pracownika lub kto jest szefem departamentu. Pandas spróbuje połączyć dane w oparciu o tę kolumnę, a da to dziwnacze efekty:
employees.merge(departments)
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | department_name | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Treasury | 1700 |
1 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Corporate Tax | 1700 |
2 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Control And Credit | 1700 |
3 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Shareholder Services | 1700 |
4 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Benefits | 1700 |
5 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Manufacturing | 1700 |
6 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Construction | 1700 |
7 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Contracting | 1700 |
8 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Operations | 1700 |
9 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | IT Support | 1700 |
10 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | NOC | 1700 |
11 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | IT Helpdesk | 1700 |
12 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Government Sales | 1700 |
13 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Retail Sales | 1700 |
14 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Recruiting | 1700 |
15 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Payroll | 1700 |
16 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 1700 |
17 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 1700 |
18 | Den | Raphaely | DRAPHEAL | 515.127.4561 | 1994-12-07 | PU_MAN | 11000.0 | NaN | 100.0 | 30.0 | Executive | 1700 |
19 | Matthew | Weiss | MWEISS | 650.123.1234 | 1996-07-18 | ST_MAN | 8000.0 | NaN | 100.0 | 50.0 | Executive | 1700 |
20 | Adam | Fripp | AFRIPP | 650.123.2234 | 1997-04-10 | ST_MAN | 8200.0 | NaN | 100.0 | 50.0 | Executive | 1700 |
21 | Payam | Kaufling | PKAUFLIN | 650.123.3234 | 1995-05-01 | ST_MAN | 7900.0 | NaN | 100.0 | 50.0 | Executive | 1700 |
22 | Shanta | Vollman | SVOLLMAN | 650.123.4234 | 1997-10-10 | ST_MAN | 6500.0 | NaN | 100.0 | 50.0 | Executive | 1700 |
23 | Kevin | Mourgos | KMOURGOS | 650.123.5234 | 1999-11-16 | ST_MAN | 5800.0 | NaN | 100.0 | 50.0 | Executive | 1700 |
24 | John | Russell | JRUSSEL | 011.44.1344.429268 | 1996-10-01 | SA_MAN | 14000.0 | 0.40 | 100.0 | 80.0 | Executive | 1700 |
25 | Karen | Partners | KPARTNER | 011.44.1344.467268 | 1997-01-05 | SA_MAN | 13500.0 | 0.30 | 100.0 | 80.0 | Executive | 1700 |
26 | Alberto | Errazuriz | AERRAZUR | 011.44.1344.429278 | 1997-03-10 | SA_MAN | 12000.0 | 0.30 | 100.0 | 80.0 | Executive | 1700 |
27 | Gerald | Cambrault | GCAMBRAU | 011.44.1344.619268 | 1999-10-15 | SA_MAN | 11000.0 | 0.30 | 100.0 | 80.0 | Executive | 1700 |
28 | Eleni | Zlotkey | EZLOTKEY | 011.44.1344.429018 | 2000-01-29 | SA_MAN | 10500.0 | 0.20 | 100.0 | 80.0 | Executive | 1700 |
29 | Michael | Hartstein | MHARTSTE | 515.123.5555 | 1996-02-17 | MK_MAN | 13000.0 | NaN | 100.0 | 20.0 | Executive | 1700 |
30 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT | 1400 |
31 | David | Austin | DAUSTIN | 590.423.4569 | 1997-06-25 | IT_PROG | 4800.0 | NaN | 103.0 | 60.0 | IT | 1400 |
32 | Valli | Pataballa | VPATABAL | 590.423.4560 | 1998-02-05 | IT_PROG | 4800.0 | NaN | 103.0 | 60.0 | IT | 1400 |
33 | Diana | Lorentz | DLORENTZ | 590.423.5567 | 1999-02-07 | IT_PROG | 4200.0 | NaN | 103.0 | 60.0 | IT | 1400 |
34 | Daniel | Faviet | DFAVIET | 515.124.4169 | 1994-08-16 | FI_ACCOUNT | 9000.0 | NaN | 108.0 | 100.0 | Finance | 1700 |
35 | John | Chen | JCHEN | 515.124.4269 | 1997-09-28 | FI_ACCOUNT | 8200.0 | NaN | 108.0 | 100.0 | Finance | 1700 |
36 | Ismael | Sciarra | ISCIARRA | 515.124.4369 | 1997-09-30 | FI_ACCOUNT | 7700.0 | NaN | 108.0 | 100.0 | Finance | 1700 |
37 | Jose Manuel | Urman | JMURMAN | 515.124.4469 | 1998-03-07 | FI_ACCOUNT | 7800.0 | NaN | 108.0 | 100.0 | Finance | 1700 |
38 | Luis | Popp | LPOPP | 515.124.4567 | 1999-12-07 | FI_ACCOUNT | 6900.0 | NaN | 108.0 | 100.0 | Finance | 1700 |
39 | Alexander | Khoo | AKHOO | 515.127.4562 | 1995-05-18 | PU_CLERK | 3100.0 | NaN | 114.0 | 30.0 | Purchasing | 1700 |
40 | Shelli | Baida | SBAIDA | 515.127.4563 | 1997-12-24 | PU_CLERK | 2900.0 | NaN | 114.0 | 30.0 | Purchasing | 1700 |
41 | Sigal | Tobias | STOBIAS | 515.127.4564 | 1997-07-24 | PU_CLERK | 2800.0 | NaN | 114.0 | 30.0 | Purchasing | 1700 |
42 | Guy | Himuro | GHIMURO | 515.127.4565 | 1998-11-15 | PU_CLERK | 2600.0 | NaN | 114.0 | 30.0 | Purchasing | 1700 |
43 | Karen | Colmenares | KCOLMENA | 515.127.4566 | 1999-08-10 | PU_CLERK | 2500.0 | NaN | 114.0 | 30.0 | Purchasing | 1700 |
44 | Laura | Bissot | LBISSOT | 650.124.5234 | 1997-08-20 | ST_CLERK | 3300.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
45 | Mozhe | Atkinson | MATKINSO | 650.124.6234 | 1997-10-30 | ST_CLERK | 2800.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
46 | James | Marlow | JAMRLOW | 650.124.7234 | 1997-02-16 | ST_CLERK | 2500.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
47 | TJ | Olson | TJOLSON | 650.124.8234 | 1999-04-10 | ST_CLERK | 2100.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
48 | Nandita | Sarchand | NSARCHAN | 650.509.1876 | 1996-01-27 | SH_CLERK | 4200.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
49 | Alexis | Bull | ABULL | 650.509.2876 | 1997-02-20 | SH_CLERK | 4100.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
50 | Julia | Dellinger | JDELLING | 650.509.3876 | 1998-06-24 | SH_CLERK | 3400.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
51 | Anthony | Cabrio | ACABRIO | 650.509.4876 | 1999-02-07 | SH_CLERK | 3000.0 | NaN | 121.0 | 50.0 | Shipping | 1500 |
52 | Peter | Tucker | PTUCKER | 011.44.1344.129268 | 1997-01-30 | SA_REP | 10000.0 | 0.30 | 145.0 | 80.0 | Sales | 2500 |
53 | David | Bernstein | DBERNSTE | 011.44.1344.345268 | 1997-03-24 | SA_REP | 9500.0 | 0.25 | 145.0 | 80.0 | Sales | 2500 |
54 | Peter | Hall | PHALL | 011.44.1344.478968 | 1997-08-20 | SA_REP | 9000.0 | 0.25 | 145.0 | 80.0 | Sales | 2500 |
55 | Christopher | Olsen | COLSEN | 011.44.1344.498718 | 1998-03-30 | SA_REP | 8000.0 | 0.20 | 145.0 | 80.0 | Sales | 2500 |
56 | Nanette | Cambrault | NCAMBRAU | 011.44.1344.987668 | 1998-12-09 | SA_REP | 7500.0 | 0.20 | 145.0 | 80.0 | Sales | 2500 |
57 | Oliver | Tuvault | OTUVAULT | 011.44.1344.486508 | 1999-11-23 | SA_REP | 7000.0 | 0.15 | 145.0 | 80.0 | Sales | 2500 |
58 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing | 1800 |
59 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Accounting | 1700 |
Łączenie w oparciu o indeksy¶
Bardzo często w dołączanej tabeli wartością, wg której dobieramy rekordy, jest indeks.
Np. w tabeli jobs
to indeks pełni rolę job_id
.
Wtedy dla tej tabeli zamiast right_on=nazwa_kolumny
piszemy right_index=True
.
Zauważmy, że w tym przypadku prawidłowo zachowywany jest indeks pracowników. W poprzednich przypadkach był gubiony.
employees.merge(jobs, left_on='job_id', right_index=True)
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | job_title | min_salary | max_salary | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | President | 20000.0 | 40000.0 |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Administration Vice President | 15000.0 | 30000.0 |
103 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
104 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | Programmer | 4000.0 | 10000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
202 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing Representative | 4000.0 | 9000.0 |
203 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources Representative | 4000.0 | 9000.0 |
204 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations Representative | 4500.0 | 10500.0 |
205 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting Manager | 8200.0 | 16000.0 |
206 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Public Accountant | 4200.0 | 9000.0 |
107 rows × 13 columns
Można też odwrotnie:
departments.merge(employees, right_on='department_id', left_index=True)
department_name | manager_id_x | location_id | first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_y | department_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||
200 | Administration | 200.0 | 1700 | Jennifer | Whalen | JWHALEN | 515.123.4444 | 1987-09-17 | AD_ASST | 4400.0 | NaN | 101.0 | 10.0 |
201 | Marketing | 201.0 | 1800 | Michael | Hartstein | MHARTSTE | 515.123.5555 | 1996-02-17 | MK_MAN | 13000.0 | NaN | 100.0 | 20.0 |
202 | Marketing | 201.0 | 1800 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 |
114 | Purchasing | 114.0 | 1700 | Den | Raphaely | DRAPHEAL | 515.127.4561 | 1994-12-07 | PU_MAN | 11000.0 | NaN | 100.0 | 30.0 |
115 | Purchasing | 114.0 | 1700 | Alexander | Khoo | AKHOO | 515.127.4562 | 1995-05-18 | PU_CLERK | 3100.0 | NaN | 114.0 | 30.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
111 | Finance | 108.0 | 1700 | Ismael | Sciarra | ISCIARRA | 515.124.4369 | 1997-09-30 | FI_ACCOUNT | 7700.0 | NaN | 108.0 | 100.0 |
112 | Finance | 108.0 | 1700 | Jose Manuel | Urman | JMURMAN | 515.124.4469 | 1998-03-07 | FI_ACCOUNT | 7800.0 | NaN | 108.0 | 100.0 |
113 | Finance | 108.0 | 1700 | Luis | Popp | LPOPP | 515.124.4567 | 1999-12-07 | FI_ACCOUNT | 6900.0 | NaN | 108.0 | 100.0 |
205 | Accounting | 205.0 | 1700 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 |
206 | Accounting | 205.0 | 1700 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 |
106 rows × 13 columns
Do wyniku złączenia można dołączać kolejne tabele. W końcu może pojawić się całkiem rozbudowane wyrażenie.
Przy okazji: \
na końcu linii mówi Pythonowi "to jeszcze nie koniec polecenia, kontynuacja jest w nast. linii".
employees.merge(jobs, left_on='job_id', right_index=True) \
.merge(departments, left_on='department_id', right_index=True) \
.merge(locations, left_on='location_id', right_index=True) \
.merge(countries, left_on='country_id', right_index=True)
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | ... | department_name | manager_id_y | location_id | street_address | postal_code | city | state_province | country_id | country_name | region | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | ... | Executive | 100.0 | 1700 | 2004 Charade Rd | 98199 | Seattle | Washington | US | United States of America | Americas |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | ... | Executive | 100.0 | 1700 | 2004 Charade Rd | 98199 | Seattle | Washington | US | United States of America | Americas |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | ... | Executive | 100.0 | 1700 | 2004 Charade Rd | 98199 | Seattle | Washington | US | United States of America | Americas |
108 | Nancy | Greenberg | NGREENBE | 515.124.4569 | 1994-08-17 | FI_MGR | 12000.0 | NaN | 101.0 | 100.0 | ... | Finance | 108.0 | 1700 | 2004 Charade Rd | 98199 | Seattle | Washington | US | United States of America | Americas |
109 | Daniel | Faviet | DFAVIET | 515.124.4169 | 1994-08-16 | FI_ACCOUNT | 9000.0 | NaN | 108.0 | 100.0 | ... | Finance | 108.0 | 1700 | 2004 Charade Rd | 98199 | Seattle | Washington | US | United States of America | Americas |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
179 | Charles | Johnson | CJOHNSON | 011.44.1644.429262 | 2000-01-04 | SA_REP | 6200.0 | 0.1 | 149.0 | 80.0 | ... | Sales | 145.0 | 2500 | Magdalen Centre, The Oxford Science Park | OX9 9ZB | Oxford | Oxford | UK | United Kingdom | Europe |
203 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | ... | Human Resources | 203.0 | 2400 | 8204 Arthur St | NaN | London | NaN | UK | United Kingdom | Europe |
201 | Michael | Hartstein | MHARTSTE | 515.123.5555 | 1996-02-17 | MK_MAN | 13000.0 | NaN | 100.0 | 20.0 | ... | Marketing | 201.0 | 1800 | 147 Spadina Ave | M5V 2L7 | Toronto | Ontario | CA | Canada | Americas |
202 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | ... | Marketing | 201.0 | 1800 | 147 Spadina Ave | M5V 2L7 | Toronto | Ontario | CA | Canada | Americas |
204 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | ... | Public Relations | 204.0 | 2700 | Schwanthalerstr. 7031 | 80925 | Munich | Bavaria | DE | Germany | Europe |
106 rows × 23 columns
"Kierunki" złączeń¶
Złączenia domyślnie są "wewnętrzne" (inner), co oznacza, że w wynikach znajdą się tylko te rekordy z jednej i drugiej tabeli, dla których znaleziono dopasowanie po drugiej stronie.
W przypadku parcowników i departamentów wyniki obejmują 106 rekordów. Nie ma osoby Kimberely Grant, bo ona "nie pracuje w żadnym departamencie" (nie ma podanego department_id - w bazie danych był NULL, w pliku CSV puste pole, w Pandas jest NaN). Nie ma również departamentów, w których nikt nie pracuje, np. Payroll.
employees[employees.last_name == 'Grant']
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id | department_id | ||
---|---|---|---|---|---|---|---|---|---|---|
employee_id | ||||||||||
178 | Kimberely | Grant | KGRANT | 011.44.1644.429263 | 1999-05-24 | SA_REP | 7000.0 | 0.15 | 149.0 | NaN |
199 | Douglas | Grant | DGRANT | 650.507.9844 | 2000-01-13 | SH_CLERK | 2600.0 | NaN | 124.0 | 50.0 |
employees.merge(departments, left_on='department_id', right_index=True)
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Executive | 100.0 | 1700 |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700 |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700 |
103 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | IT | 103.0 | 1400 |
104 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT | 103.0 | 1400 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
202 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing | 201.0 | 1800 |
203 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources | 203.0 | 2400 |
204 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations | 204.0 | 2700 |
205 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting | 205.0 | 1700 |
206 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Accounting | 205.0 | 1700 |
106 rows × 13 columns
employees.merge(departments, left_on='department_id', right_index=True, how='inner')
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Executive | 100.0 | 1700 |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700 |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700 |
103 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | IT | 103.0 | 1400 |
104 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT | 103.0 | 1400 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
202 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing | 201.0 | 1800 |
203 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources | 203.0 | 2400 |
204 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations | 204.0 | 2700 |
205 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting | 205.0 | 1700 |
206 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Accounting | 205.0 | 1700 |
106 rows × 13 columns
Złączenie lewostronne (how='left'
, a w SQL LEFT JOIN
) zwróci wszystkie rekordy z lewej tabeli, a tylko te z prawej, które uda się dopasować.
Jest K.Grant , a nie ma departamentu Payroll itp.
employees.merge(departments, left_on='department_id', right_index=True, how='left')
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Executive | 100.0 | 1700.0 |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 |
103 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | IT | 103.0 | 1400.0 |
104 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT | 103.0 | 1400.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
202 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing | 201.0 | 1800.0 |
203 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources | 203.0 | 2400.0 |
204 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations | 204.0 | 2700.0 |
205 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting | 205.0 | 1700.0 |
206 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Accounting | 205.0 | 1700.0 |
107 rows × 13 columns
employees.merge(departments, left_on='department_id', right_index=True, how='left').loc[175:180]
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||
175 | Alyssa | Hutton | AHUTTON | 011.44.1644.429266 | 1997-03-19 | SA_REP | 8800.0 | 0.25 | 149.0 | 80.0 | Sales | 145.0 | 2500.0 |
176 | Jonathon | Taylor | JTAYLOR | 011.44.1644.429265 | 1998-03-24 | SA_REP | 8600.0 | 0.20 | 149.0 | 80.0 | Sales | 145.0 | 2500.0 |
177 | Jack | Livingston | JLIVINGS | 011.44.1644.429264 | 1998-04-23 | SA_REP | 8400.0 | 0.20 | 149.0 | 80.0 | Sales | 145.0 | 2500.0 |
178 | Kimberely | Grant | KGRANT | 011.44.1644.429263 | 1999-05-24 | SA_REP | 7000.0 | 0.15 | 149.0 | NaN | NaN | NaN | NaN |
179 | Charles | Johnson | CJOHNSON | 011.44.1644.429262 | 2000-01-04 | SA_REP | 6200.0 | 0.10 | 149.0 | 80.0 | Sales | 145.0 | 2500.0 |
180 | Winston | Taylor | WTAYLOR | 650.507.9876 | 1998-01-24 | SH_CLERK | 3200.0 | NaN | 120.0 | 50.0 | Shipping | 121.0 | 1500.0 |
Złączenie prawostronne (how='right'
, a w SQL RIGHT JOIN
) zwróci wszystkie rekordy z prawej tabeli, a tylko te z lewej, które uda się dopasować.
Nie ma K.Grant, a są Payroll itp.
employees.merge(departments, left_on='department_id', right_index=True, how='right')
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
200.0 | Jennifer | Whalen | JWHALEN | 515.123.4444 | 1987-09-17 | AD_ASST | 4400.0 | NaN | 101.0 | 10.0 | Administration | 200.0 | 1700 |
201.0 | Michael | Hartstein | MHARTSTE | 515.123.5555 | 1996-02-17 | MK_MAN | 13000.0 | NaN | 100.0 | 20.0 | Marketing | 201.0 | 1800 |
202.0 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing | 201.0 | 1800 |
114.0 | Den | Raphaely | DRAPHEAL | 515.127.4561 | 1994-12-07 | PU_MAN | 11000.0 | NaN | 100.0 | 30.0 | Purchasing | 114.0 | 1700 |
115.0 | Alexander | Khoo | AKHOO | 515.127.4562 | 1995-05-18 | PU_CLERK | 3100.0 | NaN | 114.0 | 30.0 | Purchasing | 114.0 | 1700 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 230.0 | IT Helpdesk | NaN | 1700 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 240.0 | Government Sales | NaN | 1700 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 250.0 | Retail Sales | NaN | 1700 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 260.0 | Recruiting | NaN | 1700 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 270.0 | Payroll | NaN | 1700 |
122 rows × 13 columns
Złączenie obustronne (how='outer'
, a w SQL FULL [OUTER] JOIN
) zwróci wszystkie rekordy z obu tabel, łącząc jeśli się da.
employees.merge(departments, left_on='department_id', right_index=True, how='outer')
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
100.0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Executive | 100.0 | 1700.0 |
101.0 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 |
102.0 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 |
103.0 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | IT | 103.0 | 1400.0 |
104.0 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT | 103.0 | 1400.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 230.0 | IT Helpdesk | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 240.0 | Government Sales | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 250.0 | Retail Sales | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 260.0 | Recruiting | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 270.0 | Payroll | NaN | 1700.0 |
123 rows × 13 columns
Parametr suffixes
pozwala określić końcówki dopisywane do nazw kolumn w przypadku, gdy kolumny o tej samej nazwie występują w wielu tabelach.
employees.merge(departments, left_on='department_id', right_index=True, how='outer', suffixes=['_emp', '_dep'])
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_emp | department_id | department_name | manager_id_dep | location_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
100.0 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Executive | 100.0 | 1700.0 |
101.0 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 |
102.0 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 |
103.0 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | IT | 103.0 | 1400.0 |
104.0 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT | 103.0 | 1400.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 230.0 | IT Helpdesk | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 240.0 | Government Sales | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 250.0 | Retail Sales | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 260.0 | Recruiting | NaN | 1700.0 |
NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | NaN | NaN | 270.0 | Payroll | NaN | 1700.0 |
123 rows × 13 columns
W praktyce nie ma sensu w Pandas łączyć tabel wczytywanych w bazy danych SQL - serwer baz danych na pewno zrobi to lepiej.
Łączenie tabel w Pandas ma największy sens, gdy pobieramy dane z różnych źródeł (część z bazy danych, część z Excela, część w formie eksportu danych z jakiegoś systemu) i potrzebujemy te dane dopasować na podstawie kryterium.
Przykład: oblicz średnią pensję dla każdego regionu (Europa, Azja...)
Dodatkowe przykłady employees¶
- Łącząc dane pracowników z departamentami i lokalizacjami, wybierz tych pracowników, którzy pracują mieście Seattle. Podaj liczbę oraz średnią pensję tych pracowników.
Dzięki takiemu złączeniu zobaczymy pracowników wraz z informacjami o departamencie i lokalizacji, w tym kolumnę city
.
Aby dalsze kroki były wygodniejsze, wynik złączenia zapiszemy do zmiennej.
dane = employees.merge(departments, left_on='department_id', right_index=True, how='left') \
.merge(locations, left_on='location_id', right_index=True, how='left')
dane
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | street_address | postal_code | city | state_province | country_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | ||||||||||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Executive | 100.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
103 | Alexander | Hunold | AHUNOLD | 590.423.4567 | 1990-01-03 | IT_PROG | 9000.0 | NaN | 102.0 | 60.0 | IT | 103.0 | 1400.0 | 2014 Jabberwocky Rd | 26192 | Southlake | Texas | US |
104 | Bruce | Ernst | BERNST | 590.423.4568 | 1991-05-21 | IT_PROG | 6000.0 | NaN | 103.0 | 60.0 | IT | 103.0 | 1400.0 | 2014 Jabberwocky Rd | 26192 | Southlake | Texas | US |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
202 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | 20.0 | Marketing | 201.0 | 1800.0 | 147 Spadina Ave | M5V 2L7 | Toronto | Ontario | CA |
203 | Susan | Mavris | SMAVRIS | 515.123.7777 | 1994-06-07 | HR_REP | 6500.0 | NaN | 101.0 | 40.0 | Human Resources | 203.0 | 2400.0 | 8204 Arthur St | NaN | London | NaN | UK |
204 | Hermann | Baer | HBAER | 515.123.8888 | 1994-06-07 | PR_REP | 10000.0 | NaN | 101.0 | 70.0 | Public Relations | 204.0 | 2700.0 | Schwanthalerstr. 7031 | 80925 | Munich | Bavaria | DE |
205 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting | 205.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
206 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Accounting | 205.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
107 rows × 18 columns
Teraz łatwo filtrować rekordy:
dane[dane.city == 'Seattle']
first_name | last_name | phone_number | hire_date | job_id | salary | commission_pct | manager_id_x | department_id | department_name | manager_id_y | location_id | street_address | postal_code | city | state_province | country_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | ||||||||||||||||||
100 | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 | Executive | 100.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
101 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
102 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | 90.0 | Executive | 100.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
108 | Nancy | Greenberg | NGREENBE | 515.124.4569 | 1994-08-17 | FI_MGR | 12000.0 | NaN | 101.0 | 100.0 | Finance | 108.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
109 | Daniel | Faviet | DFAVIET | 515.124.4169 | 1994-08-16 | FI_ACCOUNT | 9000.0 | NaN | 108.0 | 100.0 | Finance | 108.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
110 | John | Chen | JCHEN | 515.124.4269 | 1997-09-28 | FI_ACCOUNT | 8200.0 | NaN | 108.0 | 100.0 | Finance | 108.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
111 | Ismael | Sciarra | ISCIARRA | 515.124.4369 | 1997-09-30 | FI_ACCOUNT | 7700.0 | NaN | 108.0 | 100.0 | Finance | 108.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
112 | Jose Manuel | Urman | JMURMAN | 515.124.4469 | 1998-03-07 | FI_ACCOUNT | 7800.0 | NaN | 108.0 | 100.0 | Finance | 108.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
113 | Luis | Popp | LPOPP | 515.124.4567 | 1999-12-07 | FI_ACCOUNT | 6900.0 | NaN | 108.0 | 100.0 | Finance | 108.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
114 | Den | Raphaely | DRAPHEAL | 515.127.4561 | 1994-12-07 | PU_MAN | 11000.0 | NaN | 100.0 | 30.0 | Purchasing | 114.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
115 | Alexander | Khoo | AKHOO | 515.127.4562 | 1995-05-18 | PU_CLERK | 3100.0 | NaN | 114.0 | 30.0 | Purchasing | 114.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
116 | Shelli | Baida | SBAIDA | 515.127.4563 | 1997-12-24 | PU_CLERK | 2900.0 | NaN | 114.0 | 30.0 | Purchasing | 114.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
117 | Sigal | Tobias | STOBIAS | 515.127.4564 | 1997-07-24 | PU_CLERK | 2800.0 | NaN | 114.0 | 30.0 | Purchasing | 114.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
118 | Guy | Himuro | GHIMURO | 515.127.4565 | 1998-11-15 | PU_CLERK | 2600.0 | NaN | 114.0 | 30.0 | Purchasing | 114.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
119 | Karen | Colmenares | KCOLMENA | 515.127.4566 | 1999-08-10 | PU_CLERK | 2500.0 | NaN | 114.0 | 30.0 | Purchasing | 114.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
200 | Jennifer | Whalen | JWHALEN | 515.123.4444 | 1987-09-17 | AD_ASST | 4400.0 | NaN | 101.0 | 10.0 | Administration | 200.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
205 | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 | Accounting | 205.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
206 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | 110.0 | Accounting | 205.0 | 1700.0 | 2004 Charade Rd | 98199 | Seattle | Washington | US |
len(dane[dane.city == 'Seattle'])
18
dane[dane.city == 'Seattle'].count()
first_name 18 last_name 18 email 18 phone_number 18 hire_date 18 job_id 18 salary 18 commission_pct 0 manager_id_x 17 department_id 18 department_name 18 manager_id_y 18 location_id 18 street_address 18 postal_code 18 city 18 state_province 18 country_id 18 dtype: int64
dane[dane.city == 'Seattle'].salary.mean()
8844.444444444445
# Albo:
dane[dane.city == 'Seattle'].salary.agg(['count', 'mean'])
count 18.000000 mean 8844.444444 Name: salary, dtype: float64
- Dla każdego pracownika wypisz jego imię i nazwisko, a obok imię i nazwisko jego szefa (trzeba wykorzystać kolumnę
manager_id
i dołączyć innnego pracownika).
employees.merge(employees, left_on='manager_id', right_index=True)
manager_id | first_name_x | last_name_x | email_x | phone_number_x | hire_date_x | job_id_x | salary_x | commission_pct_x | manager_id_x | ... | first_name_y | last_name_y | email_y | phone_number_y | hire_date_y | job_id_y | salary_y | commission_pct_y | manager_id_y | department_id_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||||||||||
101 | 100.0 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
102 | 100.0 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
114 | 100.0 | Den | Raphaely | DRAPHEAL | 515.127.4561 | 1994-12-07 | PU_MAN | 11000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
120 | 100.0 | Matthew | Weiss | MWEISS | 650.123.1234 | 1996-07-18 | ST_MAN | 8000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
121 | 100.0 | Adam | Fripp | AFRIPP | 650.123.2234 | 1997-04-10 | ST_MAN | 8200.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
177 | 149.0 | Jack | Livingston | JLIVINGS | 011.44.1644.429264 | 1998-04-23 | SA_REP | 8400.0 | 0.20 | 149.0 | ... | Eleni | Zlotkey | EZLOTKEY | 011.44.1344.429018 | 2000-01-29 | SA_MAN | 10500.0 | 0.2 | 100.0 | 80.0 |
178 | 149.0 | Kimberely | Grant | KGRANT | 011.44.1644.429263 | 1999-05-24 | SA_REP | 7000.0 | 0.15 | 149.0 | ... | Eleni | Zlotkey | EZLOTKEY | 011.44.1344.429018 | 2000-01-29 | SA_MAN | 10500.0 | 0.2 | 100.0 | 80.0 |
179 | 149.0 | Charles | Johnson | CJOHNSON | 011.44.1644.429262 | 2000-01-04 | SA_REP | 6200.0 | 0.10 | 149.0 | ... | Eleni | Zlotkey | EZLOTKEY | 011.44.1344.429018 | 2000-01-29 | SA_MAN | 10500.0 | 0.2 | 100.0 | 80.0 |
202 | 201.0 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | ... | Michael | Hartstein | MHARTSTE | 515.123.5555 | 1996-02-17 | MK_MAN | 13000.0 | NaN | 100.0 | 20.0 |
206 | 205.0 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | ... | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 |
106 rows × 21 columns
Wersja o zwiększonej czytelności:
- używamy sufiksów
_pr
jak pracownik i_szef
- wybieram tylko niektóre kolumny
- zapisuję wynik do zmiennej.
employees.merge(employees, left_on='manager_id', right_index=True, suffixes=['_pr', '_szef'])
manager_id | first_name_pr | last_name_pr | email_pr | phone_number_pr | hire_date_pr | job_id_pr | salary_pr | commission_pct_pr | manager_id_pr | ... | first_name_szef | last_name_szef | email_szef | phone_number_szef | hire_date_szef | job_id_szef | salary_szef | commission_pct_szef | manager_id_szef | department_id_szef | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
employee_id | |||||||||||||||||||||
101 | 100.0 | Neena | Kochhar | NKOCHHAR | 515.123.4568 | 1989-09-21 | AD_VP | 17000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
102 | 100.0 | Lex | De Haan | LDEHAAN | 515.123.4569 | 1993-01-13 | AD_VP | 17000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
114 | 100.0 | Den | Raphaely | DRAPHEAL | 515.127.4561 | 1994-12-07 | PU_MAN | 11000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
120 | 100.0 | Matthew | Weiss | MWEISS | 650.123.1234 | 1996-07-18 | ST_MAN | 8000.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
121 | 100.0 | Adam | Fripp | AFRIPP | 650.123.2234 | 1997-04-10 | ST_MAN | 8200.0 | NaN | 100.0 | ... | Steven | King | SKING | 515.123.4567 | 1987-06-17 | AD_PRES | 24000.0 | NaN | NaN | 90.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
177 | 149.0 | Jack | Livingston | JLIVINGS | 011.44.1644.429264 | 1998-04-23 | SA_REP | 8400.0 | 0.20 | 149.0 | ... | Eleni | Zlotkey | EZLOTKEY | 011.44.1344.429018 | 2000-01-29 | SA_MAN | 10500.0 | 0.2 | 100.0 | 80.0 |
178 | 149.0 | Kimberely | Grant | KGRANT | 011.44.1644.429263 | 1999-05-24 | SA_REP | 7000.0 | 0.15 | 149.0 | ... | Eleni | Zlotkey | EZLOTKEY | 011.44.1344.429018 | 2000-01-29 | SA_MAN | 10500.0 | 0.2 | 100.0 | 80.0 |
179 | 149.0 | Charles | Johnson | CJOHNSON | 011.44.1644.429262 | 2000-01-04 | SA_REP | 6200.0 | 0.10 | 149.0 | ... | Eleni | Zlotkey | EZLOTKEY | 011.44.1344.429018 | 2000-01-29 | SA_MAN | 10500.0 | 0.2 | 100.0 | 80.0 |
202 | 201.0 | Pat | Fay | PFAY | 603.123.6666 | 1997-08-17 | MK_REP | 6000.0 | NaN | 201.0 | ... | Michael | Hartstein | MHARTSTE | 515.123.5555 | 1996-02-17 | MK_MAN | 13000.0 | NaN | 100.0 | 20.0 |
206 | 205.0 | William | Gietz | WGIETZ | 515.123.8181 | 1994-06-07 | AC_ACCOUNT | 8300.0 | NaN | 205.0 | ... | Shelley | Higgins | SHIGGINS | 515.123.8080 | 1994-06-07 | AC_MGR | 12000.0 | NaN | 101.0 | 110.0 |
106 rows × 21 columns
wyniki = employees.merge(employees, left_on='manager_id', right_index=True, how='left', suffixes=['_pr', '_szef'])
wyniki = wyniki[['first_name_pr', 'last_name_pr', 'salary_pr', 'first_name_szef', 'last_name_szef', 'salary_szef']]
wyniki.sort_index(inplace=True)
wyniki.head(30)
first_name_pr | last_name_pr | salary_pr | first_name_szef | last_name_szef | salary_szef | |
---|---|---|---|---|---|---|
employee_id | ||||||
100 | Steven | King | 24000.0 | NaN | NaN | NaN |
101 | Neena | Kochhar | 17000.0 | Steven | King | 24000.0 |
102 | Lex | De Haan | 17000.0 | Steven | King | 24000.0 |
103 | Alexander | Hunold | 9000.0 | Lex | De Haan | 17000.0 |
104 | Bruce | Ernst | 6000.0 | Alexander | Hunold | 9000.0 |
105 | David | Austin | 4800.0 | Alexander | Hunold | 9000.0 |
106 | Valli | Pataballa | 4800.0 | Alexander | Hunold | 9000.0 |
107 | Diana | Lorentz | 4200.0 | Alexander | Hunold | 9000.0 |
108 | Nancy | Greenberg | 12000.0 | Neena | Kochhar | 17000.0 |
109 | Daniel | Faviet | 9000.0 | Nancy | Greenberg | 12000.0 |
110 | John | Chen | 8200.0 | Nancy | Greenberg | 12000.0 |
111 | Ismael | Sciarra | 7700.0 | Nancy | Greenberg | 12000.0 |
112 | Jose Manuel | Urman | 7800.0 | Nancy | Greenberg | 12000.0 |
113 | Luis | Popp | 6900.0 | Nancy | Greenberg | 12000.0 |
114 | Den | Raphaely | 11000.0 | Steven | King | 24000.0 |
115 | Alexander | Khoo | 3100.0 | Den | Raphaely | 11000.0 |
116 | Shelli | Baida | 2900.0 | Den | Raphaely | 11000.0 |
117 | Sigal | Tobias | 2800.0 | Den | Raphaely | 11000.0 |
118 | Guy | Himuro | 2600.0 | Den | Raphaely | 11000.0 |
119 | Karen | Colmenares | 2500.0 | Den | Raphaely | 11000.0 |
120 | Matthew | Weiss | 8000.0 | Steven | King | 24000.0 |
121 | Adam | Fripp | 8200.0 | Steven | King | 24000.0 |
122 | Payam | Kaufling | 7900.0 | Steven | King | 24000.0 |
123 | Shanta | Vollman | 6500.0 | Steven | King | 24000.0 |
124 | Kevin | Mourgos | 5800.0 | Steven | King | 24000.0 |
125 | Julia | Nayer | 3200.0 | Matthew | Weiss | 8000.0 |
126 | Irene | Mikkilineni | 2700.0 | Matthew | Weiss | 8000.0 |
127 | James | Landry | 2400.0 | Matthew | Weiss | 8000.0 |
128 | Steven | Markle | 2200.0 | Matthew | Weiss | 8000.0 |
129 | Laura | Bissot | 3300.0 | Adam | Fripp | 8200.0 |