加载数据集
>>> import pandas as pd
>>> nba = pd.read_csv(r"d:\nbaallelo.csv")
>>> type(nba)
<class 'pandas.core.frame.DataFrame'>
>>> len(nba)
126314
>>> nba.shape
(126314, 23)
>>> nba.head()
   gameorder       game_id lg_id  ... game_result  forecast notes
0          1  194611010TRH   NBA  ...           L  0.640065   NaN
1          1  194611010TRH   NBA  ...           W  0.359935   NaN
2          2  194611020CHS   NBA  ...           W  0.631101   NaN
3          2  194611020CHS   NBA  ...           L  0.368899   NaN
4          3  194611020DTF   NBA  ...           L  0.640065   NaN

[5 rows x 23 columns]
>>> pd.set_option("display.max.columns", None) # 显示所有列
>>> nba.head()
   gameorder       game_id lg_id  _iscopy  year_id  date_game  seasongame  \
0          1  194611010TRH   NBA        0     1947  11/1/1946           1
1          1  194611010TRH   NBA        1     1947  11/1/1946           1
2          2  194611020CHS   NBA        0     1947  11/2/1946           1
3          2  194611020CHS   NBA        1     1947  11/2/1946           2
4          3  194611020DTF   NBA        0     1947  11/2/1946           1

   is_playoffs team_id  fran_id  pts      elo_i      elo_n  win_equiv opp_id  \
0            0     TRH  Huskies   66  1300.0000  1293.2767  40.294830    NYK
1            0     NYK   Knicks   68  1300.0000  1306.7233  41.705170    TRH
2            0     CHS    Stags   63  1300.0000  1309.6521  42.012257    NYK
3            0     NYK   Knicks   47  1306.7233  1297.0712  40.692783    CHS
4            0     DTF  Falcons   33  1300.0000  1279.6189  38.864048    WSC

   opp_fran  opp_pts  opp_elo_i  opp_elo_n game_location game_result  \
0    Knicks       68  1300.0000  1306.7233             H           L
1   Huskies       66  1300.0000  1293.2767             A           W
2    Knicks       47  1306.7233  1297.0712             H           W
3     Stags       63  1300.0000  1309.6521             A           L
4  Capitols       50  1300.0000  1320.3811             H           L

   forecast notes
0  0.640065   NaN
1  0.359935   NaN
2  0.631101   NaN
3  0.368899   NaN
4  0.640065   NaN
>>> pd.set_option("display.max.columns", 8) # 显示8列
>>> nba.head()
   gameorder       game_id lg_id  _iscopy  ... game_location game_result  \
0          1  194611010TRH   NBA        0  ...             H           L
1          1  194611010TRH   NBA        1  ...             A           W
2          2  194611020CHS   NBA        0  ...             H           W
3          2  194611020CHS   NBA        1  ...             A           L
4          3  194611020DTF   NBA        0  ...             H           L

   forecast notes
0  0.640065   NaN
1  0.359935   NaN
2  0.631101   NaN
3  0.368899   NaN
4  0.640065   NaN

[5 rows x 23 columns]
>>> pd.set_option("display.precision", 2) # 设置浮点数的精度
>>> nba.tail()
        gameorder       game_id lg_id  _iscopy  ... game_location  \
126309      63155  201506110CLE   NBA        0  ...             H
126310      63156  201506140GSW   NBA        0  ...             H
126311      63156  201506140GSW   NBA        1  ...             A
126312      63157  201506170CLE   NBA        0  ...             H
126313      63157  201506170CLE   NBA        1  ...             A

       game_result  forecast notes
126309           L      0.55   NaN
126310           W      0.77   NaN
126311           L      0.23   NaN
126312           L      0.48   NaN
126313           W      0.52   NaN

[5 rows x 23 columns]

head.7c86dafd4141.png

tail.0dc48c8c2803.png
数据文件 'nbaallelo.csv' 可以在 QQ 群 630011153 或 144081101 中找到。
了解数据集
>>> nba.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126314 entries, 0 to 126313
Data columns (total 23 columns):
gameorder        126314 non-null int64
game_id          126314 non-null object
lg_id            126314 non-null object
_iscopy          126314 non-null int64
year_id          126314 non-null int64
date_game        126314 non-null object
seasongame       126314 non-null int64
is_playoffs      126314 non-null int64
team_id          126314 non-null object
fran_id          126314 non-null object
pts              126314 non-null int64
elo_i            126314 non-null float64
elo_n            126314 non-null float64
win_equiv        126314 non-null float64
opp_id           126314 non-null object
opp_fran         126314 non-null object
opp_pts          126314 non-null int64
opp_elo_i        126314 non-null float64
opp_elo_n        126314 non-null float64
game_location    126314 non-null object
game_result      126314 non-null object
forecast         126314 non-null float64
notes            5424 non-null object
dtypes: float64(6), int64(7), object(10)
memory usage: 22.2+ MB
>>> nba.describe()
           gameorder        _iscopy  ...      opp_elo_n       forecast
count  126314.000000  126314.000000  ...  126314.000000  126314.000000
mean    31579.000000       0.500000  ...    1495.236055       0.500000
std     18231.927643       0.500002  ...     112.461687       0.215252
min         1.000000       0.000000  ...    1085.774400       0.020447
25%     15790.000000       0.000000  ...    1416.994900       0.327989
50%     31579.000000       0.500000  ...    1500.954400       0.500000
75%     47368.000000       1.000000  ...    1576.291625       0.672011
max     63157.000000       1.000000  ...    1853.104500       0.979553

[8 rows x 13 columns]
>>> import numpy as np # nba.describe() 默认只分析数值列,分析其他类型需要传递include=np.object(注意:NumPy 1.24起已移除np.object,新版本请改用include=object)
>>> nba.describe(include=np.object)
             game_id   lg_id  ... game_result           notes
count         126314  126314  ...      126314            5424
unique         63157       2  ...           2             231
top     201403190MEM     NBA  ...           L  at New York NY
freq               2  118016  ...       63157             440

[4 rows x 10 columns]

describe.0be00956e704.png

describe_object.2ec0a6039517.png
探索数据集
>>> nba["team_id"].value_counts()
BOS    5997
NYK    5769
LAL    5078
DET    4985
PHI    4533
       ...
INJ      60
DTF      60
PIT      60
TRH      60
SDS      11
Name: team_id, Length: 104, dtype: int64
>>> nba["fran_id"].value_counts()
Lakers          6024
Celtics         5997
Knicks          5769
Warriors        5657
Pistons         5650
Sixers          5644
Hawks           5572
Kings           5475
Wizards         4582
Spurs           4309
Bulls           4307
Pacers          4227
Thunder         4178
Rockets         4154
Nuggets         4120
Nets            4106
Suns            4080
Bucks           4034
Trailblazers    3870
Cavaliers       3810
Clippers        3733
Jazz            3555
Mavericks       3013
Heat            2371
Pelicans        2254
Magic           2207
Timberwolves    2131
Grizzlies       1657
Raptors         1634
Hornets          894
Colonels         846
Squires          799
Spirits          777
Stars            756
Sounds           697
Baltimore        467
Floridians       440
Condors          430
Capitols         291
Olympians        282
Sails            274
Stags            260
Bombers          249
Steamrollers     168
Packers           72
Redskins          65
Rebels            63
Denver            62
Waterloo          62
Huskies           60
Falcons           60
Ironmen           60
Jets              60
Name: fran_id, dtype: int64
>>> nba.loc[nba["fran_id"] == "Lakers", "team_id"].value_counts()
LAL    5078
MNL     946
Name: team_id, dtype: int64
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].min()
'1/1/1949'
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].max()
'4/9/1959'
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].agg(("min", "max"))
min    1/1/1949
max    4/9/1959
Name: date_game, dtype: object
>>> nba.loc[nba["team_id"] == "BOS", "pts"].sum()
626484
pandas数据结构基础
- 序列(Series)
>>> revenues = pd.Series([5555, 7000, 1980])
>>> revenues.values
array([5555, 7000, 1980], dtype=int64)
>>> revenues.index
RangeIndex(start=0, stop=3, step=1)
>>> type(revenues.values)
numpy.ndarray
>>> city_revenues = pd.Series([4200, 8000, 6500], index=["Amsterdam", "Toronto", "Tokyo"])
>>> city_revenues
Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64
>>> city_employee_count = pd.Series({"Amsterdam": 5, "Tokyo": 8})
>>> city_employee_count
Amsterdam    5
Tokyo        8
dtype: int64
>>> city_employee_count.keys()
Index(['Amsterdam', 'Tokyo'], dtype='object')
>>> "Tokyo" in city_employee_count
True
>>> "New York" in city_employee_count
False
- DataFrame
>>> city_data = pd.DataFrame({"revenue": city_revenues, "employee_count": city_employee_count})
>>> city_data
           revenue  employee_count
Amsterdam     4200             5.0
Tokyo         6500             8.0
Toronto       8000             NaN
>>> city_data.index
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
>>> city_data.values
array([[4.2e+03, 5.0e+00],
       [6.5e+03, 8.0e+00],
       [8.0e+03,     nan]])
>>> city_data.axes
[Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'),
 Index(['revenue', 'employee_count'], dtype='object')]
>>> city_data.axes[0]
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
>>> city_data.axes[1]
Index(['revenue', 'employee_count'], dtype='object')
>>> city_data.keys() # 注意这里是列
Index(['revenue', 'employee_count'], dtype='object')
>>> "Amsterdam" in city_data
False
>>> "revenue" in city_data
True
>>> nba.index
RangeIndex(start=0, stop=126314, step=1)
>>> nba.axes
[RangeIndex(start=0, stop=126314, step=1),
 Index(['gameorder', 'game_id', 'lg_id', '_iscopy', 'year_id', 'date_game',
        'seasongame', 'is_playoffs', 'team_id', 'fran_id', 'pts', 'elo_i',
        'elo_n', 'win_equiv', 'opp_id', 'opp_fran', 'opp_pts', 'opp_elo_i',
        'opp_elo_n', 'game_location', 'game_result', 'forecast', 'notes'],
       dtype='object')]
>>> "points" in nba.keys()
False
>>> "pts" in nba.keys()
True
访问序列(Series)元素
>>> city_revenues["Toronto"]
8000
>>> city_revenues[1]
8000
>>> city_revenues[-1]
6500
>>> city_revenues[1:]
Toronto    8000
Tokyo      6500
dtype: int64
>>> city_revenues["Toronto":]
Toronto    8000
Tokyo      6500
dtype: int64
>>> colors = pd.Series(["red", "purple", "blue", "green", "yellow"], index=[1, 2, 3, 5, 8])
>>> colors # 此时colors[1]有歧义
1       red
2    purple
3      blue
5     green
8    yellow
dtype: object
>>> colors.loc[1] # loc和iloc的性能更好
'red'
>>> colors.iloc[1]
'purple'
>>> colors.iloc[1:3]
2    purple
3      blue
dtype: object
>>> colors.loc[3:8] # 注意loc包含最后一个元素,iloc不包含
3      blue
5     green
8    yellow
dtype: object

文章转载于:https://www.jianshu.com/p/f4118c79c596
原作者是一个有趣的人，若有侵权，请通知删除。
还没有人抢沙发呢~