使用Pandas和Python探索数据集1

加载数据集

>>> import pandas as pd
>>> nba = pd.read_csv(r"d:\nbaallelo.csv")
>>> type(nba)
<class 'pandas.core.frame.DataFrame'>
>>> len(nba)
126314
>>> nba.shape
(126314, 23)
>>> nba.head()
   gameorder       game_id lg_id  ...  game_result  forecast notes
0          1  194611010TRH   NBA  ...            L  0.640065   NaN
1          1  194611010TRH   NBA  ...            W  0.359935   NaN
2          2  194611020CHS   NBA  ...            W  0.631101   NaN
3          2  194611020CHS   NBA  ...            L  0.368899   NaN
4          3  194611020DTF   NBA  ...            L  0.640065   NaN

[5 rows x 23 columns]
>>> pd.set_option("display.max.columns", None) # 显示所有列
>>> nba.head()
   gameorder       game_id lg_id  _iscopy  year_id  date_game  seasongame  \
0          1  194611010TRH   NBA        0     1947  11/1/1946           1
1          1  194611010TRH   NBA        1     1947  11/1/1946           1
2          2  194611020CHS   NBA        0     1947  11/2/1946           1
3          2  194611020CHS   NBA        1     1947  11/2/1946           2
4          3  194611020DTF   NBA        0     1947  11/2/1946           1

   is_playoffs team_id  fran_id  pts      elo_i      elo_n  win_equiv opp_id  \
0            0     TRH  Huskies   66  1300.0000  1293.2767  40.294830    NYK
1            0     NYK   Knicks   68  1300.0000  1306.7233  41.705170    TRH
2            0     CHS    Stags   63  1300.0000  1309.6521  42.012257    NYK
3            0     NYK   Knicks   47  1306.7233  1297.0712  40.692783    CHS
4            0     DTF  Falcons   33  1300.0000  1279.6189  38.864048    WSC

   opp_fran  opp_pts  opp_elo_i  opp_elo_n game_location game_result  \
0    Knicks       68  1300.0000  1306.7233             H           L
1   Huskies       66  1300.0000  1293.2767             A           W
2    Knicks       47  1306.7233  1297.0712             H           W
3     Stags       63  1300.0000  1309.6521             A           L
4  Capitols       50  1300.0000  1320.3811             H           L

   forecast notes
0  0.640065   NaN
1  0.359935   NaN
2  0.631101   NaN
3  0.368899   NaN
4  0.640065   NaN
>>> pd.set_option("display.max.columns", 8) # 显示8列
>>> nba.head()
   gameorder       game_id lg_id  _iscopy  ...  game_location game_result  \
0          1  194611010TRH   NBA        0  ...              H           L
1          1  194611010TRH   NBA        1  ...              A           W
2          2  194611020CHS   NBA        0  ...              H           W
3          2  194611020CHS   NBA        1  ...              A           L
4          3  194611020DTF   NBA        0  ...              H           L

   forecast  notes
0  0.640065    NaN
1  0.359935    NaN
2  0.631101    NaN
3  0.368899    NaN
4  0.640065    NaN

[5 rows x 23 columns]
>>> pd.set_option("display.precision", 2) # 设置浮点数的精度
>>> nba.tail()
        gameorder       game_id lg_id  _iscopy  ...  game_location  \
126309      63155  201506110CLE   NBA        0  ...              H
126310      63156  201506140GSW   NBA        0  ...              H
126311      63156  201506140GSW   NBA        1  ...              A
126312      63157  201506170CLE   NBA        0  ...              H
126313      63157  201506170CLE   NBA        1  ...              A

       game_result  forecast  notes
126309           L      0.55    NaN
126310           W      0.77    NaN
126311           L      0.23    NaN
126312           L      0.48    NaN
126313           W      0.52    NaN

[5 rows x 23 columns]

head.7c86dafd4141.png

tail.0dc48c8c2803.png

数据文件 'nbaallelo.csv' 可以在 QQ 群 630011153、144081101 中找到。

了解数据集

>>> nba.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126314 entries, 0 to 126313
Data columns (total 23 columns):
gameorder        126314 non-null int64
game_id          126314 non-null object
lg_id            126314 non-null object
_iscopy          126314 non-null int64
year_id          126314 non-null int64
date_game        126314 non-null object
seasongame       126314 non-null int64
is_playoffs      126314 non-null int64
team_id          126314 non-null object
fran_id          126314 non-null object
pts              126314 non-null int64
elo_i            126314 non-null float64
elo_n            126314 non-null float64
win_equiv        126314 non-null float64
opp_id           126314 non-null object
opp_fran         126314 non-null object
opp_pts          126314 non-null int64
opp_elo_i        126314 non-null float64
opp_elo_n        126314 non-null float64
game_location    126314 non-null object
game_result      126314 non-null object
forecast         126314 non-null float64
notes            5424 non-null object
dtypes: float64(6), int64(7), object(10)
memory usage: 22.2+ MB
>>> nba.describe()
           gameorder        _iscopy  ...      opp_elo_n       forecast
count  126314.000000  126314.000000  ...  126314.000000  126314.000000
mean    31579.000000       0.500000  ...    1495.236055       0.500000
std     18231.927643       0.500002  ...     112.461687       0.215252
min         1.000000       0.000000  ...    1085.774400       0.020447
25%     15790.000000       0.000000  ...    1416.994900       0.327989
50%     31579.000000       0.500000  ...    1500.954400       0.500000
75%     47368.000000       1.000000  ...    1576.291625       0.672011
max     63157.000000       1.000000  ...    1853.104500       0.979553

[8 rows x 13 columns]
>>> import numpy as np # nba.describe() 默认只分析数值，分析其他类型需要传递include=np.object
>>> # 注：np.object 自 NumPy 1.24 起已被移除，在新版本中请改用 include=object
>>> nba.describe(include=np.object)
             game_id   lg_id  ... game_result           notes
count         126314  126314  ...      126314            5424
unique         63157       2  ...           2             231
top     201403190MEM     NBA  ...           L  at New York NY
freq               2  118016  ...       63157             440

[4 rows x 10 columns]

describe.0be00956e704.png

describe_object.2ec0a6039517.png

探索数据集

>>> nba["team_id"].value_counts()
BOS    5997
NYK    5769
LAL    5078
DET    4985
PHI    4533
       ...
INJ      60
DTF      60
PIT      60
TRH      60
SDS      11
Name: team_id, Length: 104, dtype: int64
>>> nba["fran_id"].value_counts()
Lakers          6024
Celtics         5997
Knicks          5769
Warriors        5657
Pistons         5650
Sixers          5644
Hawks           5572
Kings           5475
Wizards         4582
Spurs           4309
Bulls           4307
Pacers          4227
Thunder         4178
Rockets         4154
Nuggets         4120
Nets            4106
Suns            4080
Bucks           4034
Trailblazers    3870
Cavaliers       3810
Clippers        3733
Jazz            3555
Mavericks       3013
Heat            2371
Pelicans        2254
Magic           2207
Timberwolves    2131
Grizzlies       1657
Raptors         1634
Hornets          894
Colonels         846
Squires          799
Spirits          777
Stars            756
Sounds           697
Baltimore        467
Floridians       440
Condors          430
Capitols         291
Olympians        282
Sails            274
Stags            260
Bombers          249
Steamrollers     168
Packers           72
Redskins          65
Rebels            63
Denver            62
Waterloo          62
Huskies           60
Falcons           60
Ironmen           60
Jets              60
Name: fran_id, dtype: int64
>>> nba.loc[nba["fran_id"] == "Lakers", "team_id"].value_counts()
LAL    5078
MNL     946
Name: team_id, dtype: int64
>>> # 注意：date_game 此时仍是字符串（object 类型），下面的 min/max 按字典序比较，得到的并不一定是真实的最早/最晚比赛日期；要得到真实的日期范围需先用 pd.to_datetime 转换。
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].min()
'1/1/1949'
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].max()
'4/9/1959'
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].agg(("min", "max"))
min    1/1/1949
max    4/9/1959
Name: date_game, dtype: object
>>> nba.loc[nba["team_id"] == "BOS", "pts"].sum()
626484

pandas数据结构基础

序列（Series）

>>> revenues = pd.Series([5555, 7000, 1980])
>>> revenues.values
array([5555, 7000, 1980], dtype=int64)
>>> revenues.index
RangeIndex(start=0, stop=3, step=1)
>>> type(revenues.values)
numpy.ndarray
>>> city_revenues = pd.Series(
    [4200, 8000, 6500],
    index=["Amsterdam", "Toronto", "Tokyo"]
)
>>> city_revenues
Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64
>>> city_employee_count = pd.Series({"Amsterdam": 5, "Tokyo": 8})
>>> city_employee_count
Amsterdam    5
Tokyo        8
dtype: int64
>>> city_employee_count.keys()
Index(['Amsterdam', 'Tokyo'], dtype='object')
>>> "Tokyo" in city_employee_count
True
>>> "New York" in city_employee_count
False

DataFrame

>>> city_data = pd.DataFrame({
    "revenue": city_revenues,
    "employee_count": city_employee_count
})
>>> city_data
           revenue  employee_count
Amsterdam     4200             5.0
Tokyo         6500             8.0
Toronto       8000             NaN
>>> city_data.index
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
>>> city_data.values
array([[4.2e+03, 5.0e+00],
       [6.5e+03, 8.0e+00],
       [8.0e+03,     nan]])
>>> city_data.axes
[Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'), Index(['revenue', 'employee_count'], dtype='object')]
>>> city_data.axes[0]
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
>>> city_data.axes[1]
Index(['revenue', 'employee_count'], dtype='object')
>>> city_data.keys()  # 注意这里是列
Index(['revenue', 'employee_count'], dtype='object')
>>> "Amsterdam" in city_data
False
>>> "revenue" in city_data
True
>>> nba.index
RangeIndex(start=0, stop=126314, step=1)
>>> nba.axes
[RangeIndex(start=0, stop=126314, step=1), Index(['gameorder', 'game_id', 'lg_id', '_iscopy', 'year_id', 'date_game',
        'seasongame', 'is_playoffs', 'team_id', 'fran_id', 'pts', 'elo_i',
        'elo_n', 'win_equiv', 'opp_id', 'opp_fran', 'opp_pts', 'opp_elo_i',
        'opp_elo_n', 'game_location', 'game_result', 'forecast', 'notes'],
       dtype='object')]
>>> "points" in nba.keys()
False
>>> "pts" in nba.keys()
True

访问序列（Series）元素

>>> city_revenues["Toronto"]
8000
>>> city_revenues[1]
8000
>>> city_revenues[-1]
6500
>>> city_revenues[1:]
Toronto    8000
Tokyo      6500
dtype: int64
>>> city_revenues["Toronto":]
Toronto    8000
Tokyo      6500
dtype: int64
>>> colors = pd.Series(
    ["red", "purple", "blue", "green", "yellow"],
    index=[1, 2, 3, 5, 8]
)
>>> colors # 此时colors[1]有歧义
1       red
2    purple
3      blue
5     green
8    yellow
dtype: object
>>> colors.loc[1] # loc和iloc的性能更好
'red'
>>> colors.iloc[1]
'purple'
>>> colors.iloc[1:3]
2    purple
3      blue
dtype: object
>>> colors.loc[3:8] # 注意loc包含最后一个元素，iloc不包含
3      blue
5     green
8    yellow
dtype: object

文章转载于:https://www.jianshu.com/p/f4118c79c596

原作者是一个有趣的人，若有侵权，请通知删除。

本博客所有文章如无特别注明均为原创。
复制或转载请以超链接形式注明转自起风了，原文地址《使用Pandas和Python探索数据集1》