-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpython-pandas.html
96 lines (96 loc) · 45.8 KB
/
python-pandas.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
<!DOCTYPE html><html lang="zh-CN"><head><meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=2"><meta name="theme-color" content="#222"><meta http-equiv="X-UA-COMPATIBLE" content="IE=edge,chrome=1"><meta name="renderer" content="webkit"><link rel="icon" type="image/ico" sizes="32x32" href="/assets/favicon.ico"><link rel="apple-touch-icon" sizes="180x180" href="/assets/apple-touch-icon.png"><link rel="alternate" href="/rss.xml" title="Jiankychen's Blog" type="application/rss+xml"><link rel="alternate" href="/atom.xml" title="Jiankychen's Blog" type="application/atom+xml"><link rel="alternate" type="application/json" title="Jiankychen's Blog" href="https://jiankychen.github.io/feed.json"><link rel="preconnect" href="https://lf9-cdn-tos.bytecdntp.com"><link rel="preconnect" href="https://at.alicdn.com"><link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Mulish:400,400italic,700,700italic%7CFredericka%20the%20Great:400,400italic,700,700italic%7CNoto%20Serif%20JP:400,400italic,700,700italic%7CNoto%20Serif%20SC:400,400italic,700,700italic%7CInconsolata:400,400italic,700,700italic&display=swap&subset=latin,latin-ext" media="none" onload="this.media='all'"><link rel="stylesheet" href="/css/app.css?v=0.4.2"><link rel="modulepreload" href="/js/chunk-7IVLRIQ3.js"><link rel="modulepreload" href="/js/chunk-IXT6LZJL.js"><link rel="modulepreload" href="/js/chunk-PHSEV26P.js"><link rel="modulepreload" href="/js/chunk-XHQGHZCW.js"><link rel="modulepreload" href="/js/comments-TUWNDU5I.js"><link rel="modulepreload" href="/js/post-P6IN2S3Y.js"><link rel="modulepreload" href="/js/quicklink-HAJEHOPK.js"><link rel="modulepreload" href="/js/search-WFXK2K66.js"><link rel="modulepreload" href="/js/siteInit.js"><link rel="stylesheet" href="https://npm.webcache.cn/@waline/[email protected]/dist/waline.css" media="none" onload="this.media='all'"><link rel="preload" 
href="https://img.timelessq.com/images/2022/07/26/e5221f7d85b0900837a45fb933fa34ec.jpg" as="image" fetchpriority="high"><link rel="preload" href="https://img.timelessq.com/images/2022/07/26/2aabaeb8aca379b991071d1c41632741.jpg" as="image" fetchpriority="high"><link rel="preload" href="https://i.imgtg.com/2023/03/09/YQSYM.jpg" as="image" fetchpriority="high"><link rel="preload" href="https://i.imgtg.com/2023/03/09/YS6XY.jpg" as="image" fetchpriority="high"><link rel="preload" href="https://i.imgtg.com/2023/03/09/YS2LU.jpg" as="image" fetchpriority="high"><link rel="preload" href="https://img.timelessq.com/images/2022/07/26/488297bfd0233b6c6a444f1860e55d45.jpg" as="image" fetchpriority="high"><link rel="canonical" href="https://jiankychen.github.io/python-pandas"><title>pandas 基础</title><meta name="generator" content="Hexo 7.0.0"></head><body itemscope="" itemtype="http://schema.org/WebPage"><div id="loading"><div class="cat"><div class="body"></div><div class="head"><div class="face"></div></div><div class="foot"><div class="tummy-end"></div><div class="bottom"></div><div class="legs left"></div><div class="legs right"></div></div><div class="paw"><div class="hands left"></div><div class="hands right"></div></div></div></div><div id="container"><header id="header" itemscope="" itemtype="http://schema.org/WPHeader"><div class="inner"><div id="brand"><div class="pjax"><h1 itemprop="name headline">pandas 基础</h1><div class="meta"><span class="item" title="创建时间:2023-01-20 21:02:02"><span class="icon"><i class="ic i-calendar"></i></span><span class="text">发表于</span><time itemprop="dateCreated datePublished" datetime="2023-01-20T21:02:02+08:00">2023-01-20</time></span><span class="item" title="本文字数"><span class="icon"><i class="ic i-pen"></i></span><span class="text">本文字数</span><span>3.6k</span><span class="text">字</span></span><span class="item" title="阅读时长"><span class="icon"><i class="ic i-clock"></i></span><span class="text">阅读时长</span><span>3 
分钟</span></span></div></div></div><nav id="nav"><div class="inner"><div class="toggle"><div class="lines" aria-label="切换导航栏"><span class="line"></span><span class="line"></span><span class="line"></span></div></div><ul class="menu"><li class="item title"><a href="/" rel="start">Jiankychen</a></li></ul><ul class="right" id="rightNav"><li class="item theme"><i class="ic i-sun"></i></li><li class="item search"><i class="ic i-search"></i></li></ul></div></nav></div><div class="pjax" id="imgs"><ul><li class="item" style="background-image: url('https://img.timelessq.com/images/2022/07/26/e5221f7d85b0900837a45fb933fa34ec.jpg');"></li><li class="item" style="background-image: url('https://img.timelessq.com/images/2022/07/26/2aabaeb8aca379b991071d1c41632741.jpg');"></li><li class="item" style="background-image: url('https://i.imgtg.com/2023/03/09/YQSYM.jpg');"></li><li class="item" style="background-image: url('https://i.imgtg.com/2023/03/09/YS6XY.jpg');"></li><li class="item" style="background-image: url('https://i.imgtg.com/2023/03/09/YS2LU.jpg');"></li><li class="item" style="background-image: url('https://img.timelessq.com/images/2022/07/26/488297bfd0233b6c6a444f1860e55d45.jpg');"></li></ul></div></header><div id="waves"><svg class="waves" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 24 150 28" preserveAspectRatio="none" shape-rendering="auto"><defs><path id="gentle-wave" d="M-160 44c30 0 58-18 88-18s 58 18 88 18 58-18 88-18 58 18 88 18 v44h-352z"></path></defs><g class="parallax"><use xlink:href="#gentle-wave" x="48" y="0"></use><use xlink:href="#gentle-wave" x="48" y="3"></use><use xlink:href="#gentle-wave" x="48" y="5"></use><use xlink:href="#gentle-wave" x="48" y="7"></use></g></svg></div><main><div class="inner"><div class="pjax" id="main"><div class="article wrap"><div class="breadcrumb" itemlistelement="" itemscope="" itemtype="https://schema.org/BreadcrumbList"><i class="ic i-home"></i><span><a href="/">首页</a></span><i 
class="ic i-angle-right"></i><span class="current" itemprop="itemListElement" itemscope="itemscope" itemtype="https://schema.org/ListItem"><a href="/categories/Python/" itemprop="item" rel="index" title="分类于Python"><span itemprop="name">Python<meta itemprop="position" content="0"></span></a></span></div><article class="post block" itemscope="itemscope" itemtype="http://schema.org/Article" lang="zh-CN"><link itemprop="mainEntityOfPage" href="https://jiankychen.github.io/python-pandas.html"><span hidden="hidden" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><meta itemprop="image" content="/assets/avatar.jpg"><meta itemprop="name" content="Jiankychen"><meta itemprop="description" content="Never put off till tomorrow what you can do today, "></span><span hidden="hidden" itemprop="publisher" itemscope="itemscope" itemtype="http://schema.org/Organization"><meta itemprop="name" content="Jiankychen's Blog"></span><div class="body md" itemprop="articleBody"><h1 id="安装及导入"><a class="anchor" href="#安装及导入">#</a> 安装及导入</h1>
<h2 id="安装"><a class="anchor" href="#安装">#</a> 安装</h2>
<p>可以使用 pip 安装</p>
<pre><code>pip install pandas
</code></pre>
<h2 id="导入"><a class="anchor" href="#导入">#</a> 导入</h2>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token keyword">import</span> pandas <span class="token keyword">as</span> pd</pre></td></tr></tbody></table></figure><h1 id="数据结构"><a class="anchor" href="#数据结构">#</a> 数据结构</h1>
<h2 id="series"><a class="anchor" href="#series">#</a> Series</h2>
<p>Series 是一种类似于一维数组的对象,它由一组数据(各种 Numpy 数据类型)以及一组与之相关的数据标签(即索引)组成</p>
<blockquote>
<p>Series 只有行索引</p>
</blockquote>
<p>例如,创建一个 Series 对象:</p>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre>ser <span class="token operator">=</span> pd<span class="token punctuation">.</span>Series<span class="token punctuation">(</span>data <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">,</span> <span class="token number">2</span><span class="token punctuation">,</span> <span class="token number">3</span><span class="token punctuation">]</span><span class="token punctuation">,</span> index <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'a'</span><span class="token punctuation">,</span> <span class="token string">'b'</span><span class="token punctuation">,</span> <span class="token string">'c'</span><span class="token punctuation">]</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><p>其内容为:</p>
<pre><code>a 1
b 2
c 3
</code></pre>
<p><a target="_blank" rel="noopener" href="https://pandas.pydata.org/docs/reference/series.html">Series 的各种方法</a></p>
<h2 id="dataframe"><a class="anchor" href="#dataframe">#</a> DataFrame</h2>
<p>DataFrame 是一个表格型的数据结构,它含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔型值)</p>
<blockquote>
<p>DataFrame 既有行索引又有列索引</p>
</blockquote>
<p>例如,创建一个 DataFrame 对象:</p>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token keyword">import</span> numpy <span class="token keyword">as</span> np</pre></td></tr><tr><td data-num="2"></td><td><pre>df <span class="token operator">=</span> pd<span class="token punctuation">.</span>DataFrame<span class="token punctuation">(</span>data <span class="token operator">=</span> np<span class="token punctuation">.</span>array<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">,</span> <span class="token number">2</span><span class="token punctuation">,</span> <span class="token number">3</span><span class="token punctuation">]</span><span class="token punctuation">,</span> <span class="token punctuation">[</span><span class="token number">4</span><span class="token punctuation">,</span> <span class="token number">5</span><span class="token punctuation">,</span> <span class="token number">6</span><span class="token punctuation">]</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">,</span> columns <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token string">'a'</span><span class="token punctuation">,</span> <span class="token string">'b'</span><span class="token punctuation">,</span> <span class="token string">'c'</span><span class="token punctuation">]</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><p>其内容为:</p>
<pre><code> a b c
0 1 2 3
1 4 5 6
</code></pre>
<p><a target="_blank" rel="noopener" href="https://pandas.pydata.org/docs/reference/frame.html">DataFrame 的各种方法</a></p>
<p>下面将以 DataFrame 对象为例来介绍 pandas 的基本用法</p>
<h1 id="数据输入"><a class="anchor" href="#数据输入">#</a> 数据输入</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 使用 read_csv 读取 csv 文件</span></pre></td></tr><tr><td data-num="2"></td><td><pre>df <span class="token operator">=</span> pd<span class="token punctuation">.</span>read_csv<span class="token punctuation">(</span><span class="token string">'file_name'</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 或者:使用 read_table 读取 csv 文件</span></pre></td></tr><tr><td data-num="5"></td><td><pre><span class="token comment"># read_table 默认使用 '\t' 作为数据分隔符,需将分隔符设为 ','</span></pre></td></tr><tr><td data-num="6"></td><td><pre>df <span class="token operator">=</span> pd<span class="token punctuation">.</span>read_table<span class="token punctuation">(</span><span class="token string">'file_name'</span><span class="token punctuation">,</span> sep<span class="token operator">=</span><span class="token string">','</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="7"></td><td><pre></pre></td></tr><tr><td data-num="8"></td><td><pre><span class="token comment"># 读取 excel 文件</span></pre></td></tr><tr><td data-num="9"></td><td><pre>pd<span class="token punctuation">.</span>read_excel<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="10"></td><td><pre></pre></td></tr><tr><td data-num="11"></td><td><pre><span class="token comment"># 读取 json 文件</span></pre></td></tr><tr><td data-num="12"></td><td><pre>pd<span class="token punctuation">.</span>read_json<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="13"></td><td><pre></pre></td></tr><tr><td data-num="14"></td><td><pre><span class="token comment"># 读取 xml 文件</span></pre></td></tr><tr><td data-num="15"></td><td><pre>pd<span 
class="token punctuation">.</span>read_xml<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><h1 id="数据输出"><a class="anchor" href="#数据输出">#</a> 数据输出</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 将 DataFrame 对象写入 csv 文件</span></pre></td></tr><tr><td data-num="2"></td><td><pre>df<span class="token punctuation">.</span>to_csv<span class="token punctuation">(</span><span class="token string">'file_name'</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 写入 excel 文件</span></pre></td></tr><tr><td data-num="5"></td><td><pre>df<span class="token punctuation">.</span>to_excel<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># 写入 json 文件</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>to_json<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="9"></td><td><pre></pre></td></tr><tr><td data-num="10"></td><td><pre><span class="token comment"># 写入 xml 文件</span></pre></td></tr><tr><td data-num="11"></td><td><pre>df<span class="token punctuation">.</span>to_xml<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><p><a target="_blank" rel="noopener" href="http://pandas.pydata.org/docs/reference/io.html">pandas 读取 / 写入文件</a></p>
<h1 id="查看数据"><a class="anchor" href="#查看数据">#</a> 查看数据</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># DataFrame 对象的头部数据(默认显示 5 条数据)</span></pre></td></tr><tr><td data-num="2"></td><td><pre>df<span class="token punctuation">.</span>head<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># DataFrame 对象的尾部数据(默认显示 5 条数据)</span></pre></td></tr><tr><td data-num="5"></td><td><pre>df<span class="token punctuation">.</span>tail<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># DataFrame 对象的索引</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>index</pre></td></tr><tr><td data-num="9"></td><td><pre></pre></td></tr><tr><td data-num="10"></td><td><pre><span class="token comment"># DataFrame 对象的列名</span></pre></td></tr><tr><td data-num="11"></td><td><pre>df<span class="token punctuation">.</span>columns</pre></td></tr><tr><td data-num="12"></td><td><pre></pre></td></tr><tr><td data-num="13"></td><td><pre><span class="token comment"># DataFrame 对象的统计信息</span></pre></td></tr><tr><td data-num="14"></td><td><pre>df<span class="token punctuation">.</span>describe<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="15"></td><td><pre></pre></td></tr><tr><td data-num="16"></td><td><pre><span class="token comment"># DataFrame 对象的摘要</span></pre></td></tr><tr><td data-num="17"></td><td><pre>df<span class="token punctuation">.</span>info<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="18"></td><td><pre></pre></td></tr><tr><td 
data-num="19"></td><td><pre><span class="token comment"># DataFrame 对象的形状</span></pre></td></tr><tr><td data-num="20"></td><td><pre>df<span class="token punctuation">.</span>shape</pre></td></tr><tr><td data-num="21"></td><td><pre></pre></td></tr><tr><td data-num="22"></td><td><pre><span class="token comment"># 查看数据是否为空</span></pre></td></tr><tr><td data-num="23"></td><td><pre>df<span class="token punctuation">.</span>isnull<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="24"></td><td><pre></pre></td></tr><tr><td data-num="25"></td><td><pre><span class="token comment"># 查看各数据类型</span></pre></td></tr><tr><td data-num="26"></td><td><pre>df<span class="token punctuation">.</span>dtypes</pre></td></tr><tr><td data-num="27"></td><td><pre></pre></td></tr><tr><td data-num="28"></td><td><pre><span class="token comment"># 查看某一列的所有值</span></pre></td></tr><tr><td data-num="29"></td><td><pre>df<span class="token punctuation">[</span><span class="token punctuation">[</span><span class="token string">'column_name'</span><span class="token punctuation">]</span><span class="token punctuation">]</span> <span class="token comment"># 方法一</span></pre></td></tr><tr><td data-num="30"></td><td><pre>df<span class="token punctuation">.</span>loc<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token punctuation">[</span><span class="token string">'column_name'</span><span class="token punctuation">]</span><span class="token punctuation">]</span> <span class="token comment"># 方法二</span></pre></td></tr><tr><td data-num="31"></td><td><pre></pre></td></tr><tr><td data-num="32"></td><td><pre><span class="token comment"># 查看某一列的变量名及种类</span></pre></td></tr><tr><td data-num="33"></td><td><pre>df<span class="token punctuation">[</span><span class="token string">'column_name'</span><span class="token punctuation">]</span><span class="token 
punctuation">.</span>value_counts<span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token comment"># 方法一</span></pre></td></tr><tr><td data-num="34"></td><td><pre>df<span class="token punctuation">[</span><span class="token string">'column_name'</span><span class="token punctuation">]</span><span class="token punctuation">.</span>unique<span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token comment"># 方法二</span></pre></td></tr><tr><td data-num="35"></td><td><pre></pre></td></tr><tr><td data-num="36"></td><td><pre><span class="token comment"># 查看满足条件的值</span></pre></td></tr><tr><td data-num="37"></td><td><pre>df<span class="token punctuation">[</span>condition<span class="token punctuation">]</span></pre></td></tr><tr><td data-num="38"></td><td><pre></pre></td></tr><tr><td data-num="39"></td><td><pre><span class="token comment"># 最大值对应的索引</span></pre></td></tr><tr><td data-num="40"></td><td><pre>df<span class="token punctuation">.</span>idxmax<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="41"></td><td><pre></pre></td></tr><tr><td data-num="42"></td><td><pre><span class="token comment"># 最小值对应的索引</span></pre></td></tr><tr><td data-num="43"></td><td><pre>df<span class="token punctuation">.</span>idxmin<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><h1 id="缺失值处理"><a class="anchor" href="#缺失值处理">#</a> 缺失值处理</h1>
<p>pandas 主要用 <code>np.nan</code> 表示缺失数据</p>
<p>一般情况下,运算时默认排除缺失值</p>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 查找缺失值</span></pre></td></tr><tr><td data-num="2"></td><td><pre>df<span class="token punctuation">.</span>isnull<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 查找非缺失值</span></pre></td></tr><tr><td data-num="5"></td><td><pre>df<span class="token punctuation">.</span>notnull<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># 删除所有含缺失值的行</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>dropna<span class="token punctuation">(</span>how<span class="token operator">=</span><span class="token string">'any'</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="9"></td><td><pre></pre></td></tr><tr><td data-num="10"></td><td><pre><span class="token comment"># 用 0 填充缺失值</span></pre></td></tr><tr><td data-num="11"></td><td><pre>df<span class="token punctuation">.</span>fillna<span class="token punctuation">(</span>value<span class="token operator">=</span><span class="token number">0</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><h1 id="重复值处理"><a class="anchor" href="#重复值处理">#</a> 重复值处理</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 查看 DataFrame 数据中的重复值</span></pre></td></tr><tr><td data-num="2"></td><td><pre>df<span class="token punctuation">.</span>duplicated<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 统计 DataFrame 数据中重复值的个数</span></pre></td></tr><tr><td data-num="5"></td><td><pre>df<span class="token punctuation">.</span>duplicated<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span><span class="token builtin">sum</span><span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># 删除 DataFrame 数据中的重复值</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>drop_duplicates<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><h1 id="索引变换"><a class="anchor" href="#索引变换">#</a> 索引变换</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 重置索引</span></pre></td></tr><tr><td data-num="2"></td><td><pre>df<span class="token punctuation">.</span>reset_index<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 将现有的列设为索引</span></pre></td></tr><tr><td data-num="5"></td><td><pre>df<span class="token punctuation">.</span>set_index<span class="token punctuation">(</span><span class="token string">'column_name'</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># 修改索引与列标签</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>reindex<span class="token punctuation">(</span>index<span class="token operator">=</span>index_labels<span class="token punctuation">,</span> columns<span class="token operator">=</span>column_labels<span class="token punctuation">)</span> <span class="token comment"># 方法一</span></pre></td></tr><tr><td data-num="9"></td><td><pre>df<span class="token punctuation">.</span>rename<span class="token punctuation">(</span>index<span class="token operator">=</span>index_labels<span class="token punctuation">,</span> columns<span class="token operator">=</span>column_labels<span class="token punctuation">)</span> <span class="token comment"># 方法二</span></pre></td></tr></tbody></table></figure><p><a target="_blank" rel="noopener" href="https://zhuanlan.zhihu.com/p/277008403">reindex 和 rename</a></p>
<h1 id="排序"><a class="anchor" href="#排序">#</a> 排序</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 按索引排序</span></pre></td></tr><tr><td data-num="2"></td><td><pre>df<span class="token punctuation">.</span>sort_index<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 按行列的值排序</span></pre></td></tr><tr><td data-num="5"></td><td><pre>df<span class="token punctuation">.</span>sort_values<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># 按某一列排序</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>sort_values<span class="token punctuation">(</span>by<span class="token operator">=</span><span class="token string">'column_name'</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="9"></td><td><pre></pre></td></tr><tr><td data-num="10"></td><td><pre><span class="token comment"># 按多列排序</span></pre></td></tr><tr><td data-num="11"></td><td><pre>df<span class="token punctuation">.</span>sort_values<span class="token punctuation">(</span>by<span class="token operator">=</span><span class="token punctuation">[</span><span class="token string">'column_name_1'</span><span class="token punctuation">,</span> <span class="token string">'column_name_2'</span><span class="token punctuation">]</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><p>特别地,如果数据含有空值,可以用 <code>na_position</code> 参数处理空值,例如:</p>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre>df<span class="token punctuation">.</span>sort_values<span class="token punctuation">(</span>na_position<span class="token operator">=</span><span class="token string">'first'</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><h1 id="特征处理"><a class="anchor" href="#特征处理">#</a> 特征处理</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 按照数据的值进行离散化</span></pre></td></tr><tr><td data-num="2"></td><td><pre>pd<span class="token punctuation">.</span>cut<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 按照数据的数量进行离散化</span></pre></td></tr><tr><td data-num="5"></td><td><pre>pd<span class="token punctuation">.</span>qcut<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># 替换数据值</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>replace<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="9"></td><td><pre></pre></td></tr><tr><td data-num="10"></td><td><pre><span class="token comment"># 替换数据值</span></pre></td></tr><tr><td data-num="11"></td><td><pre>df<span class="token punctuation">.</span><span class="token builtin">map</span><span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="12"></td><td><pre></pre></td></tr><tr><td data-num="13"></td><td><pre><span class="token comment"># one-hot 编码</span></pre></td></tr><tr><td data-num="14"></td><td><pre>pd<span class="token punctuation">.</span>get_dummies<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><p>具体用法请参考 <a target="_blank" rel="noopener" href="http://pandas.pydata.org/docs/reference/">pandas Document</a></p>
<h1 id="数据拼接"><a class="anchor" href="#数据拼接">#</a> 数据拼接</h1>
<figure class="highlight python"><figcaption data-lang="python"></figcaption><table><tbody><tr><td data-num="1"></td><td><pre><span class="token comment"># 既可以横向拼接,又可以纵向拼接</span></pre></td></tr><tr><td data-num="2"></td><td><pre>pd<span class="token punctuation">.</span>concat<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="3"></td><td><pre></pre></td></tr><tr><td data-num="4"></td><td><pre><span class="token comment"># 横向拼接</span></pre></td></tr><tr><td data-num="5"></td><td><pre>pd<span class="token punctuation">.</span>merge<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="6"></td><td><pre></pre></td></tr><tr><td data-num="7"></td><td><pre><span class="token comment"># 横向拼接 DataFrame 对象</span></pre></td></tr><tr><td data-num="8"></td><td><pre>df<span class="token punctuation">.</span>join<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr><tr><td data-num="9"></td><td><pre></pre></td></tr><tr><td data-num="10"></td><td><pre><span class="token comment"># 纵向拼接 DataFrame 对象(注意:pandas 2.0 起已移除 df.append,请改用 pd.concat)</span></pre></td></tr><tr><td data-num="11"></td><td><pre>df<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">)</span></pre></td></tr></tbody></table></figure><p><a target="_blank" rel="noopener" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#">Merge, join, concatenate and compare</a></p>
<h1 id="数据分组"><a class="anchor" href="#数据分组">#</a> 数据分组</h1>
<ul>
<li>分割:按条件把数据分割成多组</li>
<li>应用:为每组单独应用函数</li>
<li>组合:将处理结果组合成一个数据结构</li>
</ul>
<p>具体请参考:<a target="_blank" rel="noopener" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#">Groupby API</a></p>
<h1 id="参考资料"><a class="anchor" href="#参考资料">#</a> 参考资料</h1>
<ul>
<li><a target="_blank" rel="noopener" href="http://pandas.pydata.org/docs/reference/">pandas 官方文档</a></li>
<li><a target="_blank" rel="noopener" href="https://www.pypandas.cn/docs/getting_started/tutorials.html">pandas 教程资料</a></li>
<li><a target="_blank" rel="noopener" href="https://www.pypandas.cn/docs/getting_started/10min.html">十分钟入门 pandas</a></li>
</ul>
</div><footer><div class="meta"><span class="item"><span class="icon"><i class="ic i-calendar-check"></i></span><span class="text">更新于</span><time title="修改时间:2024-06-08 23:09:05" itemprop="dateModified" datetime="2024-06-08T23:09:05+08:00">2024-06-08</time></span></div><div id="copyright"><ul><li class="author"><strong>本文作者:</strong>Jiankychen<i class="ic i-at"><em>@</em></i>Jiankychen's Blog</li><li class="link"><strong>本文链接:</strong><a href="https://jiankychen.github.io/python-pandas.html" title="pandas 基础">https://jiankychen.github.io/python-pandas.html</a></li><li class="license"><strong>版权声明:</strong>本站所有文章除特别声明外,均采用 <a target="_blank" rel="noopener" href="https://creativecommons.org/licenses/by-nc-sa/4.0/deed.zh"><i class="ic i-creative-commons"><em>(CC)</em></i>BY-NC-SA</a> 许可协议。转载请注明出处!</li></ul></div></footer></article></div><div class="post-nav"><div class="item left"><a href="/python-conda.html" rel="prev" itemprop="url" data-background-image="https://img.timelessq.com/images/2022/07/26/8491109c4ae2ac88bbf9659a4f6d5ed2.jpg" title="conda 常用命令"><span class="type">上一篇</span><span class="category"><i class="ic i-flag"></i>Python</span><h3>conda 常用命令</h3></a></div><div class="item right"><a href="/python-oop.html" rel="next" itemprop="url" data-background-image="https://img.timelessq.com/images/2022/07/26/e5221f7d85b0900837a45fb933fa34ec.jpg" title="Python 面向对象"><span class="type">下一篇</span><span class="category"><i class="ic i-flag"></i>Python</span><h3>Python 面向对象</h3></a></div></div><div class="wrap" id="comments"></div></div><div id="sidebar"><div class="inner"><div class="panels"><div class="inner"><div class="contents panel pjax" data-title="文章目录"><ol class="toc"><li class="toc-item toc-level-1"><a class="toc-link" href="#%E5%AE%89%E8%A3%85%E5%8F%8A%E5%AF%BC%E5%85%A5"><span class="toc-number">1.</span> <span class="toc-text"> 安装及导入</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#%E5%AE%89%E8%A3%85"><span 
class="toc-number">1.1.</span> <span class="toc-text"> 安装</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E5%AF%BC%E5%85%A5"><span class="toc-number">1.2.</span> <span class="toc-text"> 导入</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%95%B0%E6%8D%AE%E7%BB%93%E6%9E%84"><span class="toc-number">2.</span> <span class="toc-text"> 数据结构</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#series"><span class="toc-number">2.1.</span> <span class="toc-text"> Series</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#dataframe"><span class="toc-number">2.2.</span> <span class="toc-text"> DataFrame</span></a></li></ol></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%95%B0%E6%8D%AE%E8%BE%93%E5%85%A5"><span class="toc-number">3.</span> <span class="toc-text"> 数据输入</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%95%B0%E6%8D%AE%E8%BE%93%E5%87%BA"><span class="toc-number">4.</span> <span class="toc-text"> 数据输出</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%9F%A5%E7%9C%8B%E6%95%B0%E6%8D%AE"><span class="toc-number">5.</span> <span class="toc-text"> 查看数据</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E7%BC%BA%E5%A4%B1%E5%80%BC%E5%A4%84%E7%90%86"><span class="toc-number">6.</span> <span class="toc-text"> 缺失值处理</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E9%87%8D%E5%A4%8D%E5%80%BC%E5%A4%84%E7%90%86"><span class="toc-number">7.</span> <span class="toc-text"> 重复值处理</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E7%B4%A2%E5%BC%95%E5%8F%98%E6%8D%A2"><span class="toc-number">8.</span> <span class="toc-text"> 索引变换</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%8E%92%E5%BA%8F"><span class="toc-number">9.</span> <span class="toc-text"> 排序</span></a></li><li 
class="toc-item toc-level-1"><a class="toc-link" href="#%E7%89%B9%E5%BE%81%E5%A4%84%E7%90%86"><span class="toc-number">10.</span> <span class="toc-text"> 特征处理</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%95%B0%E6%8D%AE%E6%8B%BC%E6%8E%A5"><span class="toc-number">11.</span> <span class="toc-text"> 数据拼接</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E6%95%B0%E6%8D%AE%E5%88%86%E7%BB%84"><span class="toc-number">12.</span> <span class="toc-text"> 数据分组</span></a></li><li class="toc-item toc-level-1"><a class="toc-link" href="#%E5%8F%82%E8%80%83%E8%B5%84%E6%96%99"><span class="toc-number">13.</span> <span class="toc-text"> 参考资料</span></a></li></ol></div><div class="related panel pjax" data-title="系列文章"><ul><li><a href="/python-basics.html" rel="bookmark" title="Python 基本语法">Python 基本语法</a></li><li><a href="/python-container.html" rel="bookmark" title="Python 数据容器">Python 数据容器</a></li><li><a href="/python-files.html" rel="bookmark" title="Python 文件操作">Python 文件操作</a></li><li><a href="/python-modules.html" rel="bookmark" title="Python 异常、模块、包">Python 异常、模块、包</a></li><li><a href="/python-conda.html" rel="bookmark" title="conda 常用命令">conda 常用命令</a></li><li class="active"><a href="/python-pandas.html" rel="bookmark" title="pandas 基础">pandas 基础</a></li><li><a href="/python-oop.html" rel="bookmark" title="Python 面向对象">Python 面向对象</a></li></ul></div><div class="overview panel" data-title="站点概览"><div class="author" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><img class="image" loading="lazy" decoding="async" itemprop="image" alt="Jiankychen" src="/assets/avatar.webp"><p class="name" itemprop="name">Jiankychen</p><div class="description" itemprop="description"></div></div><nav class="state"><div class="item posts"><a href="/archives/"><span class="count">51</span><span class="name">文章</span></a></div><div class="item categories"><a href="/categories/"><span class="count">8</span><span 
class="name">分类</span></a></div><div class="item tags"><a href="/tags/"><span class="count">20</span><span class="name">标签</span></a></div></nav><div class="social"><a target="_blank" rel="noopener" href="https://github.com/jiankychen" class="item github" title="https://github.com/jiankychen"><i class="ic i-github"></i></a><a href="mailto:[email protected]" class="item email" title="mailto:[email protected]"><i class="ic i-envelope"></i></a><a target="_blank" rel="noopener" href="https://music.163.com/#/user/home?id=447771275" class="item music" title="https://music.163.com/#/user/home?id=447771275"><i class="ic i-cloud-music"></i></a><a target="_blank" rel="noopener" href="https://www.zhihu.com/people/jiankychen" class="item zhihu" title="https://www.zhihu.com/people/jiankychen"><i class="ic i-zhihu"></i></a></div><div class="menu"><li class="item"><a href="/" rel="section"><i class="ic i-home"></i>首页</a></li><li class="item dropdown"><a href="#" onclick="return false;"><i class="ic i-feather"></i>文章</a><ul class="submenu"><li class="item"><a href="/archives/" rel="section"><i class="ic i-list-alt"></i>归档</a></li><li class="item"><a href="/categories/" rel="section"><i class="ic i-th"></i>分类</a></li><li class="item"><a href="/tags/" rel="section"><i class="ic i-tags"></i>标签</a></li></ul></li><li class="item dropdown"><a href="#" onclick="return false;"><i class="ic i-feather"></i>链接</a><ul class="submenu"><li class="item"><a href="/peers/" rel="section"><i class="ic i-magic"></i>链环</a></li><li class="item"><a href="/friends/" rel="section"><i class="ic i-heart"></i>友链</a></li></ul></li><li class="item dropdown"><a href="#" onclick="return false;"><i class="ic i-stars"></i>关于</a><ul class="submenu"><li class="item"><a href="/owner/" rel="section"><i class="ic i-user"></i>关于博主</a></li><li class="item"><a href="/site/" rel="section"><i class="ic i-paw"></i>关于本站</a></li><li class="item"><a href="/update/" rel="section"><i class="ic 
i-cloud"></i>更新日志</a></li></ul></li></div></div></div></div><ul id="quick"><li class="prev pjax"><a href="/python-conda.html" rel="prev" title="上一篇"><i class="ic i-chevron-left"></i></a></li><li class="up"><i class="ic i-arrow-up"></i></li><li class="down"><i class="ic i-arrow-down"></i></li><li class="next pjax"><a href="/python-oop.html" rel="next" title="下一篇"><i class="ic i-chevron-right"></i></a></li><li class="percent"></li></ul></div></div><div class="dimmer"></div></div></main><footer id="footer"><div class="inner"><div class="widgets"><div class="rpost pjax"><h2>随机文章</h2><ul><li class="item"><div class="breadcrumb"><a href="/categories/Coding/" title="分类于Coding">Coding</a></div><span><a href="/leetcode-analog.html">LeetCode - 模拟专题</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Coding/" title="分类于Coding">Coding</a></div><span><a href="/leetcode-binarysearch.html">LeetCode - 二分查找专题</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Data-Structure/" title="分类于Data Structure">Data Structure</a></div><span><a href="/traverse.html">回溯</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Python/" title="分类于Python">Python</a></div><span><a href="/python-conda.html">conda 常用命令</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Data-Structure/" title="分类于Data Structure">Data Structure</a></div><span><a href="/sort-algorithm.html">排序</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Coding/" title="分类于Coding">Coding</a></div><span><a href="/leetcode-list.html">LeetCode - 链表专题</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Data-Structure/" title="分类于Data Structure">Data Structure</a></div><span><a href="/hash-table.html">哈希表</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Coding/" title="分类于Coding">Coding</a></div><span><a href="/leetcode-greedy.html">LeetCode - 
贪心专题</a></span></li><li class="item"><div class="breadcrumb"></div><span><a href="/job.html">23 求职笔面试</a></span></li><li class="item"><div class="breadcrumb"><a href="/categories/Data-Structure/" title="分类于Data Structure">Data Structure</a></div><span><a href="/binary-tree.html">二叉树</a></span></li></ul></div><div class="rpost pjax"><h2>最新评论</h2><ul class="leancloud-recent-comment" id="new-comment"></ul></div></div><div class="status"><div class="copyright">© 2021 -<span itemprop="copyrightYear">2024</span><span class="with-love"><i class="ic i-sakura rotate"></i></span><span class="author" itemprop="copyrightHolder">Jiankychen @ Jiankychen</span></div><div class="count"><span class="post-meta-item-icon"><i class="ic i-chart-area"></i></span><span title="站点总字数">955k 字</span><span class="post-meta-divider"> | </span><span class="post-meta-item-icon"><i class="ic i-coffee"></i></span><span title="站点阅读时长">14:28</span></div><div class="powered-by">基于 <a target="_blank" rel="noopener" href="https://hexo.io/">Hexo</a> & Theme.<a target="_blank" rel="noopener" href="https://github.com/theme-shoka-x/hexo-theme-shokaX/">ShokaX</a></div></div><script src="https://unpkg.com/[email protected]/bsz.pure.mini.js"></script><div id="busuanzi-wrap"><span class="ic i-eye"></span><span id="busuanzi_container_site_pv">本站总访问量 <span id="busuanzi_value_site_pv"></span> 次</span> | <span class="ic i-user"></span><span id="busuanzi_container_site_uv">本站总访客量 <span id="busuanzi_value_site_uv"></span> 次</span></div></div></footer></div><script data-config="" type="text/javascript">var LOCAL = {
ispost: true,
path: `/python-pandas`,
favicon: {
show: `Jiankychen`,
hide: `Jiankychen`
},
search: {
placeholder: "文章搜索",
empty: "关于 「 ${query} 」,什么也没搜到",
stats: "${time} ms 内找到 ${hits} 条结果"
},
copy_tex: false,
katex: false,
mermaid: false,
audio: undefined,
fancybox: true,
nocopy: false,
outime: true,
template: `<div class="note warning"><p><span class="label warning">文章时效性提示</span><br>这是一篇发布于 {{publish}} 天前,最后一次更新在 {{updated}} 天前的文章,部分信息可能已经发生改变,请注意甄别。</p></div>`,
quiz: {
choice: `单选题`,
multiple: `多选题`,
true_false: `判断题`,
essay: `问答题`,
gap_fill: `填空题`,
mistake: `错题备注`
},
ignores: [
(uri) => uri.includes('#'),
(uri) => new RegExp(LOCAL.path + '$').test(uri),
[]
]
};
</script>
<script src="https://lf9-cdn-tos.bytecdntp.com/cdn/expire-6-M/pace/1.2.4/pace.min.js" async></script>
<!-- Polyfills are loaded from the cdnjs mirror of the polyfill service (same path and query string): the polyfill.io domain changed ownership and served malicious code in 2024, so it must not be referenced. -->
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=default,fetch" defer></script>
<!-- Module scripts are deferred by default, so no explicit defer attribute is needed here. -->
<script src="/js/siteInit.js?v=0.4.2" type="module" fetchpriority="high"></script>
</body></html>