mirror of
https://github.com/beyondx/Notes.git
synced 2026-02-03 18:33:26 +08:00
add lots file of APUE
This commit is contained in:
@@ -6,8 +6,6 @@ Creation-Date: 2011-10-07T18:42:52+08:00
|
||||
Created Friday 07 October 2011
|
||||
http://yibin.us/archives/6817
|
||||
|
||||
在一些python讨论版里,经常会见到一些“月经帖”,类似于“我用python读取一个文件乱码”,然后就会抱怨python的编码很麻烦,其实不是python编码难搞定,而是没有真正理解python的编码。
|
||||
|
||||
如在windows环境下的以下示例代码:
|
||||
|
||||
#!/usr/bin/env python
|
||||
@@ -21,9 +19,10 @@ if __name__=='__main__':
|
||||
do()
|
||||
|
||||
此时的ansi.txt编码为ansi,我们在cmd窗口执行,看到如下结果:
|
||||
{{~/sync/notes/zim/python/python中的编码/1.png}}
|
||||
{{./1.png}}
|
||||
|
||||
此时一切正常,但,如果还是用上面的脚本去读取utf8.txt,文件是utf8编码,就会得到下面的结果:
|
||||
{{~/sync/notes/zim/python/python中的编码/2.png}}
|
||||
{{./2.png}}
|
||||
经典的“乱码”出现了,有朋友可能会说了,我在python脚本里指定编码应该就解决了,于是:
|
||||
|
||||
#!/usr/bin/env python
|
||||
@@ -40,7 +39,7 @@ if __name__=='__main__':
|
||||
do()
|
||||
|
||||
再次运行:
|
||||
{{~/sync/notes/zim/python/python中的编码/3.png}}
|
||||
{{./3.png}}
|
||||
OMG,还是乱码。。。。
|
||||
|
||||
能不能正常输出中文不取决于#coding=utf-8,也不取决于目标文件的编码,而是取决于你的__终端输出设备__,这里就是CMD窗口,CMD窗口是不支持UTF-8的,它只支持__GBK__,所以,我们要转码。
|
||||
@@ -64,7 +63,7 @@ if __name__=='__main__':
|
||||
do()
|
||||
|
||||
结果:
|
||||
{{~/sync/notes/zim/python/python中的编码/4.png}}
|
||||
{{./4.png}}
|
||||
正常输出。
|
||||
|
||||
做一个小结:
|
||||
|
||||
@@ -122,13 +122,3 @@ Base64编码可用于在HTTP环境下传递较长的__标识信息__。例如,
|
||||
MIME::Base64 Perl module
|
||||
Firefox extension
|
||||
emacs函数
|
||||
|
||||
===== 参见 =====
|
||||
|
||||
Radix-64
|
||||
ASCII85
|
||||
Quoted-printable
|
||||
uuencode
|
||||
yEnc
|
||||
8BITMIME
|
||||
URL
|
||||
|
||||
@@ -9,10 +9,13 @@ Created Friday 14 October 2011
|
||||
Python 2.7.2 (default, Jun 29 2011, 11:17:09)
|
||||
[GCC 4.6.1] on linux2
|
||||
Type "help", "copyright", "credits" or "license" for more information.
|
||||
>>> '张俊' #python2 会自动将字符串转换为合适的编码字节字符串
|
||||
|
||||
>>> '张俊' #python2 会自动将字符串转换为合适编码的字节字符串
|
||||
'\xe5\xbc\xa0\xe4\xbf\x8a' #自动转换为utf-8编码的字节字符串
|
||||
|
||||
>>> u'张俊' #显式指定字符串类型为unicode类型, 此类型字符串没有编码,保存的是字符在unicode**字符集中的代码点(序号)**
|
||||
u'\u5f20\u4fca'
|
||||
|
||||
>>> '张俊'.encode('utf-8') #python2 已经自动将其转化为utf-8类型编码,因此再次编码(python2会将该字符串当作用ascii或unicode编码过)会出现错误。
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
|
||||
7
Zim/Programme/python/python笔记.txt
Normal file
7
Zim/Programme/python/python笔记.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:20:28+08:00
|
||||
|
||||
====== python笔记 ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
40
Zim/Programme/python/python笔记/dict.txt
Normal file
40
Zim/Programme/python/python笔记/dict.txt
Normal file
@@ -0,0 +1,40 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:21:05+08:00
|
||||
|
||||
====== dict ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
Help on class dict in module __builtin__:
|
||||
|
||||
class dict(object)
|
||||
| dict() -> new empty dictionary
|
||||
| dict(mapping) -> new dictionary initialized from **a mapping object**'s #mapping为__一个__pairs对象
|
||||
| (key, value) pairs
|
||||
| dict(iterable) -> new dictionary initialized as if via:
|
||||
| d = {}
|
||||
| for **k, v** in iterable: #迭代器对象每次返回的元素必须是一个容器类型,__容器中元素的个数为2__.**如[a,b], "ab",(a,b)**
|
||||
| d[k] = v
|
||||
| dict(__**kwargs)__ -> new dictionary initialized with the name=value pairs
|
||||
| in the keyword argument list. For example: dict(one=1, two=2)
|
||||
|
|
||||
| Methods defined here:
|
||||
|
||||
|
||||
>>> dict(__[('sape', 4139), ('guido', 4127), ('jack', 4098)]__)
|
||||
{'sape': 4139, 'jack': 4098, 'guido': 4127}
|
||||
|
||||
>>> dict([(x, x**2) for x in (2, 4, 6)]) # use a list comprehension
|
||||
{2: 4, 4: 16, 6: 36}
|
||||
|
||||
>>> dict(sape=4139, guido=4127, jack=4098)
|
||||
{'sape': 4139, 'jack': 4098, 'guido': 4127}
|
||||
tel = {'jack': 4098, 'sape': 4139}
|
||||
|
||||
>>> dc=dict(["df","12"]);dc #["df","12"]为一个可迭代对象,每次返回的元素为两个字符的str,所以可以被unpack给key,value
|
||||
{'1': '2', 'd': 'f'}
|
||||
>>> dc=dict(["df",__"123"__]);dc
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
ValueError: dictionary update sequence element __#1 has length 3; 2 is required__
|
||||
>>>
|
||||
20
Zim/Programme/python/python笔记/float.txt
Normal file
20
Zim/Programme/python/python笔记/float.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:20:56+08:00
|
||||
|
||||
====== float ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
>>> float("0xff")
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
ValueError: invalid literal for float(): 0xff
|
||||
>>>
|
||||
|
||||
>>> __float.fromhex("0xfff")__
|
||||
4095.0
|
||||
>>>
|
||||
|
||||
>>> float("0.111")
|
||||
0.111
|
||||
>>>
|
||||
37
Zim/Programme/python/python笔记/int_long.txt
Normal file
37
Zim/Programme/python/python笔记/int_long.txt
Normal file
@@ -0,0 +1,37 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:20:52+08:00
|
||||
|
||||
====== int long ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
>>> 1&2 #按__位与__
|
||||
0
|
||||
|
||||
>>> 0xff&0xf1 #按位与
|
||||
241
|
||||
>>> 0xff&0xf0
|
||||
240
|
||||
>>> __hex__(0xff&0xf0) #返回的__字符串__
|
||||
'0xf0'
|
||||
__与hex()类似, bin(), oct()等返回的都是int或long型的字符串表示__
|
||||
|
||||
>>> 1&&2 __#python没有&&, ||, !逻辑运算符,但是有and, or, not;其中and和or返回的是最后一个被求值的操作数,not返回True或False__
|
||||
File "<stdin>", line 1
|
||||
1&&2
|
||||
^
|
||||
SyntaxError: invalid syntax
|
||||
|
||||
>>> 1 and 2 __#返回的是最后一个元素的内容而不是True或False,这里为2__
|
||||
2
|
||||
|
||||
>>> 'fff' & 'dfad' __#str类型没有定义__and__方法,所以没有位运算__
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
TypeError: unsupported operand type(s) for &: 'str' and 'str'
|
||||
>>> help(str)
|
||||
|
||||
>>> 'fff' and 'dfad'
|
||||
'dfad'
|
||||
>>>
|
||||
|
||||
25
Zim/Programme/python/python笔记/list.txt
Normal file
25
Zim/Programme/python/python笔记/list.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:21:01+08:00
|
||||
|
||||
====== list ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
>>> l=range(1,10)
|
||||
>>> l
|
||||
[1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
>>> __l[1:8:-1]__
|
||||
[]
|
||||
>>> l[8:1:-1]
|
||||
[9, 8, 7, 6, 5, 4, 3]
|
||||
>>> __l[::-1]__
|
||||
[9, 8, 7, 6, 5, 4, 3, 2, 1]
|
||||
>>> l[-1::-1]
|
||||
[9, 8, 7, 6, 5, 4, 3, 2, 1]
|
||||
>>> __l[-1:-9:-1]__
|
||||
[9, 8, 7, 6, 5, 4, 3, 2]
|
||||
>>> l[:]
|
||||
[1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
>>> l[::]
|
||||
[1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
|
||||
18
Zim/Programme/python/python笔记/set.txt
Normal file
18
Zim/Programme/python/python笔记/set.txt
Normal file
@@ -0,0 +1,18 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:21:08+08:00
|
||||
|
||||
====== set ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
>>> set1=set(1,2,3)
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
TypeError: set expected __at most 1 arguments,__ got 3
|
||||
|
||||
>>> set1=set((1,2,3))
|
||||
>>> set1
|
||||
set([1, 2, 3])
|
||||
>>>
|
||||
|
||||
|
||||
40
Zim/Programme/python/python笔记/str.txt
Normal file
40
Zim/Programme/python/python笔记/str.txt
Normal file
@@ -0,0 +1,40 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:20:41+08:00
|
||||
|
||||
====== str ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
| join(...)
|
||||
| S.join(iterable) -> string
|
||||
|
|
||||
| Return a string which is the concatenation of __the strings in the__
|
||||
__ | iterable__. The separator between elements is S.
|
||||
|
||||
iterable迭代器对象每次返回的__必须是字符串对象__。
|
||||
|
||||
>>> ":".join("abcd")
|
||||
'a:b:c:d'
|
||||
|
||||
>>> ":".join(['a','b','c','d'])
|
||||
'a:b:c:d'
|
||||
|
||||
>>> ":".join(['a',__123__,'c'])
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
TypeError: sequence item 1: __expected string__, int found
|
||||
|
||||
>>> ":".join(['a',['ab'],'c'])
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
TypeError: sequence item 1: expected string, list found
|
||||
>>>
|
||||
|
||||
| rsplit(...)
|
||||
| S.rsplit([sep [,maxsplit]]) -> list of strings
|
||||
|
|
||||
| Return a list of the words in the string S, using sep as the
|
||||
| delimiter string, starting at the end of the string and working
|
||||
| to the front. If maxsplit is given, at most maxsplit splits are
|
||||
| done. If sep is not specified or is __None__, any whitespace string
|
||||
| is a separator.
|
||||
38
Zim/Programme/python/python笔记/unpack.txt
Normal file
38
Zim/Programme/python/python笔记/unpack.txt
Normal file
@@ -0,0 +1,38 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:47:32+08:00
|
||||
|
||||
====== unpack ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
>>> for k,v in ["fdf",23,"dfdf",33]:
|
||||
... print k,v
|
||||
...
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
ValueError: __too many__ values to unpack
|
||||
|
||||
顺序容器类型如str, list, tuple__每次迭代时只能返回其中的一个元素__。
|
||||
所以第一次循环返回**"fdf"**,但是它有三个元素,而左边只有两个
|
||||
变量。
|
||||
|
||||
>>> for k,v in "dfdf":
|
||||
... print k,v
|
||||
...
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
ValueError: __need more than 1 value__ to unpack
|
||||
|
||||
字符串迭代时,每次返回其中的一个字符。所以最多只能unpack给一个变量。
|
||||
|
||||
>>> k,v="dfdf"
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
ValueError: __too many values to unpack__
|
||||
|
||||
unpack一个顺序容器类型时,左边变量的数目必须要与容器中元素的个数相同。
|
||||
|
||||
>>> k,v="df"
|
||||
>>> print k,v
|
||||
d f
|
||||
>>>
|
||||
7
Zim/Programme/python/python笔记/内置函数.txt
Normal file
7
Zim/Programme/python/python笔记/内置函数.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T13:21:30+08:00
|
||||
|
||||
====== 内置函数 ======
|
||||
Created Thursday 04 October 2012
|
||||
|
||||
121
Zim/Programme/python/编写_Unix_管道风格的_Python_代码.txt
Normal file
121
Zim/Programme/python/编写_Unix_管道风格的_Python_代码.txt
Normal file
@@ -0,0 +1,121 @@
|
||||
Content-Type: text/x-zim-wiki
|
||||
Wiki-Format: zim 0.4
|
||||
Creation-Date: 2012-10-04T22:01:26+08:00
|
||||
|
||||
====== 编写 Unix 管道风格的 Python 代码 ======
|
||||
Created Thursday 04 October 2012
|
||||
http://www.oschina.net/question/54100_11910
|
||||
|
||||
先推荐一份幻灯片,David Beazley ("Python Essential Reference", PLY 的作者) 在 PyCon’2008 上报告的幻灯片,强烈推荐!!这篇文章的很多内容都来自或者受这份幻灯片的启发而来。
|
||||
|
||||
在上一篇文章里介绍了 Unix 管道的好处,那可不可以在写程序时也使用这样的思想呢?当然可以。看过 SICP 就知道,其实函数式编程中的 __map, filter__ 都可以看作是管道思想的应用。但其实管道的思想不仅可以在函数式语言中使用,只要语言支持定义函数,有能够存放一组数据的数据结构,就可以使用管道的思想。
|
||||
|
||||
一个日志处理任务
|
||||
|
||||
这里直接以前面推荐的幻灯片里的例子来说明,应用场景如下:
|
||||
|
||||
某个目录及子目录下有一些 web 服务器的日志文件,日志文件名以 access-log 开头
|
||||
日志格式如下
|
||||
81.107.39.38 - ... "GET /ply/ply.html HTTP/1.1" 200 97238
|
||||
81.107.39.38 - ... "GET /ply HTTP/1.1" 304 -
|
||||
其中最后一列数字为发送的字节数,若为 ‘-’ 则表示没有发送数据
|
||||
|
||||
目标是算出总共发送了多少字节的数据,实际上也就是要把日志记录的每一行的最后一列数值加起来
|
||||
我不直接展示如何用 Unix 管道的风格来处理这个问题,而是先给出一些“不那么好”的代码,指出它们的问题,最后再展示管道风格的代码,并介绍如何使用 generator 来避免效率上的问题。想直接看管道风格的,点这里。
|
||||
|
||||
问题并不复杂,几个 for 循环就能搞定:
|
||||
|
||||
sum = 0
|
||||
for path, dirlist, filelist in __os.walk(top)__:
|
||||
for name in __fnmatch.filter__(filelist, "access-log*"):
|
||||
# 对子目录中的每个日志文件进行处理
|
||||
with open(name) as f:
|
||||
for line in f:
|
||||
if line[-1] == '-':
|
||||
continue
|
||||
else:
|
||||
sum += int(line__.rsplit(None, 1)__[1])
|
||||
|
||||
利用 os.walk 这个问题解决起来很方便,由此也可以看出 python 的 for 语句做遍历是多么的方便,不需要额外控制循环次数的变量,省去了设置初始值、更新、判断循环结束条件等工作,相比 C/C++/Java 这样的语言真是太方便了。看起来一切都很美好。
|
||||
|
||||
然而,设想以后有了新的统计任务,比如:
|
||||
|
||||
1. 统计某个特定页面的访问次数
|
||||
2. 处理另外的一些日志文件,日志文件名字以 error-log 开头
|
||||
|
||||
完成这些任务直接拿上面的代码过来改改就可以了,文件名的 pattern 改一下,处理每个文件的代码改一下。其实每次任务的处理中,找到特定名字为特定 pattern 的文件的代码是一样的,直接修改之前的代码其实就引入了重复。
|
||||
|
||||
如果重复的代码量很大,我们很自然的会注意到。然而 python 的 for 循环实在太方便了,像这里找文件的代码一共就两行,哪怕重写一遍也不会觉得太麻烦。for 循环的方便使得我们会忽略这样简单代码的重复。然而,再怎么方便好用,for 循环无法重用,只有把它放到函数中才能进行重用。
|
||||
|
||||
(先考虑下你会如何避免这里的代码的重复。下面马上出现的代码并不好,是“误导性”的代码,我会在之后再给出“更好”的代码。)
|
||||
|
||||
因此,我们__把上面代码中不变的部分提取成一个通用的函数,可变的部分以参数的形式传入__,得到下面的代码。
|
||||
|
||||
def generic_process(topdir, filepat, processfunc):
|
||||
for path, dirlist, filelist in os.walk(topdir):
|
||||
for name in fnmatch.filter(filelist, filepat):
|
||||
with open(name) as f:
|
||||
processfunc(f)
|
||||
|
||||
sum = 0
|
||||
# 很遗憾,python 对 closure 中的变量不能进行赋值操作,
|
||||
# 因此这里只能使用全局变量
|
||||
def add_count(f):
|
||||
global sum
|
||||
for line in f:
|
||||
if line[-1] == '-':
|
||||
continue
|
||||
else:
|
||||
sum += int(line.rsplit(None, 1)[1])
|
||||
|
||||
generic_process('logdir', 'access-log*', add_count)
|
||||
|
||||
看起来不变和可变的部分分开了,然而 generic_process 的设计并不好。它除了寻找文件以外还调用了日志文件处理函数,因此在其他任务中很可能就无法使用。另外 add_count 的参数必须是 file like object,因此测试时不能简单的直接使用字符串。
|
||||
|
||||
===== 管道风格的程序 =====
|
||||
下面考虑用 Unix 的工具和管道我们会如何完成这个任务:
|
||||
|
||||
find logdir -name "access-log*" | \
|
||||
xargs cat | \
|
||||
grep '[^-]$' | \
|
||||
awk '{ total += $NF } END { print total }'
|
||||
|
||||
find 根据文件名 pattern 找到文件,cat 把所有文件内容合并输出到 stdout,grep 从 stdin 读入,过滤掉行末为 ‘-’ 的行,awk 提取每行最后一列,将数值相加,最后打印出结果。(省掉 cat 是可以的,但这样一来 grep 就需要直接读文件而不是只从标准输入读。)
|
||||
|
||||
我们可以在 python 代码中模拟这些工具,__Unix 的工具通过文本来传递结果,在 python 中可以使用 list__。
|
||||
|
||||
def find(topdir, filepat):
|
||||
files = []
|
||||
for path, dirlist, filelist in os.walk(topdir):
|
||||
for name in fnmatch.filter(filelist, filepat):
|
||||
files.append(name)
|
||||
return files
|
||||
|
||||
def cat(files):
|
||||
lines = []
|
||||
for file in files:
|
||||
with open(file) as f:
|
||||
for line in f:
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
def grep(pattern, lines):
|
||||
result = []
|
||||
import re
|
||||
pat = re.compile(pattern)
|
||||
for line in lines:
|
||||
if pat.search(line):
|
||||
result.append(line)
|
||||
return result
|
||||
|
||||
lines = grep('[^-]$', cat(find('logdir', 'access-log*')))
|
||||
col = (line.rsplit(None, 1)[1] for line in lines)
|
||||
print sum(int(c) for c in col)
|
||||
|
||||
有了 find, cat, grep 这三个函数,只需要连续调用就可以像 Unix 的管道一样将这些函数组合起来。数据在管道中的变化如下图(简洁起见,过滤器直接标在箭头上 ):
|
||||
|
||||
{{./1.gif}}
|
||||
|
||||
看起来现在的代码行数比最初直接用 for 循环的代码要多,但现在的代码就像 Unix 的那些小工具一样,每一个都更加可能被用到。我们可以把更多常用的 Unix 工具用 Python 来模拟,从而在 Python 代码中以 Unix 管道的风格来编写代码。
|
||||
|
||||
不过上面的代码性能很差,多个临时的 list 被创建。解决的办法是用 generator,因为篇幅比较长,具体做法放到下一篇文章中。
|
||||
BIN
Zim/Programme/python/编写_Unix_管道风格的_Python_代码/1.gif
Normal file
BIN
Zim/Programme/python/编写_Unix_管道风格的_Python_代码/1.gif
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 6.7 KiB |
Reference in New Issue
Block a user