add lots file of APUE

2026-02-03 18:33:26 +08:00 · 2012-10-30 20:31:20 +08:00
parent 05e8ae5877
commit 6642e173f9
113 changed files with 4954 additions and 181 deletions
--- a/Zim/Programme/python/python中的编码.txt
+++ b/Zim/Programme/python/python中的编码.txt
@@ -6,8 +6,6 @@ Creation-Date: 2011-10-07T18:42:52+08:00
 Created Friday 07 October 2011
 http://yibin.us/archives/6817

-在一些python讨论版里，经常会见到一些“月经帖”，类似于“我用python读取一个文件乱码”，然后就会抱怨python的编码很麻烦，其实不是python编码难搞定，而是没有真正理解python的编码。
-
 如在windows环境下的以下示例代码：

 #!/usr/bin/env python
@@ -21,9 +19,10 @@ if __name__=='__main__':
    do()

 此时的ansi.txt编码为ansi,我们在cmd窗口执行，看到如下结果：
-{{~/sync/notes/zim/python/python中的编码/1.png}}
+{{./1.png}}
+
 此时一切正常，但，如果还是用上面的脚本去读取utf8.txt，文件是utf8编码，就会得到下面的结果：
-{{~/sync/notes/zim/python/python中的编码/2.png}}
+{{./2.png}}
 经典的“乱码”出现了，有朋友可能会说了，我在python脚本里指定编码应该就解决了，于是：
 	
 #!/usr/bin/env python
@@ -40,7 +39,7 @@ if __name__=='__main__':
    do()

 再次运行：
-{{~/sync/notes/zim/python/python中的编码/3.png}}
+{{./3.png}}
 OMG，还是乱码。。。。

 能不能正常输出中文不取决于#coding=utf-8，也不取决于目标文件的编码，而是取决于你的__终端输出设备__，这里就是CMD窗口，CMD窗口是不支持UTF-8的，它只支持__GBK__，所以，我们要转码。
@@ -64,7 +63,7 @@ if __name__=='__main__':
    do()

 结果：
-{{~/sync/notes/zim/python/python中的编码/4.png}}
+{{./4.png}}
 正常输出。

 做一个小结：
--- a/Zim/Programme/python/python中的编码/Base64.txt
+++ b/Zim/Programme/python/python中的编码/Base64.txt
@@ -122,13 +122,3 @@ Base64编码可用于在HTTP环境下传递较长的__标识信息__。例如，
        MIME::Base64 Perl module
        Firefox extension
        emacs函数
-
-===== 参见 =====
-
-    Radix-64
-    ASCII85
-    Quoted-printable
-    uuencode
-    yEnc
-    8BITMIME
-    URL
--- a/Zim/Programme/python/python中的编码/示例.txt
+++ b/Zim/Programme/python/python中的编码/示例.txt
@@ -9,10 +9,13 @@ Created Friday 14 October 2011
 Python 2.7.2 (default, Jun 29 2011, 11:17:09) 
 [GCC 4.6.1] on linux2
 Type "help", "copyright", "credits" or "license" for more information.
->>> '张俊'     #python2 会自动将字符串转换为合适的编码字节字符串
+
+>>> '张俊'     #python2 会自动将字符串转换为合适编码的字节字符串
 '\xe5\xbc\xa0\xe4\xbf\x8a'    #自动转换为utf-8编码的字节字符串
+
 >>> u'张俊'             #显式指定字符串类型为unicode类型， 此类型字符串没有编码，保存的是字符在unicode**字符集中的代码点(序号)**
 u'\u5f20\u4fca'
+
 >>> '张俊'.encode('utf-8')   #python2 已经自动将其转化为utf-8类型编码，因此再次编码(python2会将该字符串当作用ascii或unicode编码过)会出现错误。
 Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
--- a/Zim/Programme/python/python笔记.txt
+++ b/Zim/Programme/python/python笔记.txt
@@ -0,0 +1,7 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:20:28+08:00
+
+====== python笔记 ======
+Created Thursday 04 October 2012
+
--- a/Zim/Programme/python/python笔记/dict.txt
+++ b/Zim/Programme/python/python笔记/dict.txt
@@ -0,0 +1,40 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:21:05+08:00
+
+====== dict ======
+Created Thursday 04 October 2012
+
+Help on class dict in module __builtin__:
+
+class dict(object)
+ |  dict() -> new empty dictionary
+ |  dict(mapping) -> new dictionary initialized from **a mapping object**'s   #mapping为__一个__pairs对象
+ |      (key, value) pairs
+ |  dict(iterable) -> new dictionary initialized as if via:
+ |      d = {}
+ |      for **k, v** in iterable:    #迭代器对象每次返回的元素必须是一个容器类型，__容器中元素的个数为2__.**如[a,b], "ab",(a,b)**
+ |          d[k] = v
+ |  dict(__**kwargs)__ -> new dictionary initialized with the name=value pairs
+ |      in the keyword argument list.  For example:  dict(one=1, two=2)
+ |  
+ |  Methods defined here:
+
+
+>>> dict(__[('sape', 4139), ('guido', 4127), ('jack', 4098)]__)
+{'sape': 4139, 'jack': 4098, 'guido': 4127}
+
+>>> dict([(x, x**2) for x in (2, 4, 6)])     # use a list comprehension
+{2: 4, 4: 16, 6: 36}
+
+>>> dict(sape=4139, guido=4127, jack=4098)
+{'sape': 4139, 'jack': 4098, 'guido': 4127}
+tel = {'jack': 4098, 'sape': 4139}
+
+>>> dc=dict(["df","12"]);dc   #["df","12"]为一个可迭代对象，每次返回的元素为两个字符的str，所以可以被unpack给key,value
+{'1': '2', 'd': 'f'}
+>>> dc=dict(["df",__"123"__]);dc
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+ValueError: dictionary update sequence element __#1 has length 3; 2 is required__
+>>> 
--- a/Zim/Programme/python/python笔记/float.txt
+++ b/Zim/Programme/python/python笔记/float.txt
@@ -0,0 +1,20 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:20:56+08:00
+
+====== float ======
+Created Thursday 04 October 2012
+
+>>> float("0xff")
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+ValueError: invalid literal for float(): 0xff
+>>> 
+
+>>> __float.fromhex("0xfff")__
+4095.0
+>>> 
+
+>>> float("0.111")
+0.111
+>>> 
--- a/Zim/Programme/python/python笔记/int_long.txt
+++ b/Zim/Programme/python/python笔记/int_long.txt
@@ -0,0 +1,37 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:20:52+08:00
+
+====== int long ======
+Created Thursday 04 October 2012
+
+>>> 1&2  #按__位与__
+0
+
+>>> 0xff&0xf1  #按位与
+241
+>>> 0xff&0xf0
+240
+>>> __hex__(0xff&0xf0)  #返回的__字符串__
+'0xf0'   
+__与hex()类似, bin(), oct()等返回的都是int或long型数值的字符串表示__
+
+>>> 1&&2   __#python没有&&, ||, !逻辑运算符，但是有and, or, not；其中and和or返回的是最后一个被求值的操作数，not返回的是True或False__
+  File "<stdin>", line 1
+    1&&2
+      ^
+SyntaxError: invalid syntax
+
+>>> 1 and 2  __#返回的是最后一个元素的内容而不是True或False，这里为2__
+2
+
+>>> 'fff' & 'dfad'  __#str类型没有定义__and__方法，所以没有位运算__
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+TypeError: unsupported operand type(s) for &: 'str' and 'str'
+>>> help(str)
+
+>>> 'fff' and 'dfad'
+'dfad'
+>>> 
+
--- a/Zim/Programme/python/python笔记/list.txt
+++ b/Zim/Programme/python/python笔记/list.txt
@@ -0,0 +1,25 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:21:01+08:00
+
+====== list ======
+Created Thursday 04 October 2012
+
+>>> l=range(1,10)
+>>> l
+[1, 2, 3, 4, 5, 6, 7, 8, 9]
+>>> __l[1:8:-1]__
+[]
+>>> l[8:1:-1]
+[9, 8, 7, 6, 5, 4, 3]
+>>> __l[::-1]__
+[9, 8, 7, 6, 5, 4, 3, 2, 1]
+>>> l[-1::-1]
+[9, 8, 7, 6, 5, 4, 3, 2, 1]
+>>> __l[-1:-9:-1]__
+[9, 8, 7, 6, 5, 4, 3, 2]
+>>> l[:]
+[1, 2, 3, 4, 5, 6, 7, 8, 9]
+>>> l[::]
+[1, 2, 3, 4, 5, 6, 7, 8, 9]
+
--- a/Zim/Programme/python/python笔记/set.txt
+++ b/Zim/Programme/python/python笔记/set.txt
@@ -0,0 +1,18 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:21:08+08:00
+
+====== set ======
+Created Thursday 04 October 2012
+
+>>> set1=set(1,2,3)
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+TypeError: set expected __at most 1 arguments,__ got 3
+
+>>> set1=set((1,2,3))
+>>> set1
+set([1, 2, 3])
+>>> 
+
+
--- a/Zim/Programme/python/python笔记/str.txt
+++ b/Zim/Programme/python/python笔记/str.txt
@@ -0,0 +1,40 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:20:41+08:00
+
+====== str ======
+Created Thursday 04 October 2012
+
+ |  join(...)
+ |      S.join(iterable) -> string
+ |      
+ |      Return a string which is the concatenation of __the strings in the__
+__ |      iterable__.  The separator between elements is S.
+
+iterable迭代器对象每次返回的__必须是字符串对象__。
+
+>>> ":".join("abcd")
+'a:b:c:d'
+
+>>> ":".join(['a','b','c','d'])
+'a:b:c:d'
+
+>>> ":".join(['a',__123__,'c'])
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+TypeError: sequence item 1: __expected string__, int found
+
+>>> ":".join(['a',['ab'],'c'])
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+TypeError: sequence item 1: expected string, list found
+>>> 
+
+ |  rsplit(...)
+ |      S.rsplit([sep [,maxsplit]]) -> list of strings
+ |      
+ |      Return a list of the words in the string S, using sep as the
+ |      delimiter string, starting at the end of the string and working
+ |      to the front.  If maxsplit is given, at most maxsplit splits are
+ |      done. If sep is not specified or is __None__, any whitespace string
+ |      is a separator.
--- a/Zim/Programme/python/python笔记/unpack.txt
+++ b/Zim/Programme/python/python笔记/unpack.txt
@@ -0,0 +1,38 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:47:32+08:00
+
+====== unpack ======
+Created Thursday 04 October 2012
+
+>>> for k,v in ["fdf",23,"dfdf",33]:
+...   print k,v
+... 
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+ValueError: __too many__ values to unpack
+
+顺序容器类型如str, list, tuple__每次迭代时只能返回其中的一个元素__。
+所以第一次循环返回**"fdf"**，但是它有三个元素，而左边只有两个
+变量，无法一一对应赋值。
+
+>>> for k,v in "dfdf":
+...   print k,v
+... 
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+ValueError: __need more than 1 value__ to unpack
+
+字符串迭代时，每次返回其中的一个字符。所以最多只能unpack给一个变量。
+
+>>> k,v="dfdf"
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+ValueError: __too many values to unpack__
+
+unpack一个顺序容器类型时，左边变量的数目必须要与容器中元素的个数相同。
+
+>>> k,v="df"
+>>> print k,v
+d f
+>>> 
--- a/Zim/Programme/python/python笔记/内置函数.txt
+++ b/Zim/Programme/python/python笔记/内置函数.txt
@@ -0,0 +1,7 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T13:21:30+08:00
+
+====== 内置函数 ======
+Created Thursday 04 October 2012
+
--- a/Zim/Programme/python/编写_Unix_管道风格的_Python_代码.txt
+++ b/Zim/Programme/python/编写_Unix_管道风格的_Python_代码.txt
@@ -0,0 +1,121 @@
+Content-Type: text/x-zim-wiki
+Wiki-Format: zim 0.4
+Creation-Date: 2012-10-04T22:01:26+08:00
+
+====== 编写 Unix 管道风格的 Python 代码 ======
+Created Thursday 04 October 2012
+http://www.oschina.net/question/54100_11910
+
+先推荐一份幻灯片，David Beazley ("Python Essential Reference", PLY 的作者) 在 PyCon’2008 上报告的幻灯片，强烈推荐！！这篇文章的很多内容都来自或者受这份幻灯片的启发而来。
+
+在上一篇文章里介绍了 Unix 管道的好处，那可不可以在写程序时也使用这样的思想呢？当然可以。看过 SICP 就知道，其实函数式编程中的 __map, filter__ 都可以看作是管道思想的应用。但其实管道的思想不仅可以在函数式语言中使用，只要语言支持定义函数，有能够存放一组数据的数据结构，就可以使用管道的思想。
+
+一个日志处理任务
+
+这里直接以前面推荐的幻灯片里的例子来说明，应用场景如下：
+
+某个目录及子目录下有一些 web 服务器的日志文件，日志文件名以 access-log 开头
+日志格式如下
+81.107.39.38 - ... "GET /ply/ply.html HTTP/1.1" 200 97238
+81.107.39.38 - ... "GET /ply HTTP/1.1" 304 -
+其中最后一列数字为发送的字节数，若为 ‘-’ 则表示没有发送数据
+
+目标是算出总共发送了多少字节的数据，实际上也就是要把日志记录的每一行的最后一列数值加起来
+我不直接展示如何用 Unix 管道的风格来处理这个问题，而是先给出一些“不那么好”的代码，指出它们的问题，最后再展示管道风格的代码，并介绍如何使用 generator 来避免效率上的问题。想直接看管道风格的，点这里。
+
+问题并不复杂，几个 for 循环就能搞定：
+
+sum = 0
+for path, dirlist, filelist in __os.walk(top)__:
+    for name in __fnmatch.filter__(filelist, "access-log*"):
+        # 对子目录中的每个日志文件进行处理
+        with open(name) as f:
+            for line in f:
+                if line[-1] == '-':
+                    continue
+                else:
+                    sum += int(line__.rsplit(None, 1)__[1])
+
+利用 os.walk 这个问题解决起来很方便，由此也可以看出 python 的 for 语句做遍历是多么的方便，不需要额外控制循环次数的变量，省去了设置初始值、更新、判断循环结束条件等工作，相比 C/C++/Java 这样的语言真是太方便了。看起来一切都很美好。
+
+然而，设想以后有了新的统计任务，比如：
+
+   1. 统计某个特定页面的访问次数
+   2. 处理另外的一些日志文件，日志文件名字以 error-log 开头
+
+完成这些任务直接拿上面的代码过来改改就可以了，文件名的 pattern 改一下，处理每个文件的代码改一下。其实每次任务的处理中，找到名字符合特定 pattern 的文件的代码是一样的，直接修改之前的代码其实就引入了重复。
+
+如果重复的代码量很大，我们很自然的会注意到。然而 python 的 for 循环实在太方便了，像这里找文件的代码一共就两行，哪怕重写一遍也不会觉得太麻烦。for 循环的方便使得我们会忽略这样简单代码的重复。然而，再怎么方便好用，for 循环无法重用，只有把它放到函数中才能进行重用。
+
+(先考虑一下如果是你，会如何避免这里的代码重复。下面马上出现的代码并不好，是“误导性”的代码，我会在之后再给出“更好”的代码。)
+
+因此，我们__把上面代码中不变的部分提取成一个通用的函数，可变的部分以参数的形式传入__，得到下面的代码。
+
+def generic_process(topdir, filepat, processfunc):
+    for path, dirlist, filelist in os.walk(topdir):
+        for name in fnmatch.filter(filelist, filepat):
+            with open(name) as f:
+                processfunc(f)
+ 
+sum = 0
+# 很遗憾，python 对 closure 中的变量不能进行赋值操作，
+# 因此这里只能使用全局变量
+def add_count(f):
+    global sum
+    for line in f:
+        if line[-1] == '-':
+            continue
+        else:
+            sum += int(line.rsplit(None, 1)[1])
+ 
+generic_process('logdir', 'access-log*', add_count)
+
+看起来不变和可变的部分分开了，然而 generic_process 的设计并不好。它除了寻找文件以外还调用了日志文件处理函数，因此在其他任务中很可能就无法使用。另外 add_count 的参数必须是 file like object，因此测试时不能简单的直接使用字符串。
+
+===== 管道风格的程序 =====
+下面考虑用 Unix 的工具和管道我们会如何完成这个任务：
+
+find logdir -name "access-log*" | \
+xargs cat | \
+grep '[^-]$' | \
+awk '{ total += $NF } END { print total }'
+
+find 根据文件名 pattern 找到文件，cat 把所有文件内容合并输出到 stdout，grep 从 stdin 读入，过滤掉行末为 ‘-’ 的行，awk 提取每行最后一列，将数值相加，最后打印出结果。（省掉 cat 是可以的，但这样一来 grep 就需要直接读文件而不是只从标准输入读。）
+
+我们可以在 python 代码中模拟这些工具，__Unix 的工具通过文本来传递结果，在 python 中可以使用 list__。
+
+def find(topdir, filepat):
+    files = []
+    for path, dirlist, filelist in os.walk(topdir):
+        for name in fnmatch.filter(filelist, filepat):
+            files.append(name)
+    return files
+ 
+def cat(files):
+    lines = []
+    for file in files:
+        with open(file) as f:
+            for line in f:
+                lines.append(line)
+    return lines
+ 
+def grep(pattern, lines):
+    result = []
+    import re
+    pat = re.compile(pattern)
+    for line in lines:
+        if pat.search(line):
+            result.append(line)
+    return result
+ 
+lines = grep('[^-]$', cat(find('logdir', 'access-log*')))
+col = (line.rsplit(None, 1)[1] for line in lines)
+print sum(int(c) for c in col)
+
+有了 find, cat, grep 这三个函数，只需要连续调用就可以像 Unix 的管道一样将这些函数组合起来。数据在管道中的变化如下图（简洁起见，过滤器直接标在箭头上）：
+
+{{./1.gif}}
+
+看起来现在的代码行数比最初直接用 for 循环的代码要多，但现在的代码就像 Unix 的那些小工具一样，每一个都更加可能被用到。我们可以把更多常用的 Unix 工具用 Python 来模拟，从而在 Python 代码中以 Unix 管道的风格来编写代码。
+
+不过上面的代码性能很差，多个临时的 list 被创建。解决的办法是用 generator，因为篇幅比较长，具体做法放到下一篇文章中。
--- a/Zim/Programme/python/编写_Unix_管道风格的_Python_代码/1.gif
+++ b/Zim/Programme/python/编写_Unix_管道风格的_Python_代码/1.gif