====== 第七章:字符串深度解析 ======
===== 本章目标 =====
完成本章学习后,你将能够:
* 理解Python字符串的不可变性和Unicode编码
* 掌握所有常用字符串方法
* 熟练使用字符串格式化(%, format, f-string)
* 理解正则表达式基础
* 高效处理文本数据
===== 字符串基础 =====
==== 字符串创建 ====
# 单引号和双引号等价
s1 = 'hello'
s2 = "hello"
# 三引号用于多行字符串
s3 = '''这是一个
多行字符串'''
s4 = """这也是一个
多行字符串"""
# 原始字符串(不转义)
path = r"C:\Users\name\file.txt" # 注意:\n不会被当作换行
regex = r"\d+\.\d+" # 正则表达式常用
# 字节串
b = b"hello" # bytes类型,不是str
==== 字符串不可变性 ====
s = "hello"
# s[0] = "H" # TypeError: 'str' object does not support item assignment
# 创建新字符串
s = "H" + s[1:] # "Hello"
# 字符串方法返回新字符串
s = " hello "
s2 = s.strip() # 返回"hello",s不变
print(s) # " hello "(原字符串未变)
print(s2) # "hello"
===== 字符串编码 =====
==== Unicode基础 ====
Python 3字符串是**Unicode**序列:
# Unicode字符
s = "你好,世界!🌍"
print(len(s)) # 7(字符数,不是字节数)
# 获取Unicode码点
print(ord('A')) # 65
print(ord('中')) # 20013
print(chr(65)) # A
print(chr(20013)) # 中
# Unicode转义
s = "\u4e2d\u6587" # "中文"
s = "\U0001f600" # "😀"
==== 编码与解码 ====
# 编码:str -> bytes
s = "中文"
utf8_bytes = s.encode('utf-8') # b'\xe4\xb8\xad\xe6\x96\x87'
gbk_bytes = s.encode('gbk') # b'\xd6\xd0\xce\xc4'
print(len(utf8_bytes)) # 6(UTF-8中文字符占3字节)
print(len(gbk_bytes)) # 4(GBK中文字符占2字节)
# 解码:bytes -> str
s1 = utf8_bytes.decode('utf-8') # "中文"
s2 = gbk_bytes.decode('gbk') # "中文"
# 错误处理
b = b"\xff\xfe" # 无效UTF-8序列
s = b.decode('utf-8', errors='replace') # 用�替换无效字节
s = b.decode('utf-8', errors='ignore') # 忽略无效字节
===== 字符串索引与切片 =====
s = "Hello, World!"
# 索引
print(s[0]) # H
print(s[-1]) # !(最后一个字符)
print(s[7]) # W
# 切片 [start:end:step]
print(s[0:5]) # Hello
print(s[7:12]) # World
print(s[:5]) # Hello(从头开始)
print(s[7:]) # World!(到末尾)
print(s[:]) # Hello, World!(副本)
print(s[::2]) # Hlo ol!(每隔一个字符)
print(s[::-1]) # !dlroW ,olleH(反转)
# 高级切片
print(s[-5:]) # orld!(最后5个字符)
print(s[:-1]) # Hello, World(去掉最后一个字符)
===== 字符串方法 =====
==== 查找方法 ====
s = "Hello, World! Hello!"
# find - 找不到返回-1
print(s.find("Hello")) # 0
print(s.find("Hello", 5)) # 14(从索引5开始找)
print(s.find("xyz")) # -1
# index - 找不到抛出ValueError
print(s.index("Hello")) # 0
# print(s.index("xyz")) # ValueError
# rfind/rindex - 从右开始查找
print(s.rfind("Hello")) # 14
# count - 计数
print(s.count("Hello")) # 2
print(s.count("l")) # 5
# startswith/endswith
print(s.startswith("Hello")) # True
print(s.endswith("!")) # True
print(s.startswith(("Hi", "Hello"))) # True,匹配任一
==== 修改方法 ====
s = " Hello, World! "
# 去除空白
print(s.strip()) # "Hello, World!"
print(s.lstrip()) # "Hello, World! "
print(s.rstrip()) # " Hello, World!"
print(s.strip(" !")) # "Hello, World"(去除指定字符)
# 大小写转换
s = "Hello World"
print(s.upper()) # HELLO WORLD
print(s.lower()) # hello world
print(s.capitalize()) # Hello world(首字母大写,其余字母小写)
print(s.title()) # Hello World(每个单词首字母大写)
print(s.swapcase()) # hELLO wORLD(大小写互换)
# 替换
s = "Hello, World! World!"
print(s.replace("World", "Python")) # Hello, Python! Python!
print(s.replace("World", "Python", 1)) # Hello, Python! World!(只替换1次)
==== 判断方法 ====
# 判断类型
print("hello".isalpha()) # True(全是字母)
print("hello123".isalnum()) # True(字母或数字)
print("123".isdigit()) # True(全是数字)
print("123.45".isdecimal()) # False(有小数点)
print(" ".isspace()) # True(全是空白)
print("Hello".istitle()) # True(标题格式)
print("HELLO".isupper()) # True(全大写)
print("hello".islower()) # True(全小写)
# 其他判断
print("hello".isidentifier()) # True(可作为标识符)
print("123abc".isidentifier()) # False(数字开头)
==== 分割与连接 ====
# split
s = "apple,banana,cherry"
fruits = s.split(",") # ['apple', 'banana', 'cherry']
s = "a  b   c" # 多个空格
cols = s.split() # ['a', 'b', 'c'](默认按任意空白分割)
cols = s.split(" ") # ['a', '', 'b', '', '', 'c']
# 限制分割次数
s = "a,b,c,d,e"
print(s.split(",", 2)) # ['a', 'b', 'c,d,e']
# rsplit(从右边开始)
print(s.rsplit(",", 2)) # ['a,b,c', 'd', 'e']
# splitlines
s = "line1\nline2\r\nline3"
print(s.splitlines()) # ['line1', 'line2', 'line3']
# join
words = ["Hello", "World"]
s = " ".join(words) # "Hello World"
s = "-".join(words) # "Hello-World"
s = "".join(words) # "HelloWorld"
# 连接多个相同字符
s = "-" * 50 # 50个连字符
===== 字符串格式化 =====
==== %格式化(旧式) ====
name = "Alice"
age = 25
print("Name: %s, Age: %d" % (name, age))
print("Pi: %.2f" % 3.14159) # Pi: 3.14
print("Hex: %x" % 255) # Hex: ff
==== str.format()方法 ====
# 位置参数
print("Hello, {}!".format("World"))
print("{0} {1}".format("Hello", "World"))
print("{1} {0}".format("World", "Hello")) # Hello World
# 关键字参数
print("Name: {name}, Age: {age}".format(name="Alice", age=25))
# 格式规范
print("{:.2f}".format(3.14159)) # 3.14
print("{:>10}".format("hi")) # "        hi"(右对齐,宽度10)
print("{:<10}".format("hi")) # "hi        "(左对齐,宽度10)
print("{:^10}".format("hi")) # "    hi    "(居中,宽度10)
print("{:0>5}".format(42)) # 00042(补零)
print("{:,}".format(1234567)) # 1,234,567(千分位)
print("{:.2%}".format(0.25)) # 25.00%(百分比)
==== f-string(Python 3.6+,推荐) ====
name = "Alice"
age = 25
# 基本用法
print(f"Hello, {name}!")
print(f"Next year you'll be {age + 1}")
# 表达式
print(f"Square of 5: {5 ** 2}")
print(f"Name length: {len(name)}")
# 格式规范
pi = 3.14159265359
print(f"Pi: {pi:.2f}") # Pi: 3.14
print(f"Pi: {pi:10.2f}") # "Pi:       3.14"(宽度10,数字默认右对齐)
print(f"Pi: {pi:<10.2f}") # "Pi: 3.14      "(宽度10,左对齐)
print(f"Large: {1000000:,}") # Large: 1,000,000
# 调试(Python 3.8+)
print(f"{age=}") # age=25
print(f"{age + 5=}") # age + 5=30
# 日期格式化
from datetime import datetime
now = datetime.now()
print(f"Now: {now:%Y-%m-%d %H:%M:%S}")
===== 正则表达式基础 =====
import re
# 基本匹配
text = "The quick brown fox jumps over 13 lazy dogs."
# search - 搜索第一个匹配
match = re.search(r"fox", text)
if match:
    print(f"找到'{match.group()}'在位置{match.start()}-{match.end()}")
# findall - 查找所有匹配
numbers = re.findall(r"\d+", text) # ['13']
words = re.findall(r"\b\w+\b", text)
# 常用模式
patterns = {
    r"\d+": "一个或多个数字",
    r"\w+": "一个或多个单词字符",
    r"\s+": "一个或多个空白字符",
    r"[a-z]+": "一个或多个小写字母",
    r"^The": "以The开头",
    r"dogs\.$": "以dogs.结尾",
    r"o.": "o后跟任意字符",
}
# 替换
new_text = re.sub(r"fox", "cat", text)
new_text = re.sub(r"\d+", "XX", text) # 将所有数字替换为XX
# 分割
parts = re.split(r"\s+", text) # 按空白分割
===== 字符串性能优化 =====
# 低效:字符串拼接
result = ""
for i in range(1000):
    result += str(i) # 每次创建新字符串
# 高效:使用join
parts = [str(i) for i in range(1000)]
result = "".join(parts)
# 等价写法:先用列表逐个收集,最后再join(适合循环体较复杂的情况)
parts = []
for i in range(1000):
    parts.append(str(i))
result = "".join(parts)
# 或使用io.StringIO
from io import StringIO
buffer = StringIO()
for i in range(1000):
    buffer.write(str(i))
result = buffer.getvalue()
===== 本章练习 =====
1. **字符串处理**:实现函数将驼峰命名转换为下划线命名(camelCase -> camel_case)
2. **格式化输出**:使用f-string格式化一个表格,对齐各列
3. **正则提取**:从HTML中提取所有URL
4. **文本统计**:统计一段文本中各单词出现频率
5. **模板引擎**:实现一个简单的字符串模板替换功能
===== 本章小结 =====
本章我们深入学习了:
* Python字符串的Unicode本质
* 字符串的索引、切片方法
* 丰富的字符串方法
* 三种格式化方式(%、format、f-string)
* 正则表达式基础
* 字符串操作性能优化
下一章:[[python_course:chapter08|第八章:列表与元组]]