第七章：字符串深度解析

本章目标

完成本章学习后，你将能够：

理解Python字符串的不可变性和Unicode编码
掌握所有常用字符串方法
熟练使用字符串格式化（%, format, f-string）
理解正则表达式基础
高效处理文本数据

字符串基础

字符串创建

# 单引号和双引号等价
s1 = 'hello'
s2 = "hello"
 
# 三引号用于多行字符串
s3 = '''这是一个
多行字符串'''
 
s4 = """这也是一个
多行字符串"""
 
# 原始字符串（不转义）
path = r"C:\Users\name\file.txt"  # 注意：\n不会被当作换行
regex = r"\d+\.\d+"  # 正则表达式常用
 
# 字节串
b = b"hello"  # bytes类型，不是str

字符串不可变性

s = "hello"
# s[0] = "H"  # TypeError: 'str' object does not support item assignment
 
# 创建新字符串
s = "H" + s[1:]  # "Hello"
 
# 字符串方法返回新字符串
s = "  hello  "
s2 = s.strip()  # 返回"hello"，s不变
print(s)   # "  hello  "（原字符串未变）
print(s2)  # "hello"

字符串编码

Unicode基础

Python 3字符串是Unicode序列：

# Unicode字符
s = "你好，世界！🌍"
print(len(s))  # 7（字符数，不是字节数）
 
# 获取Unicode码点
print(ord('A'))   # 65
print(ord('中'))  # 20013
print(chr(65))    # A
print(chr(20013)) # 中
 
# Unicode转义
s = "\u4e2d\u6587"  # "中文"
s = "\U0001f600"   # "😀"

编码与解码

# 编码：str -> bytes
s = "中文"
utf8_bytes = s.encode('utf-8')    # b'\xe4\xb8\xad\xe6\x96\x87'
gbk_bytes = s.encode('gbk')       # b'\xd6\xd0\xce\xc4'
 
print(len(utf8_bytes))  # 6（UTF-8中文字符占3字节）
print(len(gbk_bytes))   # 4（GBK中文字符占2字节）
 
# 解码：bytes -> str
s1 = utf8_bytes.decode('utf-8')   # "中文"
s2 = gbk_bytes.decode('gbk')      # "中文"
 
# 错误处理
b = b"\xff\xfe"  # 无效UTF-8序列
s = b.decode('utf-8', errors='replace')  # 用�替换无效字节
s = b.decode('utf-8', errors='ignore')   # 忽略无效字节

字符串索引与切片

s = "Hello, World!"
 
# 索引
print(s[0])    # H
print(s[-1])   # !（最后一个字符）
print(s[7])    # W
 
# 切片 [start:end:step]
print(s[0:5])   # Hello
print(s[7:12])  # World
print(s[:5])    # Hello（从头开始）
print(s[7:])    # World!（到末尾）
print(s[:])     # Hello, World!（整个字符串；str不可变，无需真正复制）
print(s[::2])   # Hlo ol!（每隔一个字符）
print(s[::-1])  # !dlroW ,olleH（反转）
 
# 高级切片
print(s[-5:])   # orld!（最后5个字符）
print(s[:-1])   # Hello, World（去掉最后一个字符）

字符串方法

查找方法

s = "Hello, World! Hello!"
 
# find - 找不到返回-1
print(s.find("Hello"))     # 0
print(s.find("Hello", 5))  # 14（从索引5开始找）
print(s.find("xyz"))       # -1
 
# index - 找不到抛出ValueError
print(s.index("Hello"))    # 0
# print(s.index("xyz"))    # ValueError
 
# rfind/rindex - 从右开始查找
print(s.rfind("Hello"))    # 14
 
# count - 计数
print(s.count("Hello"))    # 2
print(s.count("l"))        # 5
 
# startswith/endswith
print(s.startswith("Hello"))  # True
print(s.endswith("!"))        # True
print(s.startswith(("Hi", "Hello")))  # True，匹配任一

修改方法

s = "  Hello, World!  "
 
# 去除空白
print(s.strip())     # "Hello, World!"
print(s.lstrip())    # "Hello, World!  "
print(s.rstrip())    # "  Hello, World!"
print(s.strip(" !"))  # "Hello, World"（去除指定字符）
 
# 大小写转换
s = "Hello World"
print(s.upper())       # HELLO WORLD
print(s.lower())       # hello world
print(s.capitalize())  # Hello world（首字母大写，其余字母小写）
print(s.title())       # Hello World（每个单词首字母大写）
print(s.swapcase())    # hELLO wORLD（大小写互换）
 
# 替换
s = "Hello, World! World!"
print(s.replace("World", "Python"))       # Hello, Python! Python!
print(s.replace("World", "Python", 1))    # Hello, Python! World!（只替换1次）

判断方法

# 判断类型
print("hello".isalpha())      # True（全是字母）
print("hello123".isalnum())   # True（字母或数字）
print("123".isdigit())        # True（全是数字）
print("123.45".isdecimal())   # False（有小数点）
print("   ".isspace())        # True（全是空白）
print("Hello".istitle())      # True（标题格式）
print("HELLO".isupper())      # True（全大写）
print("hello".islower())      # True（全小写）
 
# 其他判断
print("hello".isidentifier())  # True（可作为标识符）
print("123abc".isidentifier()) # False（数字开头）

分割与连接

# split
s = "apple,banana,cherry"
fruits = s.split(",")  # ['apple', 'banana', 'cherry']
 
s = "a  b   c"  # 多个空格
cols = s.split()     # ['a', 'b', 'c']（默认按任意空白分割）
cols = s.split(" ")   # ['a', '', 'b', '', '', 'c']
 
# 限制分割次数
s = "a,b,c,d,e"
print(s.split(",", 2))  # ['a', 'b', 'c,d,e']
 
# rsplit（从右边开始）
print(s.rsplit(",", 2))  # ['a,b,c', 'd', 'e']
 
# splitlines
s = "line1\nline2\r\nline3"
print(s.splitlines())  # ['line1', 'line2', 'line3']
 
# join
words = ["Hello", "World"]
s = " ".join(words)   # "Hello World"
s = "-".join(words)   # "Hello-World"
s = "".join(words)    # "HelloWorld"
 
# 重复字符串（使用 * 运算符）
s = "-" * 50  # 50个连字符

字符串格式化

%格式化（旧式）

name = "Alice"
age = 25
 
print("Name: %s, Age: %d" % (name, age))
print("Pi: %.2f" % 3.14159)  # Pi: 3.14
print("Hex: %x" % 255)       # Hex: ff

str.format()方法

# 位置参数
print("Hello, {}!".format("World"))
print("{0} {1}".format("Hello", "World"))
print("{1} {0}".format("World", "Hello"))  # Hello World
 
# 关键字参数
print("Name: {name}, Age: {age}".format(name="Alice", age=25))
 
# 格式规范
print("{:.2f}".format(3.14159))    # 3.14
print("{:>10}".format("hi"))       # "        hi"（右对齐）
print("{:<10}".format("hi"))       # "hi        "（左对齐）
print("{:^10}".format("hi"))       # "    hi    "（居中）
print("{:0>5}".format(42))         # 00042（补零）
print("{:,}".format(1234567))      # 1,234,567（千分位）
print("{:.2%}".format(0.25))       # 25.00%（百分比）

f-string（Python 3.6+，推荐）

name = "Alice"
age = 25
 
# 基本用法
print(f"Hello, {name}!")
print(f"Next year you'll be {age + 1}")
 
# 表达式
print(f"Square of 5: {5 ** 2}")
print(f"Name length: {len(name)}")
 
# 格式规范
pi = 3.14159265359
print(f"Pi: {pi:.2f}")          # Pi: 3.14
print(f"Pi: {pi:10.2f}")        # Pi:       3.14
print(f"Pi: {pi:<10.2f}")       # "Pi: 3.14      "（左对齐，右侧补空格至宽度10）
print(f"Large: {1000000:,}")    # Large: 1,000,000
 
# 调试（Python 3.8+）
print(f"{age=}")        # age=25
print(f"{age + 5=}")    # age + 5=30
 
# 日期格式化
from datetime import datetime
now = datetime.now()
print(f"Now: {now:%Y-%m-%d %H:%M:%S}")

正则表达式基础

import re
 
# 基本匹配
text = "The quick brown fox jumps over 13 lazy dogs."
 
# search - 搜索第一个匹配
match = re.search(r"fox", text)
if match:
    print(f"找到'{match.group()}'在位置{match.start()}-{match.end()}")
 
# findall - 查找所有匹配
numbers = re.findall(r"\d+", text)  # ['13']
words = re.findall(r"\b\w+\b", text)
 
# 常用模式
patterns = {
    r"\d+": "一个或多个数字",
    r"\w+": "一个或多个单词字符",
    r"\s+": "一个或多个空白字符",
    r"[a-z]+": "一个或多个小写字母",
    r"^The": "以The开头",
    r"dogs\.$": "以dogs.结尾",
    r"o.": "o后跟任意字符",
}
 
# 替换
new_text = re.sub(r"fox", "cat", text)
new_text = re.sub(r"\d+", "XX", text)  # 将所有数字替换为XX
 
# 分割
parts = re.split(r"\s+", text)  # 按空白分割

字符串性能优化

# 低效：字符串拼接
result = ""
for i in range(1000):
    result += str(i)  # 每次创建新字符串
 
# 高效：使用join
parts = [str(i) for i in range(1000)]
result = "".join(parts)
 
# 等价写法：在循环中逐个append到列表，最后再join（适合循环体逻辑较复杂的场景）
parts = []
for i in range(1000):
    parts.append(str(i))
result = "".join(parts)
 
# 或使用io.StringIO
from io import StringIO
buffer = StringIO()
for i in range(1000):
    buffer.write(str(i))
result = buffer.getvalue()

本章练习

1. 字符串处理：实现函数将驼峰命名转换为下划线命名（camelCase → camel_case）
2. 格式化输出：使用f-string格式化一个表格，对齐各列
3. 正则提取：从HTML中提取所有URL
4. 文本统计：统计一段文本中各单词出现频率
5. 模板引擎：实现一个简单的字符串模板替换功能

本章小结

本章我们深入学习了：

Python字符串的Unicode本质
字符串的索引与切片操作
丰富的字符串方法
三种格式化方式（%、format、f-string）
正则表达式基础
字符串操作性能优化

下一章：第八章：列表与元组

目录