第七章:字符串深度解析

完成本章学习后,你将能够:

  • 理解Python字符串的不可变性和Unicode编码
  • 掌握所有常用字符串方法
  • 熟练使用字符串格式化(%, format, f-string)
  • 理解正则表达式基础
  • 高效处理文本数据
# Demo: the different literal forms for creating strings and bytes.
# Single and double quotes are equivalent
s1 = 'hello'
s2 = "hello"
 
# Triple quotes create multi-line strings
s3 = '''这是一个
多行字符串'''
 
s4 = """这也是一个
多行字符串"""
 
# Raw strings (backslashes are not processed as escapes)
path = r"C:\Users\name\file.txt"  # note: the \n here is NOT turned into a newline
regex = r"\d+\.\d+"  # raw strings are the usual choice for regular expressions
 
# Byte string
b = b"hello"  # type is bytes, not str
# Demo: str objects are immutable; "modifying" one always builds a new string.
s = "hello"
# s[0] = "H"  # TypeError: 'str' object does not support item assignment
 
# Build a new string instead
s = "H" + s[1:]  # "Hello"
 
# String methods return new strings
s = "  hello  "
s2 = s.strip()  # returns "hello"; s itself is unchanged
print(s)   # "  hello  " (the original string is unchanged)
print(s2)  # "hello"

Python 3字符串是Unicode序列:

# Demo: len() counts code points; ord()/chr() convert between characters and code points.
# Unicode characters
s = "你好,世界!🌍"
print(len(s))  # 7 (number of characters, not bytes)
 
# Get Unicode code points
print(ord('A'))   # 65
print(ord('中'))  # 20013
print(chr(65))    # A
print(chr(20013)) # 中
 
# Unicode escapes
s = "\u4e2d\u6587"  # "中文" (\u takes exactly 4 hex digits)
s = "\U0001f600"   # "😀" (\U takes exactly 8 hex digits)
# Demo: converting between str and bytes, and handling undecodable input.
# Encoding: str -> bytes
s = "中文"
utf8_bytes = s.encode('utf-8')    # b'\xe4\xb8\xad\xe6\x96\x87'
gbk_bytes = s.encode('gbk')       # b'\xd6\xd0\xce\xc4'
 
print(len(utf8_bytes))  # 6 (each of these Chinese characters takes 3 bytes in UTF-8)
print(len(gbk_bytes))   # 4 (each of these Chinese characters takes 2 bytes in GBK)
 
# Decoding: bytes -> str
s1 = utf8_bytes.decode('utf-8')   # "中文"
s2 = gbk_bytes.decode('gbk')      # "中文"
 
# Error handling
b = b"\xff\xfe"  # not a valid UTF-8 sequence
s = b.decode('utf-8', errors='replace')  # invalid bytes become U+FFFD (the replacement character)
s = b.decode('utf-8', errors='ignore')   # invalid bytes are dropped
# Demo: indexing and slicing on a sample string.
s = "Hello, World!"
 
# Indexing
print(s[0])    # H
print(s[-1])   # ! (last character)
print(s[7])    # W
 
# Slicing [start:end:step] (end is exclusive)
print(s[0:5])   # Hello
print(s[7:12])  # World
print(s[:5])    # Hello (from the beginning)
print(s[7:])    # World! (through the end)
print(s[:])     # Hello, World! (the whole string)
print(s[::2])   # Hlo ol! (every second character)
print(s[::-1])  # !dlroW ,olleH (reversed)
 
# Negative bounds
print(s[-5:])   # orld! (last 5 characters)
print(s[:-1])   # Hello, World (everything except the last character)
# Demo: find/index/rfind/count/startswith/endswith on one sample string.
s = "Hello, World! Hello!"
 
# find - returns -1 when the substring is not found
print(s.find("Hello"))     # 0
print(s.find("Hello", 5))  # 14 (search starts at index 5)
print(s.find("xyz"))       # -1
 
# index - raises ValueError when the substring is not found
print(s.index("Hello"))    # 0
# print(s.index("xyz"))    # ValueError
 
# rfind/rindex - search from the right
print(s.rfind("Hello"))    # 14
 
# count - number of non-overlapping occurrences
print(s.count("Hello"))    # 2
print(s.count("l"))        # 5 (two in each "Hello", one in "World")
 
# startswith/endswith
print(s.startswith("Hello"))  # True
print(s.endswith("!"))        # True
print(s.startswith(("Hi", "Hello")))  # True; with a tuple, any one prefix may match
# Demo: trimming, case conversion, and replacement; each call returns a new string.
s = "  Hello, World!  "
 
# Strip whitespace
print(s.strip())     # "Hello, World!"
print(s.lstrip())    # "Hello, World!  "
print(s.rstrip())    # "  Hello, World!"
print(s.strip(" !"))  # "Hello, World" (the argument is a set of characters to strip from both ends)
 
# Case conversion
s = "Hello World"
print(s.upper())       # HELLO WORLD
print(s.lower())       # hello world
print(s.capitalize())  # Hello world (first character upper-cased, the rest lower-cased)
print(s.title())       # Hello World (first letter of each word upper-cased)
print(s.swapcase())    # hELLO wORLD (case of every letter swapped)
 
# Replacement
s = "Hello, World! World!"
print(s.replace("World", "Python"))       # Hello, Python! Python!
print(s.replace("World", "Python", 1))    # Hello, Python! World! (only the first occurrence is replaced)
# Demo: the str.is*() predicate methods.
# Character-class checks
print("hello".isalpha())      # True (all letters)
print("hello123".isalnum())   # True (letters or digits only)
print("123".isdigit())        # True (all digits)
print("123.45".isdecimal())   # False (contains a decimal point)
print("   ".isspace())        # True (all whitespace)
print("Hello".istitle())      # True (title-cased)
print("HELLO".isupper())      # True (all upper case)
print("hello".islower())      # True (all lower case)
 
# Other checks
print("hello".isidentifier())  # True (usable as an identifier)
print("123abc".isidentifier()) # False (starts with a digit)
# Demo: splitting strings into lists and joining lists back into strings.
# split
s = "apple,banana,cherry"
fruits = s.split(",")  # ['apple', 'banana', 'cherry']
 
s = "a  b   c"  # runs of multiple spaces
cols = s.split()     # ['a', 'b', 'c'] (default: split on any run of whitespace)
cols = s.split(" ")   # ['a', '', 'b', '', '', 'c'] (explicit separator keeps empty fields)
 
# Limit the number of splits
s = "a,b,c,d,e"
print(s.split(",", 2))  # ['a', 'b', 'c,d,e']
 
# rsplit (splits starting from the right)
print(s.rsplit(",", 2))  # ['a,b,c', 'd', 'e']
 
# splitlines
s = "line1\nline2\r\nline3"
print(s.splitlines())  # ['line1', 'line2', 'line3']
 
# join
words = ["Hello", "World"]
s = " ".join(words)   # "Hello World"
s = "-".join(words)   # "Hello-World"
s = "".join(words)    # "HelloWorld"
 
# Repeat a character
s = "-" * 50  # 50 hyphens
# Demo: printf-style (%) string formatting.
name = "Alice"
age = 25
 
print("Name: %s, Age: %d" % (name, age))  # Name: Alice, Age: 25
print("Pi: %.2f" % 3.14159)  # Pi: 3.14
print("Hex: %x" % 255)       # Hex: ff
# Demo: str.format() with positional fields, keyword fields, and format specs.
# Positional arguments
print("Hello, {}!".format("World"))  # Hello, World!
print("{0} {1}".format("Hello", "World"))  # Hello World
print("{1} {0}".format("World", "Hello"))  # Hello World
 
# Keyword arguments
print("Name: {name}, Age: {age}".format(name="Alice", age=25))
 
# Format specifications
print("{:.2f}".format(3.14159))    # 3.14
print("{:>10}".format("hi"))       # "        hi" (right-aligned)
print("{:<10}".format("hi"))       # "hi        " (left-aligned)
print("{:^10}".format("hi"))       # "    hi    " (centered)
print("{:0>5}".format(42))         # 00042 (padded with zeros on the left)
print("{:,}".format(1234567))      # 1,234,567 (thousands separator)
print("{:.2%}".format(0.25))       # 25.00% (percentage)
# Demo: f-strings with embedded expressions, format specs, and the = debug form.
name = "Alice"
age = 25
 
# Basic usage
print(f"Hello, {name}!")
print(f"Next year you'll be {age + 1}")
 
# Expressions
print(f"Square of 5: {5 ** 2}")
print(f"Name length: {len(name)}")
 
# Format specifications
pi = 3.14159265359
print(f"Pi: {pi:.2f}")          # Pi: 3.14
print(f"Pi: {pi:10.2f}")        # Pi:       3.14 (right-aligned in a width of 10)
print(f"Pi: {pi:<10.2f}")       # Pi: 3.14 (left-aligned; followed by 6 padding spaces)
print(f"Large: {1000000:,}")    # Large: 1,000,000
 
# Debugging form (Python 3.8+)
print(f"{age=}")        # age=25
print(f"{age + 5=}")    # age + 5=30
 
# Date formatting
from datetime import datetime
now = datetime.now()
print(f"Now: {now:%Y-%m-%d %H:%M:%S}")  # the format spec uses strftime directives
# Demo: basic use of the re module (search, findall, sub, split).
import re
 
# Basic matching
text = "The quick brown fox jumps over 13 lazy dogs."
 
# search - find the first match anywhere in the string
match = re.search(r"fox", text)
if match:
    print(f"找到'{match.group()}'在位置{match.start()}-{match.end()}")
 
# findall - return all matches
numbers = re.findall(r"\d+", text)  # ['13']
words = re.findall(r"\b\w+\b", text)
 
# Common patterns (pattern -> description); this dict is reference material only
patterns = {
    r"\d+": "一个或多个数字",
    r"\w+": "一个或多个单词字符",
    r"\s+": "一个或多个空白字符",
    r"[a-z]+": "一个或多个小写字母",
    r"^The": "以The开头",
    r"dogs\.$": "以dogs.结尾",
    r"o.": "o后跟任意字符",
}
 
# Substitution
new_text = re.sub(r"fox", "cat", text)
new_text = re.sub(r"\d+", "XX", text)  # replace every run of digits with XX
 
# Splitting
parts = re.split(r"\s+", text)  # split on runs of whitespace
# Demo: four ways to build the same string from 1000 pieces.
# Inefficient: repeated concatenation
result = ""
for i in range(1000):
    result += str(i)  # strings are immutable, so each += may build a new string
 
# Efficient: build all the pieces, then join once
parts = [str(i) for i in range(1000)]
result = "".join(parts)
 
# Same strategy with an explicit loop: append the pieces to a list, then join once
parts = []
for i in range(1000):
    parts.append(str(i))
result = "".join(parts)
 
# Alternative: write into an io.StringIO buffer
from io import StringIO
buffer = StringIO()
for i in range(1000):
    buffer.write(str(i))
result = buffer.getvalue()

1. 字符串处理:实现函数将驼峰命名转换为下划线命名(camelCase → camel_case)
2. 格式化输出:使用f-string格式化一个表格,对齐各列
3. 正则提取:从HTML中提取所有URL
4. 文本统计:统计一段文本中各单词出现频率
5. 模板引擎:实现一个简单的字符串模板替换功能

本章我们深入学习了:

  • Python字符串的Unicode本质
  • 字符串的索引、切片方法
  • 丰富的字符串方法
  • 三种格式化方式(%、format、f-string)
  • 正则表达式基础
  • 字符串操作性能优化

下一章:第八章:列表与元组

该主题尚不存在

您访问的页面并不存在。如果允许,您可以使用创建该页面按钮来创建它。

  • python/chapter07.txt
  • 最后更改: 2026/04/09 14:26
  • 张叶安