First, define a Q-learning agent class:
import numpy as np

class QLearningAgent:
    def __init__(self, action_space, state_space, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.1):
        self.action_space = action_space
        self.state_space = state_space
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        # Initialize the Q-value table with zeros (one row per state, one column per action)
        self.q_values = np.zeros((len(state_space), len(action_space)))

    def select_action(self, state):
        # Epsilon-greedy action selection: explore with probability exploration_prob,
        # otherwise exploit the current Q-value estimates
        if np.random.rand() < self.exploration_prob:
            return np.random.choice(self.action_space)
        else:
            state_index = self.state_space.index(state)
            return self.action_space[np.argmax(self.q_values[state_index, :])]

    def update_q_values(self, state, action, reward, next_state):
        # Apply the Q-learning update rule to the Q-value table
        state_index = self.state_space.index(state)
        action_index = self.action_space.index(action)
        next_state_index = self.state_space.index(next_state)
        max_next_q_value = np.max(self.q_values[next_state_index, :])
        self.q_values[state_index, action_index] += self.learning_rate * \
            (reward + self.discount_factor * max_next_q_value - self.q_values[state_index, action_index])
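The update_q_values method implements the standard one-step Q-learning update:

    Q(s, a) ← Q(s, a) + α · (r + γ · max_a′ Q(s′, a′) − Q(s, a))

where α is learning_rate, γ is discount_factor, r is the reward, and s′ is the next state. In this simple setup, no actions are ever taken from a terminal state, so its row in the table stays at zero and the bootstrap term correctly vanishes at episode end.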
Next, we can place this Q-learning agent in the previously defined grid environment and train it:
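If you do not have the earlier GridWorld definition at hand, a minimal sketch compatible with this agent might look like the following. The (row, col) state encoding, the goal location, and the reward scheme here are assumptions chosen for illustration, not necessarily the exact environment from the earlier section:

# Hypothetical minimal grid environment; state encoding and rewards are
# assumptions made to be compatible with the agent above.
class GridWorld:
    def __init__(self, rows, cols):
        self.rows = rows
        self.cols = cols
        # States are (row, col) tuples; the goal is assumed to be the bottom-right cell
        self.state_space = [(r, c) for r in range(rows) for c in range(cols)]
        self.action_space = ["up", "down", "left", "right"]
        self.goal = (rows - 1, cols - 1)
        self.state = (0, 0)

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        r, c = self.state
        # Move within the grid bounds
        if action == "up":
            r = max(r - 1, 0)
        elif action == "down":
            r = min(r + 1, self.rows - 1)
        elif action == "left":
            c = max(c - 1, 0)
        elif action == "right":
            c = min(c + 1, self.cols - 1)
        self.state = (r, c)
        done = self.state == self.goal
        # Assumed reward scheme: +1 at the goal, a small step penalty otherwise
        reward = 1.0 if done else -0.01
        return self.state, reward, done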
# Create the grid environment
env = GridWorld(rows=4, cols=4)

# Create the Q-learning agent
agent = QLearningAgent(action_space=env.action_space, state_space=env.state_space)

# Train the agent
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)
        agent.update_q_values(state, action, reward, next_state)
        state = next_state

# Test the agent; disable exploration so it acts greedily on the learned Q-values
agent.exploration_prob = 0.0
state = env.reset()
done = False
while not done:
    action = agent.select_action(state)
    next_state, _, done = env.step(action)
    state = next_state
print("The agent has been trained and, acting in the environment, reached the goal.")
This is a simple example of a Q-learning agent in a grid-world environment. Real problems usually call for richer environments, more sophisticated agent policies, and more advanced learning algorithms, but this small example serves as a starting point for learning how to build agents.